# MNLI

In [12]:
from unsupervised_absa.mnli import MnliPipeline

# Checkpoint identifier handed to MnliPipeline; kept in one named place so it is easy to spot.
MNLI_CHECKPOINT = 'microsoft/deberta-large-mnli'
model = MnliPipeline(MNLI_CHECKPOINT)

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Laptop

In [13]:
import json

# Load the laptop POS-tag JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../data_2015/pos tag/laptop_pos_tag.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

In [14]:
import pandas as pd

# Turn the loaded JSON object into a DataFrame.
# orient='columns' is from_dict's default; it is only spelled out here.
pos_tag_df = pd.DataFrame.from_dict(pos_tags, orient='columns')

In [15]:
# Expand the sentence-level frame into one row per (sentence, POS-tag entry):
# every element of a row's 'pos_tag' list becomes its own row in mnli_df,
# carrying the sentence text, sentence id and both ground-truth fields along.
from tqdm import tqdm
import pandas as pd

new_df = {'text': [], 'sid': [], 'aspectLabel': [], 'term ground truth': [], 'category ground truth': []}
# iterrows() is a generator without a length; total= lets tqdm show real progress.
for index, row in tqdm(pos_tag_df.iterrows(), total=len(pos_tag_df)):
    # Sentences with an empty 'pos_tag' list contribute no rows.
    if len(row['pos_tag']) == 0:
        continue
    text = row['text']
    sid = row['sentenceId']
    term_ground_truth = row['term ground truth']
    # Bug fix: this used to re-read 'term ground truth', so the
    # 'category ground truth' column was a silent copy of the term labels.
    # NOTE(review): assumes the input JSON carries a 'category ground truth'
    # field -- confirm against the notebook that writes the POS-tag files.
    category_ground_truth = row['category ground truth']
    for aspect in row['pos_tag']:
        new_df['aspectLabel'].append(aspect['word'])
        new_df['text'].append(text)
        new_df['sid'].append(sid)
        new_df['term ground truth'].append(term_ground_truth)
        new_df['category ground truth'].append(category_ground_truth)

mnli_df = pd.DataFrame(new_df)

1735it [00:00, 19261.16it/s]


In [23]:
# Run the MNLI pipeline over mnli_df on the GPU.
# 'text' and 'aspectLabel' are columns of mnli_df; presumably the sentence and
# aspect columns the pipeline reads -- confirm against MnliPipeline.extract_polarity.
dataset = model.extract_polarity(
    mnli_df,
    'text',
    'aspectLabel',
    device='cuda',
    batch_size=8,
)

[32m2023-04-23 20:14:27.443[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m145[0m - [1mPreprocessing dataset with length: 3972[0m
[32m2023-04-23 20:14:27.472[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m149[0m - [1mExtracting polarity with model: microsoft/deberta-large-mnli[0m
100%|██████████| 3972/3972 [47:03<00:00,  1.41it/s] 
[32m2023-04-23 21:01:30.931[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m171[0m - [1mPostprocessing outputs[0m


In [24]:
# Persist the laptop polarity results so later steps need not re-run the model.
dataset.save_to_disk(
    '../data_2015/mnli/laptop_aspect_term'
)

Saving the dataset (0/1 shards):   0%|          | 0/3972 [00:00<?, ? examples/s]

#### Words Count > 1

In [25]:
import json

# Load the laptop "low counts" POS-tag JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../data_2015/pos tag/laptop_pos_tag_low_counts.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

In [26]:
import pandas as pd

# Turn the loaded JSON object into a DataFrame.
# orient='columns' is from_dict's default; it is only spelled out here.
pos_tag_df = pd.DataFrame.from_dict(pos_tags, orient='columns')

In [27]:
# Expand the sentence-level frame into one row per (sentence, POS-tag entry):
# every element of a row's 'pos_tag' list becomes its own row in mnli_df,
# carrying the sentence text, sentence id and both ground-truth fields along.
from tqdm import tqdm
import pandas as pd

new_df = {'text': [], 'sid': [], 'aspectLabel': [], 'term ground truth': [], 'category ground truth': []}
# iterrows() is a generator without a length; total= lets tqdm show real progress.
for index, row in tqdm(pos_tag_df.iterrows(), total=len(pos_tag_df)):
    # Sentences with an empty 'pos_tag' list contribute no rows.
    if len(row['pos_tag']) == 0:
        continue
    text = row['text']
    sid = row['sentenceId']
    term_ground_truth = row['term ground truth']
    # Bug fix: this used to re-read 'term ground truth', so the
    # 'category ground truth' column was a silent copy of the term labels.
    # NOTE(review): assumes the input JSON carries a 'category ground truth'
    # field -- confirm against the notebook that writes the POS-tag files.
    category_ground_truth = row['category ground truth']
    for aspect in row['pos_tag']:
        new_df['aspectLabel'].append(aspect['word'])
        new_df['text'].append(text)
        new_df['sid'].append(sid)
        new_df['term ground truth'].append(term_ground_truth)
        new_df['category ground truth'].append(category_ground_truth)

mnli_df = pd.DataFrame(new_df)

1735it [00:00, 18639.92it/s]


In [28]:
# Run the MNLI pipeline over the low-counts mnli_df on the GPU.
# 'text' and 'aspectLabel' are columns of mnli_df; presumably the sentence and
# aspect columns the pipeline reads -- confirm against MnliPipeline.extract_polarity.
low_counts_dataset = model.extract_polarity(
    mnli_df,
    'text',
    'aspectLabel',
    device='cuda',
    batch_size=8,
)

[32m2023-04-23 21:01:31.660[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m145[0m - [1mPreprocessing dataset with length: 3060[0m
[32m2023-04-23 21:01:31.678[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m149[0m - [1mExtracting polarity with model: microsoft/deberta-large-mnli[0m
100%|██████████| 3060/3060 [35:51<00:00,  1.42it/s] 
[32m2023-04-23 21:37:23.604[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m171[0m - [1mPostprocessing outputs[0m


In [29]:
# Persist the laptop low-counts polarity results so later steps need not re-run the model.
low_counts_dataset.save_to_disk(
    '../data_2015/mnli/laptop_aspect_term_low_counts'
)

Saving the dataset (0/1 shards):   0%|          | 0/3060 [00:00<?, ? examples/s]

### Restaurant

In [30]:
import json

# Load the restaurant POS-tag JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../data_2015/pos tag/restaurant_pos_tag.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

In [31]:
import pandas as pd

# Turn the loaded JSON object into a DataFrame.
# orient='columns' is from_dict's default; it is only spelled out here.
pos_tag_df = pd.DataFrame.from_dict(pos_tags, orient='columns')

In [32]:
# Expand the sentence-level frame into one row per (sentence, POS-tag entry):
# every element of a row's 'pos_tag' list becomes its own row in mnli_df,
# carrying the sentence text, sentence id and both ground-truth fields along.
from tqdm import tqdm
import pandas as pd

new_df = {'text': [], 'sid': [], 'aspectLabel': [], 'term ground truth': [], 'category ground truth': []}
# iterrows() is a generator without a length; total= lets tqdm show real progress.
for index, row in tqdm(pos_tag_df.iterrows(), total=len(pos_tag_df)):
    # Sentences with an empty 'pos_tag' list contribute no rows.
    if len(row['pos_tag']) == 0:
        continue
    text = row['text']
    sid = row['sentenceId']
    term_ground_truth = row['term ground truth']
    # Bug fix: this used to re-read 'term ground truth', so the
    # 'category ground truth' column was a silent copy of the term labels.
    # NOTE(review): assumes the input JSON carries a 'category ground truth'
    # field -- confirm against the notebook that writes the POS-tag files.
    category_ground_truth = row['category ground truth']
    for aspect in row['pos_tag']:
        new_df['aspectLabel'].append(aspect['word'])
        new_df['text'].append(text)
        new_df['sid'].append(sid)
        new_df['term ground truth'].append(term_ground_truth)
        new_df['category ground truth'].append(category_ground_truth)

mnli_df = pd.DataFrame(new_df)

1311it [00:00, 11962.32it/s]


In [33]:
# Run the MNLI pipeline over the restaurant mnli_df on the GPU.
# 'text' and 'aspectLabel' are columns of mnli_df; presumably the sentence and
# aspect columns the pipeline reads -- confirm against MnliPipeline.extract_polarity.
dataset = model.extract_polarity(
    mnli_df,
    'text',
    'aspectLabel',
    device='cuda',
    batch_size=8,
)

[32m2023-04-23 21:37:24.167[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m145[0m - [1mPreprocessing dataset with length: 2755[0m
[32m2023-04-23 21:37:24.189[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m149[0m - [1mExtracting polarity with model: microsoft/deberta-large-mnli[0m
100%|██████████| 2755/2755 [30:23<00:00,  1.51it/s]
[32m2023-04-23 22:07:47.865[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m171[0m - [1mPostprocessing outputs[0m


In [34]:
# Persist the restaurant polarity results so later steps need not re-run the model.
dataset.save_to_disk(
    '../data_2015/mnli/restaurant_aspect_term'
)

Saving the dataset (0/1 shards):   0%|          | 0/2755 [00:00<?, ? examples/s]

#### Words Count > 1

In [35]:
import json

# Load the restaurant "low counts" POS-tag JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../data_2015/pos tag/restaurant_pos_tag_low_counts.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

In [36]:
import pandas as pd

# Turn the loaded JSON object into a DataFrame.
# orient='columns' is from_dict's default; it is only spelled out here.
pos_tag_df = pd.DataFrame.from_dict(pos_tags, orient='columns')

In [37]:
# Expand the sentence-level frame into one row per (sentence, POS-tag entry):
# every element of a row's 'pos_tag' list becomes its own row in mnli_df,
# carrying the sentence text, sentence id and both ground-truth fields along.
from tqdm import tqdm
import pandas as pd

new_df = {'text': [], 'sid': [], 'aspectLabel': [], 'term ground truth': [], 'category ground truth': []}
# iterrows() is a generator without a length; total= lets tqdm show real progress.
for index, row in tqdm(pos_tag_df.iterrows(), total=len(pos_tag_df)):
    # Sentences with an empty 'pos_tag' list contribute no rows.
    if len(row['pos_tag']) == 0:
        continue
    text = row['text']
    sid = row['sentenceId']
    term_ground_truth = row['term ground truth']
    # Bug fix: this used to re-read 'term ground truth', so the
    # 'category ground truth' column was a silent copy of the term labels.
    # NOTE(review): assumes the input JSON carries a 'category ground truth'
    # field -- confirm against the notebook that writes the POS-tag files.
    category_ground_truth = row['category ground truth']
    for aspect in row['pos_tag']:
        new_df['aspectLabel'].append(aspect['word'])
        new_df['text'].append(text)
        new_df['sid'].append(sid)
        new_df['term ground truth'].append(term_ground_truth)
        new_df['category ground truth'].append(category_ground_truth)

mnli_df = pd.DataFrame(new_df)

1311it [00:00, 14717.80it/s]


In [38]:
# Run the MNLI pipeline over the restaurant low-counts mnli_df on the GPU.
# 'text' and 'aspectLabel' are columns of mnli_df; presumably the sentence and
# aspect columns the pipeline reads -- confirm against MnliPipeline.extract_polarity.
low_counts_dataset = model.extract_polarity(
    mnli_df,
    'text',
    'aspectLabel',
    device='cuda',
    batch_size=8,
)

[32m2023-04-23 22:07:48.658[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m145[0m - [1mPreprocessing dataset with length: 2047[0m
[32m2023-04-23 22:07:48.676[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m149[0m - [1mExtracting polarity with model: microsoft/deberta-large-mnli[0m
100%|██████████| 2047/2047 [22:58<00:00,  1.48it/s]
[32m2023-04-23 22:30:47.233[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m171[0m - [1mPostprocessing outputs[0m


In [39]:
# Persist the restaurant low-counts polarity results so later steps need not re-run the model.
low_counts_dataset.save_to_disk(
    '../data_2015/mnli/restaurant_aspect_term_low_counts'
)

Saving the dataset (0/1 shards):   0%|          | 0/2047 [00:00<?, ? examples/s]