# MNLI

In [6]:
# Instantiate the project's MNLI pipeline from the
# `microsoft/deberta-large-mnli` checkpoint. `model` is the object the
# polarity-extraction cells call `extract_polarity` on.
from unsupervised_absa.mnli import MnliPipeline
model = MnliPipeline('microsoft/deberta-large-mnli')

Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import json
# Load the POS-tag annotations into `pos_tags`.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which can fail on (or silently mis-decode)
# non-ASCII characters in the text.
with open('data/pos tag/semeval_pos_tag_remove_short_words.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

In [3]:
import pandas as pd
# Turn the loaded JSON payload into a DataFrame. The expansion cell reads the
# 'text', 'sentenceId' and 'pos_tag' columns from it.
pos_tag_df = pd.DataFrame.from_dict(pos_tags)

In [4]:
# Expand the sentence-level frame into one row per (sentence, POS-tagged word):
# every entry in a sentence's `pos_tag` list becomes its own row, with the
# entry's 'word' stored as `aspectLabel`. Sentences with an empty `pos_tag`
# list contribute no rows.
from tqdm import tqdm
import pandas as pd
new_df = {'text': [], 'sid': [], 'aspectLabel': []}
# iterrows() is a generator with no length, so `total` is passed explicitly;
# without it tqdm can only show a raw iteration counter (no percentage/ETA).
for index, row in tqdm(pos_tag_df.iterrows(), total=len(pos_tag_df)):
    text = row['text']
    sid = row['sentenceId']
    # No emptiness check needed: an empty `pos_tag` list yields zero iterations.
    for aspect in row['pos_tag']:
        new_df['aspectLabel'].append(aspect['word'])
        new_df['text'].append(text)
        new_df['sid'].append(sid)

# Column order follows the dict's key order: text, sid, aspectLabel.
mnli_df = pd.DataFrame(new_df)

6055it [00:00, 13166.08it/s]


In [5]:
# Run the pipeline's polarity extraction over every row of `mnli_df`.
# 'text' and 'aspectLabel' are column names of `mnli_df`; presumably they select
# the sentence and the aspect term respectively — TODO confirm against
# MnliPipeline.extract_polarity.
# NOTE: slow — a recorded run over 14,179 rows took about 3h36m with these settings.
dataset = model.extract_polarity(mnli_df, 'text', 'aspectLabel', device='cuda', batch_size=64)

[32m2023-04-13 19:28:48.837[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m145[0m - [1mPreprocessing dataset with length: 14179[0m
[32m2023-04-13 19:28:48.889[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m149[0m - [1mExtracting polarity with model: microsoft/deberta-large-mnli[0m
100%|██████████| 14179/14179 [3:35:59<00:00,  1.09it/s] 
[32m2023-04-13 23:04:48.568[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m171[0m - [1mPostprocessing outputs[0m


In [10]:
# Persist the extraction result so the multi-hour run does not have to be repeated.
# NOTE(review): `save_to_disk` looks like the Hugging Face `datasets` API — confirm
# the return type of `extract_polarity`.
dataset.save_to_disk('data/mnli/absa_aspect_term')

Saving the dataset (0/1 shards):   0%|          | 0/14179 [00:00<?, ? examples/s]

### Word count > 1 (low-count words removed)

In [2]:
import json
# Load the filtered POS-tag annotations into `pos_tags`.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which can fail on (or silently mis-decode)
# non-ASCII characters in the text.
with open('data/pos tag/semeval_pos_tag_remove_short_words_and_low_counts.json', encoding='utf-8') as f:
    pos_tags = json.load(f)

In [3]:
import pandas as pd
# Turn the filtered JSON payload into a DataFrame. The expansion cell reads the
# 'text', 'sentenceId' and 'pos_tag' columns from it.
pos_tag_df = pd.DataFrame.from_dict(pos_tags)

In [4]:
# Expand the filtered sentence-level frame into one row per
# (sentence, POS-tagged word): every entry in a sentence's `pos_tag` list
# becomes its own row, with the entry's 'word' stored as `aspectLabel`.
# Sentences with an empty `pos_tag` list contribute no rows.
from tqdm import tqdm
import pandas as pd
new_df = {'text': [], 'sid': [], 'aspectLabel': []}
# iterrows() is a generator with no length, so `total` is passed explicitly;
# without it tqdm can only show a raw iteration counter (no percentage/ETA).
for index, row in tqdm(pos_tag_df.iterrows(), total=len(pos_tag_df)):
    text = row['text']
    sid = row['sentenceId']
    # No emptiness check needed: an empty `pos_tag` list yields zero iterations.
    for aspect in row['pos_tag']:
        new_df['aspectLabel'].append(aspect['word'])
        new_df['text'].append(text)
        new_df['sid'].append(sid)

# Column order follows the dict's key order: text, sid, aspectLabel.
mnli_df = pd.DataFrame(new_df)

6055it [00:00, 21227.39it/s]


In [7]:
# Run the pipeline's polarity extraction over the filtered (low-count words
# removed) `mnli_df`. 'text' and 'aspectLabel' are column names of `mnli_df`.
# NOTE(review): this call uses batch_size=128 while the unfiltered run uses 64 —
# confirm the difference is intentional.
low_counts_dataset = model.extract_polarity(mnli_df, 'text', 'aspectLabel', device='cuda', batch_size=128)

[32m2023-04-14 00:19:59.817[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m145[0m - [1mPreprocessing dataset with length: 11666[0m
[32m2023-04-14 00:19:59.841[0m | [1mINFO    [0m | [36munsupervised_absa.mnli[0m:[36mextract_polarity[0m:[36m149[0m - [1mExtracting polarity with model: microsoft/deberta-large-mnli[0m
  4%|▍         | 513/11666 [07:05<2:28:46,  1.25it/s]

In [None]:
# Persist the filtered extraction result under its own directory, separate from
# the unfiltered 'data/mnli/absa_aspect_term' output.
low_counts_dataset.save_to_disk('data/mnli/absa_aspect_term_filtered')