## Topic Modeling: zero-shot classification

In [1]:
# --------------------
# Import pipeline
# --------------------
from transformers import pipeline
import pandas as pd

text1 = ["this a non sentence. I am not sure what to do with it.", "I am happy to be here"]


# zero shot clasification for text1
zero_shot_pipeline = pipeline(task="zero-shot-classification", model = "facebook/bart-large-mnli")    

candidate_labels = ["sad", "happy", "angry"]

zeroshot = zero_shot_pipeline(text1, candidate_labels)

# ----------------------------------------------------------------------------------------------------
# DF including the text, labels and scores for each label
# ----------------------------------------------------------------------------------------------------
df_4 = pd.DataFrame(zeroshot)
df_4["text"] = text1
df_4




  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForSequenceClassification: ['model.decoder.version', 'model.encoder.version']
- This IS expected if you are initializing TFBartForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForSequenceClassification for predictions without further training.


Unnamed: 0,sequence,labels,scores,text
0,this a non sentence. I am not sure what to do ...,"[sad, happy, angry]","[0.5717552304267883, 0.2215738296508789, 0.206...",this a non sentence. I am not sure what to do ...
1,I am happy to be here,"[happy, angry, sad]","[0.9985308051109314, 0.0007497738115489483, 0....",I am happy to be here


In [2]:
# convert labels to separate columns and fill in the scores as values. 

df_4 = df_4.set_index("text")
df_4 = df_4["scores"].apply(pd.Series)
df_4.reset_index(inplace=True)

# label columns
df_4.columns = ["text", "sad", "happy", "angry"]
df_4

Unnamed: 0,text,sad,happy,angry
0,this a non sentence. I am not sure what to do ...,0.571755,0.221574,0.206671
1,I am happy to be here,0.998531,0.00075,0.000719


## Use zero-shot learning on the reviews sample and test the accuracy of the predicted labels

In [3]:
ds = pd.read_csv('/Users/aishwaryaravichandran/Documents/nlp/nlp_assignment_0_group_work/CS6120/Assignments/A3/review_set.csv')
subset = ds.head(1000).copy()
subset.shape

(1000, 9)

In [5]:
result_10 = subset.head(10)

zeroshot_scores = []

for review in result_10['review']:
    # Perform zero shot classification using the pipeline
    zeroshot = zero_shot_pipeline(review[0:501], candidate_labels)
    zeroshot_scores.append(zeroshot)

zs_analysis = pd.DataFrame(zeroshot_scores)
zs_analysis["text"] = result_10['review']
zs_analysis

Unnamed: 0,sequence,labels,scores,text
0,buyer beware item resold amazons warehouse arr...,"[sad, happy, angry]","[0.5839831233024597, 0.23054248094558716, 0.18...",buyer beware item resold amazons warehouse arr...
1,worth buy multiplayer constantly crashes beat ...,"[sad, angry, happy]","[0.7449404001235962, 0.2120196521282196, 0.043...",worth buy multiplayer constantly crashes beat ...
2,he fine its going anything amazing quality wis...,"[happy, angry, sad]","[0.9615718722343445, 0.026054298505187035, 0.0...",he fine its going anything amazing quality wis...
3,meant be anyone claiming unhappy remake must h...,"[angry, sad, happy]","[0.853062093257904, 0.13765177130699158, 0.009...",meant be anyone claiming unhappy remake must h...
4,looks feels good it like mouse feels moves bat...,"[happy, sad, angry]","[0.9850643277168274, 0.009887082502245903, 0.0...",looks feels good it like mouse feels moves bat...
5,optimistic within first ten minutes leaving va...,"[angry, happy, sad]","[0.4990401864051819, 0.26041659712791443, 0.24...",optimistic within first ten minutes leaving va...
6,junk get junk get money back option would trad...,"[sad, angry, happy]","[0.5291018486022949, 0.24054089188575745, 0.23...",junk get junk get money back option would trad...
7,work greatest the controllers job time sometim...,"[happy, sad, angry]","[0.9243314266204834, 0.038793887943029404, 0.0...",work greatest the controllers job time sometim...
8,awesomely husband loves games great quality co...,"[happy, angry, sad]","[0.4710567891597748, 0.4574005603790283, 0.071...",awesomely husband loves games great quality co...
9,fun whole different approach last world super ...,"[happy, sad, angry]","[0.9881453514099121, 0.006623686291277409, 0.0...",fun whole different approach last world super ...


In [6]:
zs_analysis = zs_analysis.set_index("text")
zs_analysis = zs_analysis["scores"].apply(pd.Series)
zs_analysis.reset_index(inplace=True)

# label columns
zs_analysis.columns = ["text", "sad", "happy", "angry"]
zs_analysis

Unnamed: 0,text,sad,happy,angry
0,buyer beware item resold amazons warehouse arr...,0.583983,0.230542,0.185474
1,worth buy multiplayer constantly crashes beat ...,0.74494,0.21202,0.04304
2,he fine its going anything amazing quality wis...,0.961572,0.026054,0.012374
3,meant be anyone claiming unhappy remake must h...,0.853062,0.137652,0.009286
4,looks feels good it like mouse feels moves bat...,0.985064,0.009887,0.005049
5,optimistic within first ten minutes leaving va...,0.49904,0.260417,0.240543
6,junk get junk get money back option would trad...,0.529102,0.240541,0.230357
7,work greatest the controllers job time sometim...,0.924331,0.038794,0.036875
8,awesomely husband loves games great quality co...,0.471057,0.457401,0.071543
9,fun whole different approach last world super ...,0.988145,0.006624,0.005231
