In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 30.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 67.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
# Data processing
import pandas as pd

# Modeling
import torch
from transformers import pipeline

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) so the notebook still runs on a runtime without a GPU.
DEVICE = 0 if torch.cuda.is_available() else -1

# Zero-shot classification pipeline using the facebook/bart-large-mnli
# checkpoint. The model files are downloaded the first time this cell runs.
classifier = pipeline(task="zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device=DEVICE)

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Dataset: Sentiment Labelled Sentences (UCI Machine Learning Repository)
 https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Change directory
import os
os.chdir("drive/My Drive/contents/nlp")

# Print out the current directory
!pwd

Mounted at /content/drive
/content/drive/My Drive/contents/nlp


In [None]:
# Load the tab-separated Amazon reviews file. Column names are supplied
# explicitly, so the first line of the file is read as data.
amz_review = pd.read_csv(
    'sentiment labelled sentences/amazon_cells_labelled.txt',
    sep='\t',
    names=['review', 'label'],
)

# Drop the label column so only the review text remains
amz_review = amz_review.drop(columns=['label'])

# Preview the first rows
amz_review.head()

Unnamed: 0,review
0,So there is no way for me to plug it in here i...
1,"Good case, Excellent value."
2,Great for the jawbone.
3,Tied to charger for conversations lasting more...
4,The mic is great.


In [None]:
# Get the dataset information: prints the index range, the columns with their
# non-null counts and dtypes, and the memory usage of the reviews DataFrame
amz_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1000 non-null   object
dtypes: object(1)
memory usage: 7.9+ KB


In [None]:
# Candidate topics each review is scored against
candidate_labels = ["sound quality", "battery", "price", "comfortable"]

# Hypothesis template; the pipeline substitutes each candidate label for {}
hypothesis_template = "The topic of this review is {}."

# All review texts as a plain Python list
sequences = amz_review['review'].to_list()

# Run the classifier over every review and store the results as a DataFrame
single_topic_prediction = pd.DataFrame(
    classifier(
        sequences,
        candidate_labels,
        hypothesis_template=hypothesis_template,
    )
)

# Preview the first rows
single_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,So there is no way for me to plug it in here i...,"[battery, comfortable, sound quality, price]","[0.3193794786930084, 0.272741436958313, 0.2234..."
1,"Good case, Excellent value.","[sound quality, price, comfortable, battery]","[0.39462438225746155, 0.3236585855484009, 0.27..."
2,Great for the jawbone.,"[comfortable, sound quality, battery, price]","[0.76302570104599, 0.13184957206249237, 0.0590..."
3,Tied to charger for conversations lasting more...,"[battery, sound quality, price, comfortable]","[0.45327743887901306, 0.3156174421310425, 0.14..."
4,The mic is great.,"[sound quality, comfortable, price, battery]","[0.7811059951782227, 0.2074737697839737, 0.005..."


In [None]:
# Tune the batch_size to fit in the memory
batch_size = 4

# Put reviews in a list
sequences = amz_review['review'].to_list()

# Define the candidate labels
candidate_labels = ["sound quality", "battery", "price", "comfortable"]

# Set the hypothesis template
hypothesis_template = "The topic of this review is {}."

# Create an empty list to collect the prediction results of every batch
batch_predictions = []

# Loop through the batches
for i in range(0, len(sequences), batch_size):
    # Append the results of the current slice of reviews
    batch_predictions += classifier(sequences[i:i+batch_size], candidate_labels, hypothesis_template=hypothesis_template)

# Save the output as a dataframe. Without this conversion the variable stays a
# plain list, and code that reads the 'labels' / 'scores' columns from it
# raises a TypeError.
single_topic_prediction = pd.DataFrame(batch_predictions)




In [None]:
# Predicted topic: the first entry of each row's label list
single_topic_prediction['predicted_topic'] = [
    row_labels[0] for row_labels in single_topic_prediction['labels']
]

# Score of the predicted topic: the first entry of each row's score list
single_topic_prediction['predicted_topic_score'] = [
    row_scores[0] for row_scores in single_topic_prediction['scores']
]

# Preview the first rows
single_topic_prediction.head()

Unnamed: 0,sequence,labels,scores,predicted_topic,predicted_topic_score
0,So there is no way for me to plug it in here i...,"[battery, comfortable, sound quality, price]","[0.3193794786930084, 0.272741436958313, 0.2234...",battery,0.319379
1,"Good case, Excellent value.","[sound quality, price, comfortable, battery]","[0.39462438225746155, 0.3236585855484009, 0.27...",sound quality,0.394624
2,Great for the jawbone.,"[comfortable, sound quality, battery, price]","[0.76302570104599, 0.13184957206249237, 0.0590...",comfortable,0.763026
3,Tied to charger for conversations lasting more...,"[battery, sound quality, price, comfortable]","[0.45327743887901306, 0.3156174421310425, 0.14...",battery,0.453277
4,The mic is great.,"[sound quality, comfortable, price, battery]","[0.7811059951782227, 0.2074737697839737, 0.005...",sound quality,0.781106


In [None]:
# Candidate topics each review is scored against
candidate_labels = ["sound quality", "battery", "price", "comfortable"]

# Hypothesis template; the pipeline substitutes each candidate label for {}
hypothesis_template = "The topic of this review is {}."

# All review texts as a plain Python list
sequences = amz_review['review'].to_list()

# Run the classifier with multi_label=True and store the results as a DataFrame
multi_topic_prediction = pd.DataFrame(
    classifier(
        sequences,
        candidate_labels,
        hypothesis_template=hypothesis_template,
        multi_label=True,
    )
)

# Preview the first rows
multi_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,So there is no way for me to plug it in here i...,"[battery, comfortable, sound quality, price]","[0.049397144466638565, 0.036474138498306274, 0..."
1,"Good case, Excellent value.","[price, sound quality, comfortable, battery]","[0.9905472993850708, 0.9873887896537781, 0.982..."
2,Great for the jawbone.,"[comfortable, sound quality, battery, price]","[0.8336957097053528, 0.01715901680290699, 0.00..."
3,Tied to charger for conversations lasting more...,"[battery, sound quality, price, comfortable]","[0.16772010922431946, 0.05891984701156616, 0.0..."
4,The mic is great.,"[sound quality, comfortable, price, battery]","[0.9856133460998535, 0.8641756176948547, 0.000..."


In [None]:
# Minimum score a label must reach to be kept
threshold = 0.6

# Expand the label/score lists so every (review, label, score) gets its own row
exploded = multi_topic_prediction.set_index('sequence').apply(pd.Series.explode)
multi_topic_prediction = exploded.reset_index()

# Keep only the rows whose score is at or above the threshold
meets_threshold = multi_topic_prediction['scores'] >= threshold
multi_topic_prediction = multi_topic_prediction[meets_threshold]

# Preview the first rows
multi_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,"Good case, Excellent value.",price,0.990547
1,"Good case, Excellent value.",sound quality,0.987389
2,"Good case, Excellent value.",comfortable,0.982812
3,Great for the jawbone.,comfortable,0.833696
4,The mic is great.,sound quality,0.985613
