In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install numpy==1.23.5 scipy==1.10.1 tensorflow==2.15.0 keras==2.15

In [None]:
# Essential libraries
import numpy as np
import pandas as pd
import os
import re
import bz2
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# Visualization libraries
#import scikitplot as skplt
import plotly.graph_objects as go
from wordcloud import WordCloud

# NLTK for text processing and sentiment analysis
import nltk
nltk.download('all')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# TensorFlow and Keras for model building
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import (
    LSTM, Bidirectional, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D,
    Flatten, Dropout, Dense, Embedding, SpatialDropout1D, concatenate
)
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Snorkel for weak supervision
!pip install snorkel
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel, MajorityLabelVoter

# Scikit-multilearn for multi-label classification
!pip install scikit-multilearn
from skmultilearn.adapt import MLkNN

# Scikit-learn for evaluation metrics and utilities
from sklearn.metrics import (
    hamming_loss, accuracy_score, precision_score, recall_score, f1_score,
    classification_report, multilabel_confusion_matrix, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Set up your environment
import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

Collecting snorkel
  Downloading snorkel-0.9.9-py3-none-any.whl.metadata (9.7 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.9.9-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.9.9
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
os.chdir("/content/drive/MyDrive/Proj_dataset/archive")
files = os.listdir()
print(files)

['glove.6B.100d.txt', 'Reviews.csv', 'test.ft.txt.bz2', 'train.ft.txt.bz2', 'unlabelled data.csv', 'refined_test_data.csv', 'refined_train_data.csv']


In [None]:
data=bz2.BZ2File("train.ft.txt.bz2")
data = bz2.BZ2File("train.ft.txt.bz2", "r")
data = data.readlines()
len(data)
data=[x.decode('utf-8') for x in data]
data[1:15]

["__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n",
 '__label__2 Amazing!: This soundtrack is my favorite music of all time, hands down. The intense sadness of "Prisoners of Fate" (which means all the more if you\'ve played the game) and the hope in "A Distant Promise" and "Girl who Stole the Star" have been an important inspiration to me personally throughout my teen years. The higher energy tracks like "Chrono Cross ~ Time\'s Scar~", "Time of the Dreamwatch", and "Chronomantique" (indefinably remeniscent of Chrono Tri

In [None]:
labels=[0 if x.split(' ')[0]=="__label__1" else 1 for x in data][:500000]
review_original=[ x.split(' ',1)[1][:-1].lower() for x in data]
review_text=[ x.split(' ',1)[1][:-1].lower() for x in data]
data_zipped=pd.DataFrame(list(zip(labels,review_text)),columns=["Rating","Review_Text"])
data_zipped

Unnamed: 0,Rating,Review_Text
0,1,stuning even for the non-gamer: this sound tra...
1,1,the best soundtrack ever to anything.: i'm rea...
2,1,amazing!: this soundtrack is my favorite music...
3,1,excellent soundtrack: i truly like this soundt...
4,1,"remember, pull your jaw off the floor after he..."
...,...,...
499995,0,prepare for offence: ridley's conjecture and s...
499996,1,this is a great song but buy the album.: trust...
499997,0,what happen to fram gas filter's quality ???: ...
499998,0,no filter on 2003 hyundai xg350l!!!: there is ...


In [None]:
data_filtered = data_zipped[data_zipped['Review_Text'].str.contains('product')]
data_filtered=data_filtered[~data_filtered['Review_Text'].str.contains('music|game|soundtrack|cd|song|book|album|story|film|dvd|disc|narrative|movie|track|henry|theatre|record|author|reading|writing|read|character|video|novel|travel|dishes|rap|beat|c.d.|essay|studio|watched|star|comedy|comedian|hollywood|thriller|audience|tv|series|economics|government|chapter|narrating|thomas|cast|script|stories|cinimatography|show|published|stream|reader|kindle|reading|drama|literary|adam|sandler|episodes|watch|recording|charlie brown|stars|charlie brown|show|watched|dialogue|act|voice|singer|band|microphone|bands|season|acted|artist|voice|listeners|lyrics|scene|hits|rockers|fantacy|news|plot|twist|intellectual|james brown|brown|narratione')]
len(data_filtered)

15536

In [None]:
os.chdir("/content/drive/MyDrive/Proj_dataset")
files = os.listdir()
print(files)

['base_paper1.pdf', 'archive', 'results', 'new_senti_train_data.csv', 'amazon_tagged_data.csv']


In [None]:
label_data = pd.read_csv("amazon_tagged_data.csv",encoding = 'cp1252')
len(label_data)
#col = ['Aspect_Sentiment']
#label_data.drop(columns=col, inplace=True)
label_data.to_csv('amazon_tagged_data.csv', index=False)
label_data.head()

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment
0,1,"I am loving the size of this shirt ,fits me well",,,"size,fits",,,"size,fits,",POSITIVE
1,0,"The quality of this toy is very bad,it is broken",,,,,"quality,broken","quality,broken",NEGATIVE
2,0,"The manual instructions are very bad ,wish to ...",,,,"manual,instructions,contact,seller",,"manual,instructions,contact,seller",NEGATIVE
3,1,"The cost is very less for this chair ,only 10$",,"cost,",,,,"cost,$",POSITIVE
4,0,This chair is very useful and functions very well,"useful,functions",,,,,"useful,functions",MIXED


In [None]:
label_data['Aspect_Terms']=label_data['Aspect_Terms'].str.replace("nan,","")
label_data['Aspect_Terms']=label_data['Aspect_Terms'].str.replace("nan","")
for col in label_data.columns:
 label_data[col].fillna("NotFound",inplace=True)
label_data.head(5)

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment
0,1,"I am loving the size of this shirt ,fits me well",NotFound,NotFound,"size,fits",NotFound,NotFound,"size,fits,",POSITIVE
1,0,"The quality of this toy is very bad,it is broken",NotFound,NotFound,NotFound,NotFound,"quality,broken","quality,broken",NEGATIVE
2,0,"The manual instructions are very bad ,wish to ...",NotFound,NotFound,NotFound,"manual,instructions,contact,seller",NotFound,"manual,instructions,contact,seller",NEGATIVE
3,1,"The cost is very less for this chair ,only 10$",NotFound,"cost,",NotFound,NotFound,NotFound,"cost,$",POSITIVE
4,0,This chair is very useful and functions very well,"useful,functions",NotFound,NotFound,NotFound,NotFound,"useful,functions",MIXED


In [None]:
def remove_duplicate(x):
  return list(dict.fromkeys(x))

usability_aspect_terms=','.join([i.strip() for i in label_data['Usability'] if i != "NotFound"]).split(',')
usability_aspect_terms=remove_duplicate([x.strip() for x in usability_aspect_terms if x!=''])

price_aspect_terms=','.join([i.strip() for i in label_data['Price'] if i != "NotFound"]).split(',')
price_aspect_terms=remove_duplicate([x.strip() for x in price_aspect_terms if x!=''])

size_aspect_terms=','.join([i.strip() for i in label_data['Size'] if i != "NotFound"]).split(',')
size_aspect_terms=remove_duplicate([x.strip() for x in size_aspect_terms if x!=''])

service_aspect_terms=','.join([i.strip() for i in label_data['Service'] if i != "NotFound"]).split(',')
service_aspect_terms=remove_duplicate([x.strip() for x in service_aspect_terms if x!=''])

quality_Aspect_terms=','.join([i.strip() for i in label_data['Quality'] if i != "NotFound"]).split(',')
quality_Aspect_terms=remove_duplicate([x.strip() for x in quality_Aspect_terms if x!=''])
size_aspect_terms

['size',
 'fits',
 'heavy',
 'sizes',
 'chart',
 'smaller',
 'large',
 'feet',
 'big',
 'fit',
 'longer',
 'small',
 'tiny',
 'width',
 'thin',
 'taller',
 'tight',
 'small inch',
 'skinny',
 'hefty',
 'long',
 'xl',
 'length',
 'ft',
 'inches',
 'measurement',
 'streched',
 'medium',
 'xlarge',
 'sized',
 'smaller size',
 'pound',
 'tall',
 'tightness',
 'bulky',
 'sizing',
 'measure',
 'shorter',
 'short',
 'tighter',
 'inch',
 'size chart',
 'xs',
 'high',
 'measured',
 'stouter',
 'wider',
 'x-large',
 'mediums',
 'bigger',
 'foot',
 'lower',
 'height',
 'lowering',
 'fitted',
 'higher',
 'lowered',
 'larger']

In [None]:
label_data.head(5)

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment
0,1,"I am loving the size of this shirt ,fits me well",NotFound,NotFound,"size,fits",NotFound,NotFound,"size,fits,",POSITIVE
1,0,"The quality of this toy is very bad,it is broken",NotFound,NotFound,NotFound,NotFound,"quality,broken","quality,broken",NEGATIVE
2,0,"The manual instructions are very bad ,wish to ...",NotFound,NotFound,NotFound,"manual,instructions,contact,seller",NotFound,"manual,instructions,contact,seller",NEGATIVE
3,1,"The cost is very less for this chair ,only 10$",NotFound,"cost,",NotFound,NotFound,NotFound,"cost,$",POSITIVE
4,0,This chair is very useful and functions very well,"useful,functions",NotFound,NotFound,NotFound,NotFound,"useful,functions",MIXED


In [None]:
aspects=[]
for i in range(0,len(label_data)):
  aspect_values=[]
  invalid_data=0
  for col in ['Usability','Price','Size','Service','Quality']:
    if label_data.loc[i,col] != "NotFound":
      aspect_values.append(col)
      invalid_data+=1
  if(invalid_data==0):
    print("the row %d is insufficient data",i)
  else:
    aspect_values.sort()
    asp=','.join(aspect_values)
    aspects.append(asp)
print(aspects)
label_data['Aspect_Category']=aspects
label_data.head(5)

['Size', 'Quality', 'Service', 'Price', 'Usability', 'Quality,Size', 'Service,Size,Usability', 'Price,Usability', 'Price,Quality,Service,Size', 'Price,Quality,Size', 'Price,Usability', 'Usability', 'Size', 'Price,Quality,Usability', 'Size,Usability', 'Size,Usability', 'Service,Usability', 'Quality', 'Usability', 'Price', 'Usability', 'Service,Size', 'Quality', 'Price,Quality,Service', 'Quality,Service,Size', 'Usability', 'Usability', 'Quality', 'Quality', 'Size,Usability', 'Price,Service', 'Service', 'Quality,Service', 'Usability', 'Quality,Size,Usability', 'Service,Size', 'Usability', 'Service', 'Size,Usability', 'Usability', 'Price,Usability', 'Size', 'Service', 'Service', 'Service', 'Quality,Service', 'Usability', 'Quality,Usability', 'Usability', 'Price,Usability', 'Quality,Usability', 'Price,Usability', 'Quality', 'Quality,Usability', 'Usability', 'Price,Service', 'Price', 'Price,Quality,Size,Usability', 'Price,Quality', 'Quality,Usability', 'Size,Usability', 'Size,Usability', 'Pr

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category
0,1,"I am loving the size of this shirt ,fits me well",NotFound,NotFound,"size,fits",NotFound,NotFound,"size,fits,",POSITIVE,Size
1,0,"The quality of this toy is very bad,it is broken",NotFound,NotFound,NotFound,NotFound,"quality,broken","quality,broken",NEGATIVE,Quality
2,0,"The manual instructions are very bad ,wish to ...",NotFound,NotFound,NotFound,"manual,instructions,contact,seller",NotFound,"manual,instructions,contact,seller",NEGATIVE,Service
3,1,"The cost is very less for this chair ,only 10$",NotFound,"cost,",NotFound,NotFound,NotFound,"cost,$",POSITIVE,Price
4,0,This chair is very useful and functions very well,"useful,functions",NotFound,NotFound,NotFound,NotFound,"useful,functions",MIXED,Usability


In [None]:
train_tagged_data = label_data[:400]
len(train_tagged_data)
#train_tagged_data.head()

400

In [None]:
test_tagged_data = label_data[400:500]
len(test_tagged_data)

100

In [None]:
# Count the occurrences of each sentiment type in the 'Aspect_Sentiment' column
sentiment_counts = train_tagged_data['Aspect_Sentiment'].value_counts()

# Display the counts for each sentiment type
print(sentiment_counts)

Aspect_Sentiment
POSITIVE    180
NEGATIVE    110
MIXED       110
Name: count, dtype: int64


Choose balanced data

In [None]:
# Filter rows for each sentiment and sample 128 rows from each category
positive_samples = train_tagged_data[train_tagged_data['Aspect_Sentiment'] == 'POSITIVE'].sample(110, random_state=42)
negative_samples = train_tagged_data[train_tagged_data['Aspect_Sentiment'] == 'NEGATIVE'].sample(110, random_state=42)
mixed_samples = train_tagged_data[train_tagged_data['Aspect_Sentiment'] == 'MIXED'].sample(110, random_state=42)

# Concatenate the samples into a single DataFrame
train_tagged_data = pd.concat([positive_samples, negative_samples, mixed_samples], ignore_index=True)
len(train_tagged_data)
# Display the result
print(train_tagged_data)


     Rating                                        Review_Text      Usability  \
0         1  peek a fun!: we got these as a gift for our on...          moves   
1         1  great for the money: the picture on this camer...       NotFound   
2         1  easy and worth the price.: these took about 2 ...       install,   
3         1  awsome!!: really a life saver when it comes to...           work   
4         1  "super" fun: rated for 3+ but my 2 and 3 yr. o...  playing,using   
..      ...                                                ...            ...   
325       0  d'oh: every year i think we get going too quic...       NotFound   
326       0  small didn't work: it is very small compared t...           work   
327       0  not what i expected: i bought this boombox for...  functionality   
328       0  burn rubber on me: i went by the squeem chart ...           wear   
329       0  this charger feels like a vending machine toy....       NotFound   

           Price           

In [None]:
senti_train_data = train_tagged_data[['Rating', 'Review_Text']]
senti_train_data.head(5)

Unnamed: 0,Rating,Review_Text
0,1,peek a fun!: we got these as a gift for our on...
1,1,great for the money: the picture on this camer...
2,1,easy and worth the price.: these took about 2 ...
3,1,awsome!!: really a life saver when it comes to...
4,1,"""super"" fun: rated for 3+ but my 2 and 3 yr. o..."


Aspect keyword dictionary

In [None]:
aspect_keywords = {
    'usability': ['difficult', 'complicated','intuitive','useful',
 'functions',
 'wear',
 'pulled',
 'workout',
 'roll down',
 'rolled up down',
 'flexibility',
 'using',
 'useless',
 'work',
 'making',
 'function',
 'holding',
 'playing',
 'comfy',
 'fun',
 'uncomfortable',
 'taste',
 'tastes',
 'performance',
 'use',
 'moves',
 'works',
 'learning tool',
 'stacking',
 'knocking',
 'playtime',
 'spin',
 'twirl',
 'educational',
 'worked',
 'rotate',
 'used',
 'played',
 'boring',
 'portable',
 'comfortable',
 'worn',
 'play',
 'assemble',
 'drag',
 'absorbs',
 'pull',
 'wearing',
 'pushed',
 'flexible',
 'roll up',
 'wore',
 'squeezed',
 'hold',
 'feel',
 'holds',
 'bruising',
 'pushing',
 'working',
 'push',
 'put it on',
 'pulling',
 'hurts',
 'lifting',
 'put on',
 'does the job',
 'felt',
 'burned',
 'eating',
 'attention',
 'user friendly',
 'feeding',
 'warming',
 'disassemble',
 'functional',
 'effective',
 'assembly',
 'easy to put',
 'assembled',
 'operated',
 'scrubbed',
 'roll over',
 'flickering',
 'stopped',
 'stayed',
 'usage',
 'plugged',
 'stops working',
 'job',
 'functionality',
 'install',
 'uses',
 'turning',
 'feels',
 'programmed',
 'riding',
 'unstable',
 'design',
 'installation',
 'installed',
 'assembling',
 'installing',
 'designed',
 'usable',
 'User-friendly',
  'Intuitive',
'user friendly',
  'Easy to use',
  'Straightforward',
  'Simple interface',
  'Accessible',
  'Convenient',
  'Seamless experience',
  'Effortless',
  'Responsive',
  'Smooth navigation',
  'Learning curve',
  'Customizable',
  'Ergonomic',
 'Clear instructions',
  'Interactive',
  'Efficient',
 'Time-saving',
  'Functional',
'Cluttered'],
    'price': ['cost',
 'price',
 'inexpensive',
 'investment',
 'money',
 'penny',
 'pay',
 'cheap',
 'spent',
 'pricy',
 'priced',
 'expensive',
 'cheaper',
 'costs',
 'cheapest',
 'free',
 'paid',
 'dollar',
 'overpriced',
 'bucks','over priced',
 'pricing',
 'budget',
 'tax',
 'Money',
 '0',
 'expense',
 'costly',
 'fee',
'Affordable',
 'Expensive',
'Cheap',
'Budget-friendly','budget friendly',
 'Cost-effective',
'cost effective',
  'Overpriced',
'Reasonable',
 'Inexpensive',
  'Value for money',
 'Premium-priced',
'premium priced',
'Worth the price',
  'High-priced',
'high priced',
  'Economical',
'Competitive pricing',
'Fair price',
 'Low-cost','low cost',
 'Steep price',
 'Bargain',
 'Mid-range pricing',
'Exorbitant'],
    'size': ['size',
 'fits',
 'heavy',
 'sizes',
 'chart',
 'smaller',
'Compact design',
'Snug',
'Oversized',
'Mini',
'Clunky',
'Too small',
'Too large',
'perfectly',
'slim',
 'large',
 'feet',
 'big',
 'fit',
 'longer',
 'small',
 'tiny',
 'width',
 'thin',
 'taller',
 'tight',
 'small inch',
 'skinny',
 'hefty',
 'long',
 'xl',
 'length',
 'ft',
 'inches',
 'measurement',
 'streched',
 'medium',
'xlarge',
 'sized',
 'smaller size',
'Gigantic',
 'pound',
 'tall',
 'tightness',
 'bulky',
 'sizing',
 'measure',
 'shorter',
 'short',
 'tighter',
 'inch',
 'size chart',
 'xs',
 'high',
 'measured',
 'stouter',
 'wider',
 'x-large',
 'mediums',
 'bigger',
 'foot',
 'lower',
 'height',
 'lowering',
 'fitted',
 'higher',
 'lowered',
'Cumbersome',
 'larger'],
    'service': ['customer service', "support", "help", "assistance"'manual',
 'instructions',
 'contact',
 'seller',
 'shipping',
 'return',
 'arrived',
 'cardboard box',
 'box',
 'packaging',
 'packaged',
 'date',
 'contacted',
 'response',
 'refund',
 'apologized',
 'trust',
 'duplicate',
 'delivered',
 'advertisement',
 'instructional',
 'description',
 'policy',
 'unprofessional',
 'advertised',
 'pollicies',
 'replied',
 'fake',
 'company',
 'missing',
 'production',
 'consumers',
 'waiting',
 'warranty',
 'email',
 'advertising',
 'shipped',
 'misleading',
 'packed',
 'customer',
 'service',
 'replacing',
 'returns',
 'sent back',
 'advertized',
 'exchange',
 'package',
 'ship',
 'advertises',
 'comply',
 'contacting',
 'respond',
 'delivery',
 'dellivery',
 'warn',
 'described',
 'details',
 'miswire',
 'manufacturer',
 'tech staff',
 'mentioned',
 'lack',
 'faulty',
 'arrive',
 'repairable',
 'emails',
 'calls',
 'companies',
 'reply',
 'inquiries',
 'customer service',
 'receipt',
 'customers',
 'manufacture',
 'manufacturers',
 'apology',
 'receive',
 'mention',
 'condition',
 'unboxed',
 'misrepresented',
 'timely',
 'misunderstood',
 'specify',
 'packing',
 'reliable',
 'sealed',
 'refunded',
 'serviced',
 'refurbished',
 'reimbursement',
 'reported',
 'emailed',
 'shipment',
 'explained',
 'specs',
 'miss-leading','miss leading',
 'Responsive',
  'Unhelpful',
  'Friendly',
 'Rude',
 'Knowledgeable',
  'Incompetent',
' Polite',
'Prompt',
  'Slow',
'Efficient',
'Unresponsive',
'Fast',
'Delayed',
'On-time',
'Late',
'Efficient',
'Well-packaged',
' Damaged in transit',
'Safe',
 'Smooth',
 'Poor handling',
 'Excellent follow-up','excellent followup',
 'Delayed response',
'Poor after-care',
'Supportive',
'Neglectful',
'Warranty fulfillment',
'Difficult return process',
 'Great replacement service',
 'Transparent',
 'Lack of updates',
 'Frequent follow-ups',
' No contact'],
    'quality': ['high quality', 'poor quality', 'well-made', 'durable', 'cheap material','quality',
 'broken',
 'tore',
 'lasts',
 'inferior',
 'solid',
 'brass',
 'scraped',
 'smells',
 'delicate',
 'plastic',
 'stiff',
 'tolerate',
 'textured',
 'chinsy',
 'blunt',
 'sharp edges',
 'sharp',
 'waterproof',
 'soft',
 'smell',
 'smooths',
 'broke',
 'poor',
 'textureline',
 'fabric',
 'scratched',
 'metal',
 'smooth',
 'damage',
 'poorly',
 'flimsy',
 'weak',
 'blur',
 'stainless steel',
 'rubbery',
 'rubber',
 'material',
 'sturdy',
 'repair',
 'defective',
 'wrinkles',
 'smelled',
 'described',
 'last longer',
 'mark',
 'uneffected',
 'undamaged',
 'strong',
 'durable',
 'thick',
 'poorest',
 'damaged',
 'break',
 'thicker',
 'reliable',
 'low-grade',
 'tarnish',
 'breaks',
 'lasted',
 'leather',
 'conductive',
 'steady',
 'latex',
 'sleek',
 'weaker',
 'melted',
 'steel',
 'came off',
 'quallity',
 'cotton',
 'overheating',
 'lather',
 'rusted',
 'durability',
 'poor quality',
 'materials',
 'rubberized',
 'lasting',  'Flimsy',
  'Sturdy',
  'Weak',
  'Cheap',
  'Solid',
  'Fragile',
  'Heavy-duty',
 'high-quality',
  'substandard',
  'premium',
  'inferior',
  'top-notch',
  'poor-quality',
  'reliable',
  'cheap materials',
  'luxurious',
  'Wear-resistant','wear resistent',
  'Reliable',
  'Faulty',
  'Efficient',
  'Unreliable',
  'Smooth operation',
  'Problematic',
  'Consistent',
  'Malfunctioning',
  'High-performance','high performance',
  'High-quality','high quality',
  'Well-finished','well finished',
  'Low-quality performance',
  'Low-quality','low quality',
  'Rough',
  'Sleek',
  'Poor craftsmanship',
  'Polished',
  'Scratched',
  'Elegant',
  'Shoddy',
'Clean',
  'Breaks easily',
  'Wears out quickly',
  'Enduring',
  'Short lifespan',
  'Resilient',
  'Prone to damage',
  'Retains quality',
  'Fades quickly',
  'Maintains durability',
  'Works perfectly',
  'Defective',
  'Smooth functioning',
 'Prone to malfunction',
  'Glitchy',
  'Operational issues',
  'Performs as expected',
  'Faulty mechanism',
  'Seamless performance',
 'Unreliable performance'],
    'durability': [ 'Sturdy',
  'Solid build',
  'Well-constructed',
  'Tough',
  'Reinforced',
  'Reliable',
  'Long-lasting',
  'Flimsy',
  'Breaks easily',
  'Delicate',
  'Poorly made',
  'Brittle',
  'Weak materials',
  'Resistant to wear',
  'Scratch-resistant',
  'Tear-resistant',
  'Fades over time',
  'Worn out quickly',
  'Dents/scratches',
  'Holds up well',
  'Durable over time',
  'Short lifespan',
  'Long-lasting performance',
  'Fails after a few uses',
  'Survives heavy use',
  'Withstands daily wear',
    'High-quality materials',
  'Poor-quality'
  'components',
  'Heavy-duty',
  'Lightweight but durable',
  'Cheap materials',
'Durable finish'],
    'packing': [  'Well-packaged',
  'Poorly packaged',
  'Sturdy',
  'Fragile',
  'Durable',
  'Flimsy',
  'Protective',
  'Robust',
  'Substandard',
  'Intact',
  'Damaged',
  'Secure',
  'Crushed',
  'Dented',
  'Pristine',
  'Scratched',
  'Broken',
  'Attractive packaging',
  'Neat',
  'Elegant',
  'Basic',
  'Professional',
  'Cheap-looking','cheap looking',
  'Overpackaged',
  'Minimalistic',
  'Eye-catching','eye catching',
  'Eco-friendly','eco friendly',
  'Excessive plastic',
  'Biodegradable',
  'Recyclable',
  'Wasteful',
  'Environmentally conscious',
  'Sustainable',
  'Non-recyclable',
  'Well-secured','well secured',
  'Loose',
  'Protective padding',
  'No padding',
  'Secure fit',
  'Items shifted',
  'Unstable',
 'Safely packed'],
  }

In [None]:
def extract_keywords(review, keywords):
    # For each keyword, check if it exists in the review and return matched keyword
    found_keywords = [keyword for keyword in keywords if re.search(rf"\b{keyword.lower()}\b", review.lower())]

    return ', '.join(found_keywords) if found_keywords else None

# Extract aspect terms and store them in respective columns
for aspect, keywords in aspect_keywords.items():
    # Apply the extraction function for each review
    senti_train_data[aspect] = senti_train_data['Review_Text'].apply(lambda review: extract_keywords(review, keywords))

# Display the DataFrame with extracted aspect terms
senti_train_data.head(5)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,durability,packing
0,1,peek a fun!: we got these as a gift for our on...,"fun, use",,high,,tolerate,,
1,1,great for the money: the picture on this camer...,,"price, money, Money",,,,,
2,1,easy and worth the price.: these took about 2 ...,"rotate, install","price, Worth the price",,"contact, arrived",,,
3,1,awsome!!: really a life saver when it comes to...,work,"money, Money",,,,,
4,1,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","using, making, playing, fun, play","cheap, Cheap",,box,"plastic, Cheap",,Basic


Aspect Category

In [None]:
senti_train_data = senti_train_data.reset_index(drop=True)

# Initialize aspects with empty strings to match the length of data_filtered
aspects = [''] * len(senti_train_data)

for i in range(len(senti_train_data)):
    aspect_values = []
    for col in ['usability', 'price', 'size', 'service', 'quality', 'durability', 'packing']:
        if pd.notna(senti_train_data.loc[i, col]) and senti_train_data.loc[i, col] != "NotFound":
            aspect_values.append(col)

    if aspect_values:
        aspect_values.sort()
        asp = ','.join(aspect_values)
        aspects[i] = asp  # Update the aspects list at the correct index

senti_train_data['Aspect_Category'] = aspects
senti_train_data.head(5)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,durability,packing,Aspect_Category
0,1,peek a fun!: we got these as a gift for our on...,"fun, use",,high,,tolerate,,,"quality,size,usability"
1,1,great for the money: the picture on this camer...,,"price, money, Money",,,,,,price
2,1,easy and worth the price.: these took about 2 ...,"rotate, install","price, Worth the price",,"contact, arrived",,,,"price,service,usability"
3,1,awsome!!: really a life saver when it comes to...,work,"money, Money",,,,,,"price,usability"
4,1,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","using, making, playing, fun, play","cheap, Cheap",,box,"plastic, Cheap",,Basic,"packing,price,quality,service,usability"


Lexicon Dictionary

In [None]:
aspect_lexicon = {
    'aspect_positive' : ['intuitive',
'useful',
'functions',
'comfy',
'fun',
'performance',
'learning tool',
'educational',
'portable',
'comfortable',
'user friendly',
'functional',
'effective',
'easy to put',
'seamless experience',
'effortless',
'responsive',
'smooth navigation',
'customizable',
'ergonomic',
'clear instructions',
'interactive',
'efficient',
'time-saving','Compact design',
'Perfectly',
'Slim',
'Fit',
'Fitted',
'Wider',
'Medium',
'Mediums',
'Bigger',
'Longer',
'Taller','Responsive',
'Friendly',
'Knowledgeable',
'Polite',
'Prompt',
'Efficient',
'Fast',
'On-time',
'Well-packaged',
'Excellent follow-up',
'Supportive',
'Warranty fulfillment',
'Great replacement service',
'Transparent',
'Timely',
'Reliable',
'Safe',
'Smooth',
'Serviced',
'Reimbursement',
'Refunded',
'Trust',
'Packaged',
'Shipped','high quality',
'well-made',
'durable',
'solid',
'brass',
'waterproof',
'soft',
'smooth',
'sturdy',
'last longer',
'uneffected',
'undamaged',
'strong',
'thick',
'reliable',
'lasted',
'conductive',
'steady',
'sleek',
'durability',
'lasting',
'Sturdy',
'Solid',
'Heavy-duty',
'High-quality',
'Premium',
'Top-notch',
'Reliable',
'Luxurious',
'Wear-resistant',
'Efficient',
'Smooth operation',
'Consistent',
'High-performance',
'Well-finished',
'Sleek',
'Elegant',
'Clean',
'Enduring',
'Resilient',
'Retains quality',
'Maintains durability',
'Works perfectly',
'Smooth functioning',
'Seamless performance','Well-packaged',
'Sturdy',
'Durable',
'Protective',
'Robust',
'Intact',
'Attractive packaging',
'Neat',
'Elegant',
'Professional',
'Eco-friendly',
'Biodegradable',
'Recyclable',
'Environmentally conscious',
'Sustainable',
'Well-secured',
'Safely packed',
'Secure fit','Sturdy',
'Solid build',
'Well-constructed',
'Tough',
'Reinforced',
'Reliable',
'Long-lasting',
'Resistant to wear',
'Scratch-resistant',
'Tear-resistant',
'Holds up well',
'Durable over time',
'Long-lasting performance',
'Survives heavy use',
'Withstands daily wear',
'High-quality materials',
'Heavy-duty',
'Lightweight but durable',
'Durable finish'],
    'aspect_negative':['difficult',
'complicated',
'useless',
'boring',
'uncomfortable',
'bruising',
'unstable',
'cluttered',
'hurts',
'stopped',
'stopped working','Affordable',
'Budget-friendly',
'Cost-effective',
'Reasonable',
'Inexpensive',
'Value for money',
'Bargain',
'Competitive pricing',
'Fair price',
'Economical','Cheap',
'Pricy',
'Expensive',
'Overpriced',
'Costly',
'High-priced',
'Premium-priced',
'Steep price',
'Exorbitant',
'Cheaper',
'Cheapest','Heavy',
'Smaller',
'Oversized',
'Mini',
'Too small',
'Too large',
'Tiny',
'Small',
'Skinny',
'Shorter',
'Short',
'Cumbersome',
'Stouter',
'Tight',
'Larger',
'Gigantic','Unhelpful',
'Rude',
'Incompetent',
'Slow',
'Delayed',
'Late',
'Unresponsive',
'Poor handling',
'Damaged in transit',
'Poor after-care',
'Neglectful',
'Difficult return process',
'Lack of updates',
'No contact',
'Misleading',
'Fake',
'Unprofessional',
'Apologized',
'Refund',
'Misrepresented',
'Miswire',
'Faulty',
'Emails',
'Calls',
'Poorly packed',
'Broken',
'Warranty issues',
'Misunderstood',
'Poor policies',
'Damaged','poor quality',
'cheap material',
'broken',
'tore',
'inferior',
'scraped',
'smells',
'delicate',
'stiff',
'chinsy',
'blunt',
'sharp edges',
'broke',
'poor',
'damage',
'poorly',
'flimsy',
'weak',
'blur',
'defective',
'wrinkles',
'smelled',
'poorest',
'damaged',
'break',
'low-grade',
'tarnish',
'breaks',
'weaker',
'melted',
'quallity',
'overheating',
'rusted',
'Poor-quality',
'Faulty',
'Unreliable',
'Problematic',
'Malfunctioning',
'Low-quality performance',
'Rough',
'Poor craftsmanship',
'Shoddy',
'Breaks easily',
'Wears out quickly',
'Short lifespan',
'Prone to damage',
'Fades quickly',
'Defective',
'Prone to malfunction',
'Glitchy',
'Operational issues',
'Faulty mechanism',
'Unreliable performance','Poorly packaged',
'Fragile',
'Flimsy',
'Substandard',
'Damaged',
'Crushed',
'Dented',
'Scratched',
'Broken',
'Cheap-looking',
'Overpackaged',
'Excessive plastic',
'Wasteful',
'Non-recyclable',
'Loose',
'No padding',
'Items shifted',
'Unstable','Flimsy',
'Breaks easily',
'Delicate',
'Poorly made',
'Brittle',
'Weak materials',
'Fades over time',
'Worn out quickly',
'Dents/scratches',
'Short lifespan',
'Fails after a few uses',
'Poor-quality',
'Cheap materials'],
    'aspect_neutral':['wear',
'pulled',
'workout',
'roll down',
'rolled up down',
'flexibility',
'using',
'work',
'making',
'function',
'holding',
'playing',
'moves',
'works',
'stacking',
'knocking',
'playtime',
'spin',
'twirl',
'worked',
'rotate',
'used',
'played',
'drag',
'absorbs',
'pull',
'wearing',
'pushed',
'flexible',
'roll up',
'wore',
'squeezed',
'hold',
'feel',
'holds',
'pushing',
'working',
'push',
'put it on',
'pulling',
'lifting',
'put on',
'does the job',
'felt',
'burned',
'eating',
'attention',
'feeding',
'warming',
'disassemble',
'assembly',
'installed',
'assembling',
'installing',
'designed',
'usable',
'job',
'functionality',
'install',
'uses',
'turning',
'feels',
'programmed',
'riding','Cost',
'Price',
'Investment',
'Money',
'Penny',
'Pay',
'Priced',
'Pricing',
'Budget',
'Tax',
'Expense',
'Fee',
'Paid',
'Dollar',
'Bucks',
'Costs',
'Mid-range pricing','Size',
'Fits',
'Sizes',
'Chart',
'Measurement',
'Sizing',
'Length',
'Width',
'Feet',
'Inches',
'Pound',
'Height',
'Stretched',
'XL',
'X-large',
'XS',
'Small inch',
'Long',
'Short',
'Tightness',
'Tighter',
'Lowering',
'Lowered',
'Hefty',
'Bulky',
'Clunky','Customer service',
'Support',
'Help',
'Assistance',
'Contact',
'Seller',
'Shipping',
'Return',
'Arrived',
'Box',
'Packaging',
'Contacted',
'Delivered',
'Description',
'Reply',
'Shipment',
'Manufacturer',
'Mention',
'Condition',
'Emailed',
'Described',
'Details',
'Companies',
'Receipt',
'Customers',
'Manufacture',
'Returned',
'Shipment','quality',
'plastic',
'tolerate',
'textured',
'sharp',
'fabric',
'metal',
'smooths',
'stainless steel',
'rubbery',
'rubber',
'material',
'repair',
'described',
'mark',
'leather',
'latex',
'steel',
'came off',
'cotton',
'lather',
'materials',
'rubberized',
'cotton','Basic',
'Minimalistic']
}

Sentiment classification with respect to lexicon

In [None]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
import re
import pandas as pd

# Function to get WordNet POS tag for SentiWordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# Function to classify sentiment based on found keywords using SentiWordNet
def classify_sentiment(found_keywords, lexicon):
    pos_count, neg_count, neu_count = 0, 0, 0

    for keyword in found_keywords:
        if keyword in lexicon['aspect_positive']:
            pos_count += 1
        elif keyword in lexicon['aspect_negative']:
            neg_count += 1
        elif keyword in lexicon['aspect_neutral']:
            neu_count += 1

    # Return sentiment based on counts
    if pos_count > neg_count and pos_count > neu_count:
        return 'positive'
    elif neg_count > pos_count and neg_count > neu_count:
        return 'negative'
    else:
        return 'neutral'

# Function to extract keywords and classify sentiment
def extract_keywords_and_classify(review, keywords, lexicon):
    found_keywords = [keyword for keyword in keywords if re.search(rf"\b{keyword.lower()}\b", review.lower())]
    if found_keywords:
        return classify_sentiment(found_keywords, lexicon)
    return 'neutral'


# Define the scoring for each aspect
aspect_scores = {
    'quality': {'positive': 1, 'negative': -1, 'neutral': -0.2},
    'price': {'positive': 0.8, 'negative': -0.8, 'neutral': 0.1},
    'size': {'positive': 1, 'negative': -1, 'neutral': 0},
    'usability': {'positive': 1, 'negative': -1, 'neutral': 0},
    'service': {'positive': 0.7, 'negative': -0.7, 'neutral': 0},
    'durability': {'positive': 1, 'negative': -1, 'neutral': 0.1},
    'packing': {'positive': 0.8, 'negative': -0.8, 'neutral': 0},
}

# Apply sentiment classification and scoring to each aspect column
for aspect in aspect_scores.keys():
  senti_train_data[f'{aspect}_sentiment'] = senti_train_data['Review_Text'].apply(
        lambda review: extract_keywords_and_classify(review, aspect_keywords[aspect], aspect_lexicon)
  )
  senti_train_data[f'{aspect}_score'] = senti_train_data[f'{aspect}_sentiment'].apply(lambda sentiment: aspect_scores[aspect][sentiment])

# Define the rule for overall score calculation
def apply_overall_score(row):
    return sum(row[f'{aspect}_score'] for aspect in aspect_scores.keys())

# Apply overall score calculation
senti_train_data['overall_score'] = senti_train_data.apply(apply_overall_score, axis=1)


# Function to classify overall sentiment based on the overall score and Rating
def classify_overall_sentiment(row):
    score = row['overall_score']
    rating = row['Rating']

    if rating == 1 and score > 0.3:
        return 'positive'
    elif score < -0.3:
        return 'negative'
    else:
        return 'neutral'# Classify overall sentiment based on the overall score and Rating
senti_train_data['overall_sentiment'] = senti_train_data.apply(classify_overall_sentiment, axis=1)

# Classify overall sentiment based on the overall score and Rating
senti_train_data['overall_sentiment'] = senti_train_data.apply(classify_overall_sentiment, axis=1)

# Check the result
senti_train_data.tail(5)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,durability,packing,Aspect_Category,...,usability_sentiment,usability_score,service_sentiment,service_score,durability_sentiment,durability_score,packing_sentiment,packing_score,overall_score,overall_sentiment
325,0,d'oh: every year i think we get going too quic...,making,free,,,,,,"price,usability",...,neutral,0,neutral,0.0,neutral,0.1,neutral,0.0,0.0,neutral
326,0,small didn't work: it is very small compared t...,work,,small,,,,,"size,usability",...,neutral,0,neutral,0.0,neutral,0.1,neutral,0.0,0.0,neutral
327,0,not what i expected: i bought this boombox for...,functionality,,,,"durable, quality, durable",,Durable,"packing,quality,usability",...,neutral,0,neutral,0.0,neutral,0.1,positive,0.8,2.0,neutral
328,0,burn rubber on me: i went by the squeem chart ...,wear,,"size, chart, fit, tight, bulky","return, refund, exchange","rubber, smelled",,,"quality,service,size,usability",...,neutral,0,neutral,0.0,neutral,0.1,neutral,0.0,0.0,neutral
329,0,this charger feels like a vending machine toy....,"felt, feels",,,,poorly,Poorly made,,"durability,quality,usability",...,neutral,0,neutral,0.0,negative,-1.0,neutral,0.0,-1.9,negative


In [None]:
positive_count_overall = senti_train_data['overall_sentiment'].value_counts().get('positive', 0)
negative_count_overall = senti_train_data['overall_sentiment'].value_counts().get('negative', 0)
neutral_count_overall = senti_train_data['overall_sentiment'].value_counts().get('neutral', 0)

print(f"Overall Sentiment counts - Positive: {positive_count_overall}, Negative: {negative_count_overall}, Neutral: {neutral_count_overall}")

Overall Sentiment counts - Positive: 29, Negative: 95, Neutral: 206


In [None]:

train_tagged_data.head()

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category
0,1,peek a fun!: we got these as a gift for our on...,moves,NotFound,NotFound,NotFound,tolerate,"moves,tolerate",POSITIVE,"Quality,Usability"
1,1,great for the money: the picture on this camer...,NotFound,"money,price",NotFound,NotFound,NotFound,"money,price",POSITIVE,Price
2,1,easy and worth the price.: these took about 2 ...,"install,","price,",NotFound,NotFound,NotFound,"install,price,$",POSITIVE,"Price,Usability"
3,1,awsome!!: really a life saver when it comes to...,work,money,NotFound,NotFound,NotFound,"work,money",POSITIVE,"Price,Usability"
4,1,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","playing,using","cheap,",NotFound,box,plastic,"playing,using,cheap,$,box,plastic",POSITIVE,"Price,Quality,Service,Usability"


In [None]:
test_tagged_data = label_data[400:500]
len(test_tagged_data)

100

# Snorkel to create labels for unlabelled data

In [None]:
ABSTAIN = -1
PRICE = 0
QUALITY = 1
SERVICE = 2
SIZE = 3
USABILITY = 4
# labelling function for Price
@labeling_function()
def lf_price(x):
    count = 0
    review=x.get(key='Review_Text')
    for elem in price_aspect_terms:
      if re.search('\\b'+elem+'\\b',review):
        count=count+1
    if count>=1:
        return PRICE
    else:
        return ABSTAIN

# labelling function for Quality
@labeling_function()
def lf_quality(x):
    count = 0
    review=x.get(key='Review_Text')
    for elem in quality_Aspect_terms:
      if re.search('\\b'+elem+'\\b',review):
        count=count+1
    if count>=1:
        return QUALITY
    else:
        return ABSTAIN

# labelling function for Service
@labeling_function()
def lf_service(x):
    count = 0
    review=x.get(key='Review_Text')
    for elem in service_aspect_terms:
      if re.search('\\b'+elem+'\\b',review):
        count=count+1
    if count>=1:
        return SERVICE
    else:
        return ABSTAIN

# labelling function for Size
@labeling_function()
def lf_size(x):
    count = 0
    review=x.get(key='Review_Text')
    for elem in size_aspect_terms:
      if re.search('\\b'+elem+'\\b',review):
        count=count+1
    if count>=1:
        return SIZE
    else:
        return ABSTAIN

# labelling function for Usability
@labeling_function()
def lf_usability(x):
    count = 0
    review=x.get(key='Review_Text')
    for elem in usability_aspect_terms:
      if re.search('\\b'+elem+'\\b',review):
        count=count+1
    if count>=1:
        return USABILITY
    else:
        return ABSTAIN
# Define the set of labeling functions (LFs)
lfs = [lf_price,lf_size,lf_service,lf_quality,lf_usability]

# Apply the LFs to the tagged training data and test using tagged test data
applier = PandasLFApplier(lfs)
L_train = applier.apply(train_tagged_data)
L_test = applier.apply(test_tagged_data)
## Evaluate Coverage on train set
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|██████████| 330/330 [00:01<00:00, 254.19it/s]
100%|██████████| 100/100 [00:00<00:00, 258.75it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_price,0,[0],0.348485,0.330303,0.330303
lf_size,1,[3],0.493939,0.454545,0.454545
lf_service,2,[2],0.387879,0.360606,0.360606
lf_quality,3,[1],0.463636,0.424242,0.424242
lf_usability,4,[4],0.681818,0.590909,0.590909


In [None]:
import snorkel
from sklearn import preprocessing
from sklearn.metrics import classification_report
print(snorkel.__version__)
le = preprocessing.LabelEncoder()

Y_ASPECT_CATEGORY = ['Price', 'Quality', 'Service', 'Size', 'Usability']

le.fit(['Price', 'Quality', 'Service', 'Size', 'Usability'])

# fit a MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit_transform(Y_ASPECT_CATEGORY)

# create a majority vote model and predict
majority_model = MajorityLabelVoter(cardinality=5)
predictions = majority_model.predict_proba(L=L_train)
df_multilabel = pd.DataFrame()
df_multilabel['predict_proba'] = predictions.tolist()

# get all the non zero indices which are the multi labels
df_multilabel['multi_labels'] = df_multilabel['predict_proba'].apply(lambda x: np.nonzero(x)[0])

#transform to mlb for classification report
df_multilabel['mlb_pred'] = df_multilabel['multi_labels'].apply(lambda x: mlb.transform([x])[0])

#convert to str in order to see how many multi labels did we gain
multi_label_string = df_multilabel.multi_labels.apply(lambda x: ", ".join(le.inverse_transform(x)))
print(multi_label_string.value_counts()[:50])

0.9.9
multi_labels
Size, Usability                             36
Usability                                   30
Quality, Usability                          27
Quality, Size, Usability                    18
Price, Usability                            16
Size                                        13
Quality                                     13
Service, Size                               12
Price, Quality, Size, Usability             12
Service, Size, Usability                    11
Price, Quality, Service, Usability          11
Quality, Service, Size, Usability           11
Price, Service, Size, Usability             10
Price, Service                              10
Quality, Service, Usability                 10
Price, Quality, Service, Size, Usability    10
Service                                      9
Quality, Size                                8
Price, Quality                               7
Quality, Service                             7
Service, Usability                       

In [None]:
train_tagged_data['Snorkel_Aspect_Category']=multi_label_string
train_tagged_data.head()

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category,Snorkel_Aspect_Category
0,1,peek a fun!: we got these as a gift for our on...,moves,NotFound,NotFound,NotFound,tolerate,"moves,tolerate",POSITIVE,"Quality,Usability","Service, Size"
1,1,great for the money: the picture on this camer...,NotFound,"money,price",NotFound,NotFound,NotFound,"money,price",POSITIVE,Price,"Quality, Size, Usability"
2,1,easy and worth the price.: these took about 2 ...,"install,","price,",NotFound,NotFound,NotFound,"install,price,$",POSITIVE,"Price,Usability","Size, Usability"
3,1,awsome!!: really a life saver when it comes to...,work,money,NotFound,NotFound,NotFound,"work,money",POSITIVE,"Price,Usability",Service
4,1,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","playing,using","cheap,",NotFound,box,plastic,"playing,using,cheap,$,box,plastic",POSITIVE,"Price,Quality,Service,Usability","Price, Usability"


In [None]:
train_data_array=train_tagged_data[Y_ASPECT_CATEGORY].to_numpy()
ytrain_true_aspect_category=np.where(train_data_array=="NotFound", 0, train_data_array)
ytrain_true_aspect_category=np.where(ytrain_true_aspect_category!=0, 1, ytrain_true_aspect_category)
ytrain_true_aspect_category=ytrain_true_aspect_category.astype(int)
ytrain_true_aspect_category[:3]

array([[0, 1, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 1]])

Snorkel predicted labels for train data

In [None]:
ytrain_snorkel_aspect_category=np.where(predictions!=0., 1, predictions)
ytrain_snorkel_aspect_category=ytrain_snorkel_aspect_category.astype(int)
ytrain_snorkel_aspect_category[:3]

array([[0, 1, 0, 1, 1],
       [1, 0, 0, 0, 0],
       [1, 0, 1, 0, 1]])

Predict the Aspect Category for the Test data using Snorkel

In [None]:
predictions = majority_model.predict_proba(L=L_test)
df_multilabel = pd.DataFrame()
df_multilabel['predict_proba'] = predictions.tolist()

# get all the non zero indices which are the multi labels
df_multilabel['multi_labels'] = df_multilabel['predict_proba'].apply(lambda x: np.nonzero(x)[0])

#transform to mlb for classification report
df_multilabel['mlb_pred'] = df_multilabel['multi_labels'].apply(lambda x: mlb.transform([x])[0])

#convert to str in order to see how many multi labels did we gain
multi_label_string = df_multilabel.multi_labels.apply(lambda x: ", ".join(le.inverse_transform(x)))
print(multi_label_string.value_counts()[:50])

multi_labels
Size, Usability                             14
Quality, Usability                           9
Price, Service, Usability                    7
Usability                                    7
Service, Size, Usability                     6
Quality, Size, Usability                     6
Quality                                      5
Service, Usability                           5
Price, Usability                             4
Quality, Service, Size, Usability            4
Price, Quality, Service, Usability           4
Quality, Service, Usability                  4
Service                                      3
Price                                        3
Price, Size, Usability                       3
Price, Quality, Service, Size, Usability     2
Price, Quality, Size, Usability              2
Service, Size                                2
Price, Quality, Service                      2
Price, Quality, Usability                    2
Size                                         1


In [None]:
test_tagged_data=test_tagged_data.reset_index(drop=True)
test_tagged_data['Snorkel_Aspect_Category']=multi_label_string
test_tagged_data.head()

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category,Snorkel_Aspect_Category
0,0,exercise outside: it was my mistake. i didn't ...,NotFound,NotFound,fit,"return,packing",NotFound,"fit,return,packing",NEGATIVE,"Service,Size","Service, Size"
1,1,very comfortable: it was a little difficult to...,comfortable,NotFound,sizing,NotFound,cotton,"comfortable,sizing,cotton",MIXED,"Quality,Size,Usability","Quality, Size, Usability"
2,1,a little bit big..: i love soffe shorts and we...,wear,NotFound,"big,bigger,size",NotFound,NotFound,"wear,big,bigger,size",POSITIVE,"Size,Usability","Size, Usability"
3,0,watch out for a fake!: if you are really serio...,NotFound,NotFound,NotFound,"fake,seller",NotFound,"$,fake,seller",NEGATIVE,Service,Service
4,1,perfection!: i've always used stant as replace...,NotFound,price,NotFound,NotFound,NotFound,price,POSITIVE,Price,"Price, Usability"


In [None]:
def result_metrics(true_labels, predictions):
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    print("Macro F1 Score:", macro_f1)
    macro_precision_score = precision_score(true_labels, predictions,average='macro')
    print("Macro Precision Score:",macro_precision_score)
    macro_recall_score = recall_score(true_labels, predictions,average='macro')
    print("Macro Recall Score:",macro_recall_score)

    micro_f1 = f1_score(true_labels, predictions, average='micro')
    print("Micro F1 Score:",micro_f1)
    micro_precision_score = precision_score(true_labels, predictions,average='micro')
    print("Micro Precision Score:",micro_precision_score)
    micro_recall_score = recall_score(true_labels, predictions,average='micro')
    print("Micro Recall Score:",micro_recall_score)

    hamLoss = hamming_loss(true_labels, predictions)
    print("Hamming Loss:",hamLoss)

Evaluate Snorkel labels for Train Data

In [None]:
result_metrics(ytrain_true_aspect_category,ytrain_snorkel_aspect_category)

Macro F1 Score: 0.9177541333862491
Macro Precision Score: 0.8546294727613638
Macro Recall Score: 0.9937059142702116
Micro F1 Score: 0.9160935350756534
Micro Precision Score: 0.8494897959183674
Micro Recall Score: 0.9940298507462687
Hamming Loss: 0.07393939393939394


Evaluate Snorkel labels for Test Data

In [None]:
test_data_array=test_tagged_data[Y_ASPECT_CATEGORY].to_numpy()
ytest_true_aspect_category=np.where(test_data_array=="NotFound", 0, test_data_array)
ytest_true_aspect_category=np.where(ytest_true_aspect_category!=0, 1, ytest_true_aspect_category)
ytest_true_aspect_category=ytest_true_aspect_category.astype(int)
ytest_true_aspect_category[:3]
ytest_snorkel_aspect_category=np.where(predictions!=0., 1, predictions)
ytest_snorkel_aspect_category=ytest_snorkel_aspect_category.astype(int)
ytest_snorkel_aspect_category[:3]
result_metrics(ytest_true_aspect_category,ytest_snorkel_aspect_category)

Macro F1 Score: 0.8604805485266913
Macro Precision Score: 0.7699049884224304
Macro Recall Score: 0.9870967741935484
Micro F1 Score: 0.8697674418604651
Micro Precision Score: 0.7759336099585062
Micro Recall Score: 0.9894179894179894
Hamming Loss: 0.112


Verify the Aspect Category predicted by SNORKEL for few customer reviews

In [None]:
print("Review from Customer : ",train_tagged_data["Review_Text"][0])
print("Aspect Terms : ",train_tagged_data["Aspect_Terms"][0])
print("Aspect Identified by SNORKEL : ",train_tagged_data["Snorkel_Aspect_Category"][0])

Review from Customer :  peek a fun!: we got these as a gift for our one year old. he loves them! he tries very hard to get all the little things out of the blocks - and obviously fails. he loves that some of them move, some of them jiggle, some of them make noise - its almost an adventure!the three year old also loves them because they incorporate the alphabet and he can easily identify the items inside: b - banana, p - penguin, u - umbrella, z - zebra.bonus - when the 1 y/o gets frustrated because he can't pry the items out of the block no matter how hard he tries, he usually flings them. the 3 y/o tends to use them to build walls or towers for the sole purpose of smashing another toy into them, or dropping them off high places. and i've stepped on two of them already. they seem to tolerate this treatment rather well.
Aspect Terms :  moves,tolerate  
Aspect Identified by SNORKEL :  Service, Size


In [None]:
print("Review from Customer : ",train_tagged_data["Review_Text"][2])
print("Aspect Terms : ",train_tagged_data["Aspect_Terms"][2])
print("Aspect Identified by SNORKEL : ",train_tagged_data["Snorkel_Aspect_Category"][2])

Review from Customer :  easy and worth the price.: these took about 2 minutes to install in my garage where the studs were easy to locate. it is fairly easy to mount the bike. you may want to rotate the tire after mounting so the spoke is not in contact with the hook. i got 2 from amazon for about $28 which was a good deal. they arrived a day or so earlier than projected. tire marks on my garage don't bother me much since it is an unfinished surface.
Aspect Terms :  install,price,$
Aspect Identified by SNORKEL :  Size, Usability


Use Sentiment Vader and Rating as a weak signal to learn the labels Positive, Negative, Mixed for sentiment classification

In [None]:
sid_obj = SentimentIntensityAnalyzer()

In [None]:
## Define constants for labelling
ABSTAIN = -1
POSITIVE = 1
NEGATIVE = 0
MIXED = 2

In [None]:
# Labelling function for Positive label
@labeling_function()
def lf_positive(x):
    rating=x.get(key = 'Rating')
    review=x.get(key='Review_Text')
    sentiment_dict = sid_obj.polarity_scores(review)
     # decide sentiment as positive
    if sentiment_dict['compound'] >=-0.05 and rating==1 :
        return POSITIVE
    else :
        return ABSTAIN

# Labelling function for Negative label
@labeling_function()
def lf_negative(x):
    rating=x.get(key = 'Rating')
    review=x.get(key='Review_Text')
    sentiment_dict = sid_obj.polarity_scores(review)
     # decide sentiment as negative
    if sentiment_dict['compound'] <= -0.05 and rating==0 :
        return NEGATIVE
    else :
        return ABSTAIN

# Labelling function for Mixed label
@labeling_function()
def lf_mixed(x):
    rating=x.get(key = 'Rating')
    review=x.get(key='Review_Text')
    sentiment_dict = sid_obj.polarity_scores(review)
     # decide sentiment as positive, negative and neutral
    if sentiment_dict['compound'] >=-0.05 and rating==0 :
        return MIXED
    elif sentiment_dict['compound'] <= -0.05 and rating==1 :
        return MIXED
    else :
        return ABSTAIN


In [None]:
lfs = [lf_positive,lf_negative,lf_mixed]

# Apply the LFs to the tagged training data and tagged testing data
applier = PandasLFApplier(lfs)
L_train = applier.apply(train_tagged_data)
L_test = applier.apply(test_tagged_data)

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, n_epochs=1000, log_freq=50)

100%|██████████| 330/330 [00:00<00:00, 390.02it/s]
100%|██████████| 100/100 [00:00<00:00, 404.03it/s]
100%|██████████| 1000/1000 [00:02<00:00, 438.08epoch/s]


In [None]:
## Evaluate Coverage on train set
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_positive,0,[1],0.333333,0.0,0.0
lf_negative,1,[0],0.330303,0.0,0.0
lf_mixed,2,[2],0.336364,0.0,0.0


In [None]:
train_tagged_data['Snorkel_Sentiment_Labels']=label_model.predict(L=L_train, tie_break_policy="abstain")

In [None]:
train_tagged_data['Snorkel_Sentiment_Labels'].replace(1,"positive",inplace=True)
train_tagged_data['Snorkel_Sentiment_Labels'].replace(0,"negative",inplace=True)
train_tagged_data['Snorkel_Sentiment_Labels'].replace(2,"neutral",inplace=True)

train_tagged_data['Rating'].replace(1,"POSITIVE",inplace=True)
train_tagged_data['Rating'].replace(0,"NEGATIVE",inplace=True)

Verify the sentiment labels predicted by Snorkel for few customer reviews

In [None]:
print("Review from Customer : ",train_tagged_data["Review_Text"][0])
print("Rating from Customer : ",train_tagged_data["Rating"][0])
print("Sentiment from SNORKEL : ",train_tagged_data["Snorkel_Sentiment_Labels"][0])

Review from Customer :  peek a fun!: we got these as a gift for our one year old. he loves them! he tries very hard to get all the little things out of the blocks - and obviously fails. he loves that some of them move, some of them jiggle, some of them make noise - its almost an adventure!the three year old also loves them because they incorporate the alphabet and he can easily identify the items inside: b - banana, p - penguin, u - umbrella, z - zebra.bonus - when the 1 y/o gets frustrated because he can't pry the items out of the block no matter how hard he tries, he usually flings them. the 3 y/o tends to use them to build walls or towers for the sole purpose of smashing another toy into them, or dropping them off high places. and i've stepped on two of them already. they seem to tolerate this treatment rather well.
Rating from Customer :  POSITIVE
Sentiment from SNORKEL :  positive


In [None]:
print("Review from Customer : ",train_tagged_data["Review_Text"][1])
print("Rating from Customer : ",train_tagged_data["Rating"][1])
print("Sentiment from SNORKEL : ",train_tagged_data["Snorkel_Sentiment_Labels"][1])

Review from Customer :  great for the money: the picture on this camera is good and it needs very little light. however, the colors are not quite as accurate as i had hoped for hi8. perhaps digital would have been better, but the price was a factor. i shopped around and found amazon had the best price available.
Rating from Customer :  POSITIVE
Sentiment from SNORKEL :  positive


In [None]:
print("Review from Customer : ",train_tagged_data["Review_Text"][4])
print("Rating from Customer : ",train_tagged_data["Rating"][4])
print("Sentiment from SNORKEL : ",train_tagged_data["Snorkel_Sentiment_Labels"][4])

Review from Customer :  "super" fun: rated for 3+ but my 2 and 3 yr. olds both play with this. while the kids like playing with this, i like that it can grow with them. it has basic tools for younger children: rolling pin, presser with three tips, pizza cutter, safety scissors, play-doh knife. but it also has tools/items for older ones: bead making, art molds for pictures and "sand".the only con is the mat. it's cheap plastic. sounded great on the box in the store, but ends up crinkled underneath them and not protecting the floor. stick with using and old sheet or vinyl table cloth from the $1 store.
Rating from Customer :  POSITIVE
Sentiment from SNORKEL :  positive


In [None]:
senti_train_data.columns

Index(['Rating', 'Review_Text', 'usability', 'price', 'size', 'service',
       'quality', 'durability', 'packing', 'Aspect_Category',
       'quality_sentiment', 'quality_score', 'price_sentiment', 'price_score',
       'size_sentiment', 'size_score', 'usability_sentiment',
       'usability_score', 'service_sentiment', 'service_score',
       'durability_sentiment', 'durability_score', 'packing_sentiment',
       'packing_score', 'overall_score', 'overall_sentiment'],
      dtype='object')

In [None]:
train_tagged_data.columns

Index(['Rating', 'Review_Text', 'Usability', 'Price', 'Size', 'Service',
       'Quality', 'Aspect_Terms', 'Aspect_Sentiment', 'Aspect_Category',
       'Snorkel_Aspect_Category', 'Snorkel_Sentiment_Labels'],
      dtype='object')

In [None]:
train_tagged_data.head(5)

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category,Snorkel_Aspect_Category,Snorkel_Sentiment_Labels
0,POSITIVE,peek a fun!: we got these as a gift for our on...,moves,NotFound,NotFound,NotFound,tolerate,"moves,tolerate",POSITIVE,"Quality,Usability","Quality, Size, Usability",positive
1,POSITIVE,great for the money: the picture on this camer...,NotFound,"money,price",NotFound,NotFound,NotFound,"money,price",POSITIVE,Price,Price,positive
2,POSITIVE,easy and worth the price.: these took about 2 ...,"install,","price,",NotFound,NotFound,NotFound,"install,price,$",POSITIVE,"Price,Usability","Price, Service, Usability",positive
3,POSITIVE,awsome!!: really a life saver when it comes to...,work,money,NotFound,NotFound,NotFound,"work,money",POSITIVE,"Price,Usability","Price, Usability",positive
4,POSITIVE,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","playing,using","cheap,",NotFound,box,plastic,"playing,using,cheap,$,box,plastic",POSITIVE,"Price,Quality,Service,Usability","Price, Quality, Service, Usability",positive


Compare two algorithm which uses sentiword and vadar

In [None]:
# Print the first few rows of `senti_train_data` and `train_tagged_data` for verification
print(senti_train_data[['overall_sentiment']].head())
print(train_tagged_data[['Snorkel_Sentiment_Labels']].head())

# Find rows where the values in `overall_sentiment` and `Snorkel_Sentiment_Labels` are different
differences = senti_train_data[senti_train_data['overall_sentiment'] != train_tagged_data['Snorkel_Sentiment_Labels']]

# Display the rows where differences exist
print(differences)


  overall_sentiment
0          positive
1           neutral
2           neutral
3           neutral
4          negative
  Snorkel_Sentiment_Labels
0                 positive
1                 positive
2                 positive
3                 positive
4                 positive
     Rating                                        Review_Text  \
1         1  great for the money: the picture on this camer...   
2         1  easy and worth the price.: these took about 2 ...   
3         1  awsome!!: really a life saver when it comes to...   
4         1  "super" fun: rated for 3+ but my 2 and 3 yr. o...   
5         1  oh!: this cereal is so sweet....yet so good fo...   
..      ...                                                ...   
317       0  did not charge my g4 unless the laptop was clo...   
318       0  this is a piece of junk: i bought this thinkin...   
322       0  even the large is too small for a child: i ord...   
323       1  haven't found better yet...: this is the only

In [None]:
# Calculate and print the number of differences
num_differences = (senti_train_data['overall_sentiment'] != train_tagged_data['Snorkel_Sentiment_Labels']).sum()
print("Number of differences:", num_differences)

Number of differences: 179


In [None]:
train_tagged_data.head(5)

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category,Snorkel_Aspect_Category,Snorkel_Sentiment_Labels
0,POSITIVE,peek a fun!: we got these as a gift for our on...,moves,NotFound,NotFound,NotFound,tolerate,"moves,tolerate",POSITIVE,"Quality,Usability","Quality, Size, Usability",positive
1,POSITIVE,great for the money: the picture on this camer...,NotFound,"money,price",NotFound,NotFound,NotFound,"money,price",POSITIVE,Price,Price,positive
2,POSITIVE,easy and worth the price.: these took about 2 ...,"install,","price,",NotFound,NotFound,NotFound,"install,price,$",POSITIVE,"Price,Usability","Price, Service, Usability",positive
3,POSITIVE,awsome!!: really a life saver when it comes to...,work,money,NotFound,NotFound,NotFound,"work,money",POSITIVE,"Price,Usability","Price, Usability",positive
4,POSITIVE,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","playing,using","cheap,",NotFound,box,plastic,"playing,using,cheap,$,box,plastic",POSITIVE,"Price,Quality,Service,Usability","Price, Quality, Service, Usability",positive


Total number of positive ,negative and neutral using sentiwordnet

In [None]:
positive_count_overall = senti_train_data['overall_sentiment'].value_counts().get('positive', 0)
negative_count_overall = senti_train_data['overall_sentiment'].value_counts().get('negative', 0)
neutral_count_overall = senti_train_data['overall_sentiment'].value_counts().get('neutral', 0)

print(f"Overall Sentiment counts - Positive: {positive_count_overall}, Negative: {negative_count_overall}, Neutral: {neutral_count_overall}")

Overall Sentiment counts - Positive: 29, Negative: 95, Neutral: 206


Total number of positive ,negative and neutral using sentiVadar Library

In [None]:
positive_count_overall = train_tagged_data['Snorkel_Sentiment_Labels'].value_counts().get('positive', 0)
negative_count_overall = train_tagged_data['Snorkel_Sentiment_Labels'].value_counts().get('negative', 0)
neutral_count_overall = train_tagged_data['Snorkel_Sentiment_Labels'].value_counts().get('neutral', 0)

print(f"Overall Sentiment counts - Positive: {positive_count_overall}, Negative: {negative_count_overall}, Neutral: {neutral_count_overall}")

Overall Sentiment counts - Positive: 110, Negative: 109, Neutral: 111


Evaluate both sentiment classification with Sentiment Vadar Classification as true value

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Calculate and print the number of differences
num_differences = (senti_train_data['overall_sentiment'] != train_tagged_data['Snorkel_Sentiment_Labels']).sum()
print("Number of differences:", num_differences)

# Extract the true and predicted labels
y_true = senti_train_data['overall_sentiment']
y_pred = train_tagged_data['Snorkel_Sentiment_Labels']

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Display detailed classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))


Number of differences: 179
Accuracy: 0.46
Precision: 0.58
Recall: 0.46
F1 Score: 0.47

Classification Report:
              precision    recall  f1-score   support

    negative       0.42      0.48      0.45        95
     neutral       0.69      0.37      0.49       206
    positive       0.25      0.97      0.40        29

    accuracy                           0.46       330
   macro avg       0.46      0.61      0.45       330
weighted avg       0.58      0.46      0.47       330



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score

# Assuming your DataFrame is called df

# Accuracy
accuracy = accuracy_score(train_tagged_data['Snorkel_Sentiment_Labels'], senti_train_data['overall_sentiment'])
print(f"Accuracy: {accuracy}")

# Confusion Matrix
print(train_tagged_data['Snorkel_Sentiment_Labels'].unique())
print(senti_train_data['overall_sentiment'].unique())
cm = confusion_matrix(train_tagged_data['Snorkel_Sentiment_Labels'], senti_train_data['overall_sentiment'], labels=['positive', 'negative', 'neutral'])
print("Confusion Matrix:")
print(cm)

# Cohen's Kappa
kappa = cohen_kappa_score(train_tagged_data['Snorkel_Sentiment_Labels'], senti_train_data['overall_sentiment'])
print(f"Cohen's Kappa: {kappa}")

Accuracy: 0.4575757575757576
['positive' 'negative' 'neutral']
['positive' 'neutral' 'negative']
Confusion Matrix:
[[28 16 66]
 [ 0 46 63]
 [ 1 33 77]]
Cohen's Kappa: 0.18511774200223485


In [None]:
five_aspect_keywords = {
    'usability': ['difficult', 'complicated','intuitive','useful',
 'functions',
 'wear',
 'pulled',
 'workout',
 'roll down',
 'rolled up down',
 'flexibility',
 'using',
 'useless',
 'work',
 'making',
 'function',
 'holding',
 'playing',
 'comfy',
 'fun',
 'uncomfortable',
 'taste',
 'tastes',
 'performance',
 'use',
 'moves',
 'works',
 'learning tool',
 'stacking',
 'knocking',
 'playtime',
 'spin',
 'twirl',
 'educational',
 'worked',
 'rotate',
 'used',
 'played',
 'boring',
 'portable',
 'comfortable',
 'worn',
 'play',
 'assemble',
 'drag',
 'absorbs',
 'pull',
 'wearing',
 'pushed',
 'flexible',
 'roll up',
 'wore',
 'squeezed',
 'hold',
 'feel',
 'holds',
 'bruising',
 'pushing',
 'working',
 'push',
 'put it on',
 'pulling',
 'hurts',
 'lifting',
 'put on',
 'does the job',
 'felt',
 'burned',
 'eating',
 'attention',
 'user friendly',
 'feeding',
 'warming',
 'disassemble',
 'functional',
 'effective',
 'assembly',
 'easy to put',
 'assembled',
 'operated',
 'scrubbed',
 'roll over',
 'flickering',
 'stopped',
 'stayed',
 'usage',
 'plugged',
 'stops working',
 'job',
 'functionality',
 'install',
 'uses',
 'turning',
 'feels',
 'programmed',
 'riding',
 'unstable',
 'design',
 'installation',
 'installed',
 'assembling',
 'installing',
 'designed',
 'usable',
 'User-friendly',
  'Intuitive',
'user friendly',
  'Easy to use',
  'Straightforward',
  'Simple interface',
  'Accessible',
  'Convenient',
  'Seamless experience',
  'Effortless',
  'Responsive',
  'Smooth navigation',
  'Learning curve',
  'Customizable',
  'Ergonomic',
 'Clear instructions',
  'Interactive',
  'Efficient',
 'Time-saving',
  'Functional',
'Cluttered'],
    'price': ['cost',
 'price',
 'inexpensive',
 'investment',
 'money',
 'penny',
 'pay',
 'cheap',
 'spent',
 'pricy',
 'priced',
 'expensive',
 'cheaper',
 'costs',
 'cheapest',
 'free',
 'paid',
 'dollar',
 'overpriced',
 'bucks','over priced',
 'pricing',
 'budget',
 'tax',
 'Money',
 '0',
 'expense',
 'costly',
 'fee',
'Affordable',
 'Expensive',
'Cheap',
'Budget-friendly','budget friendly',
 'Cost-effective',
'cost effective',
  'Overpriced',
'Reasonable',
 'Inexpensive',
  'Value for money',
 'Premium-priced',
'premium priced',
'Worth the price',
  'High-priced',
'high priced',
  'Economical',
'Competitive pricing',
'Fair price',
 'Low-cost','low cost',
 'Steep price',
 'Bargain',
 'Mid-range pricing',
'Exorbitant'],
    'size': ['size',
 'fits',
 'heavy',
 'sizes',
 'chart',
 'smaller',
'Compact design',
'Snug',
'Oversized',
'Mini',
'Clunky',
'Too small',
'Too large',
'perfectly',
'slim',
 'large',
 'feet',
 'big',
 'fit',
 'longer',
 'small',
 'tiny',
 'width',
 'thin',
 'taller',
 'tight',
 'small inch',
 'skinny',
 'hefty',
 'long',
 'xl',
 'length',
 'ft',
 'inches',
 'measurement',
 'streched',
 'medium',
'xlarge',
 'sized',
 'smaller size',
'Gigantic',
 'pound',
 'tall',
 'tightness',
 'bulky',
 'sizing',
 'measure',
 'shorter',
 'short',
 'tighter',
 'inch',
 'size chart',
 'xs',
 'high',
 'measured',
 'stouter',
 'wider',
 'x-large',
 'mediums',
 'bigger',
 'foot',
 'lower',
 'height',
 'lowering',
 'fitted',
 'higher',
 'lowered',
'Cumbersome',
 'larger'],
    'service': ['customer service', "support", "help", "assistance"'manual',
 'instructions',
 'contact',
 'seller',
 'shipping',
 'return',
 'arrived',
 'cardboard box',
 'box',
 'packaging',
 'packaged',
 'date',
 'contacted',
 'response',
 'refund',
 'apologized',
 'trust',
 'duplicate',
 'delivered',
 'advertisement',
 'instructional',
 'description',
 'policy',
 'unprofessional',
 'advertised',
 'pollicies',
 'replied',
 'fake',
 'company',
 'missing',
 'production',
 'consumers',
 'waiting',
 'warranty',
 'email',
 'advertising',
 'shipped',
 'misleading',
 'packed',
 'customer',
 'service',
 'replacing',
 'returns',
 'sent back',
 'advertized',
 'exchange',
 'package',
 'ship',
 'advertises',
 'comply',
 'contacting',
 'respond',
 'delivery',
 'dellivery',
 'warn',
 'described',
 'details',
 'miswire',
 'manufacturer',
 'tech staff',
 'mentioned',
 'lack',
 'faulty',
 'arrive',
 'repairable',
 'emails',
 'calls',
 'companies',
 'reply',
 'inquiries',
 'customer service',
 'receipt',
 'customers',
 'manufacture',
 'manufacturers',
 'apology',
 'receive',
 'mention',
 'condition',
 'unboxed',
 'misrepresented',
 'timely',
 'misunderstood',
 'specify',
 'packing',
 'reliable',
 'sealed',
 'refunded',
 'serviced',
 'refurbished',
 'reimbursement',
 'reported',
 'emailed',
 'shipment',
 'explained',
 'specs',
 'miss-leading','miss leading',
 'Responsive',
  'Unhelpful',
  'Friendly',
 'Rude',
 'Knowledgeable',
  'Incompetent',
' Polite',
'Prompt',
  'Slow',
'Efficient',
'Unresponsive',
'Fast',
'Delayed',
'On-time',
'Late',
'Efficient',
'Well-packaged',
' Damaged in transit',
'Safe',
 'Smooth',
 'Poor handling',
 'Excellent follow-up','excellent followup',
 'Delayed response',
'Poor after-care',
'Supportive',
'Neglectful',
'Warranty fulfillment',
'Difficult return process',
 'Great replacement service',
 'Transparent',
 'Lack of updates',
 'Frequent follow-ups',
' No contact'],
    'quality': ['high quality', 'poor quality', 'well-made', 'durable', 'cheap material','quality',
 'broken',
 'tore',
 'lasts',
 'inferior',
 'solid',
 'brass',
 'scraped',
 'smells',
 'delicate',
 'plastic',
 'stiff',
 'tolerate',
 'textured',
 'chinsy',
 'blunt',
 'sharp edges',
 'sharp',
 'waterproof',
 'soft',
 'smell',
 'smooths',
 'broke',
 'poor',
 'textureline',
 'fabric',
 'scratched',
 'metal',
 'smooth',
 'damage',
 'poorly',
 'flimsy',
 'weak',
 'blur',
 'stainless steel',
 'rubbery',
 'rubber',
 'material',
 'sturdy',
 'repair',
 'defective',
 'wrinkles',
 'smelled',
 'described',
 'last longer',
 'mark',
 'uneffected',
 'undamaged',
 'strong',
 'durable',
 'thick',
 'poorest',
 'damaged',
 'break',
 'thicker',
 'reliable',
 'low-grade',
 'tarnish',
 'breaks',
 'lasted',
 'leather',
 'conductive',
 'steady',
 'latex',
 'sleek',
 'weaker',
 'melted',
 'steel',
 'came off',
 'quallity',
 'cotton',
 'overheating',
 'lather',
 'rusted',
 'durability',
 'poor quality',
 'materials',
 'rubberized',
 'lasting',  'Flimsy',
  'Sturdy',
  'Weak',
  'Cheap',
  'Solid',
  'Fragile',
  'Heavy-duty',
 'high-quality',
  'substandard',
  'premium',
  'inferior',
  'top-notch',
  'poor-quality',
  'reliable',
  'cheap materials',
  'luxurious',
  'Wear-resistant','wear resistent',
  'Reliable',
  'Faulty',
  'Efficient',
  'Unreliable',
  'Smooth operation',
  'Problematic',
  'Consistent',
  'Malfunctioning',
  'High-performance','high performance',
  'High-quality','high quality',
  'Well-finished','well finished',
  'Low-quality performance',
  'Low-quality','low quality',
  'Rough',
  'Sleek',
  'Poor craftsmanship',
  'Polished',
  'Scratched',
  'Elegant',
  'Shoddy',
'Clean',
  'Breaks easily',
  'Wears out quickly',
  'Enduring',
  'Short lifespan',
  'Resilient',
  'Prone to damage',
  'Retains quality',
  'Fades quickly',
  'Maintains durability',
  'Works perfectly',
  'Defective',
  'Smooth functioning',
 'Prone to malfunction',
  'Glitchy',
  'Operational issues',
  'Performs as expected',
  'Faulty mechanism',
  'Seamless performance',
 'Unreliable performance'],
    'durability': [ 'Sturdy',
  'Solid build',
  'Well-constructed',
  'Tough',
  'Reinforced',
  'Reliable',
  'Long-lasting',
  'Flimsy',
  'Breaks easily',
  'Delicate',
  'Poorly made',
  'Brittle',
  'Weak materials',
  'Resistant to wear',
  'Scratch-resistant',
  'Tear-resistant',
  'Fades over time',
  'Worn out quickly',
  'Dents/scratches',
  'Holds up well',
  'Durable over time',
  'Short lifespan',
  'Long-lasting performance',
  'Fails after a few uses',
  'Survives heavy use',
  'Withstands daily wear',
    'High-quality materials',
  'Poor-quality'
  'components',
  'Heavy-duty',
  'Lightweight but durable',
  'Cheap materials',
'Durable finish'],
}

In [None]:
train_tagged_data.columns

Index(['Rating', 'Review_Text', 'Usability', 'Price', 'Size', 'Service',
       'Quality', 'Aspect_Terms', 'Aspect_Sentiment', 'Aspect_Category'],
      dtype='object')

In [None]:
new_senti_train_data = train_tagged_data.copy()
# Columns to keep
columns_to_keep = ['Rating', 'Review_Text','Aspect_Terms','Snorkel_Sentiment_Labels']

# Create a new DataFrame with only the required columns
new_senti_train_data = new_senti_train_data[columns_to_keep]
def extract_keywords(review, keywords):
    # For each keyword, check if it exists in the review and return matched keyword
    found_keywords = [keyword for keyword in keywords if re.search(rf"\b{keyword.lower()}\b", review.lower())]

    return ', '.join(found_keywords) if found_keywords else None

# Extract aspect terms and store them in respective columns
for aspect, keywords in five_aspect_keywords.items():
    # Apply the extraction function for each review
    new_senti_train_data[aspect] = new_senti_train_data['Review_Text'].apply(lambda review: extract_keywords(review, keywords))

# Display the DataFrame with extracted aspect terms
new_senti_train_data.head(5)


KeyError: "['Snorkel_Sentiment_Labels'] not in index"

In [None]:
# Select the desired columns
new_senti_train_data = new_senti_train_data[['Rating', 'Review_Text', 'usability', 'price', 'size', 'service', 'quality','Snorkel_Sentiment_Labels','Aspect_Terms']]
new_senti_train_data.columns
new_senti_train_data.to_csv('new_senti_train_data.csv', index=False)


In [None]:
print(files)
new_senti_train_data.columns

['base_paper1.pdf', 'archive', 'results', 'new_senti_train_data.csv', 'amazon_tagged_data.csv']


Index(['Rating', 'Review_Text', 'Usability', 'Price', 'Size', 'Service',
       'Quality', 'Aspect_Terms', 'Aspect_Sentiment', 'Aspect_Category'],
      dtype='object')

In [None]:
new_senti_train_data = pd.read_csv('/content/drive/My Drive/Proj_dataset/new_senti_train_data.csv',
                                  encoding='utf-8',  # or 'latin1', etc.
                                  sep=',')           # or ';', '\t', etc.

In [None]:
new_senti_train_data.head(3)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,Snorkel_Sentiment_Labels,Aspect_Terms
0,POSITIVE,peek a fun!: we got these as a gift for our on...,"fun, use",,high,,tolerate,positive,"moves,tolerate"
1,POSITIVE,great for the money: the picture on this camer...,,"price, money, Money",,,,positive,"money,price"
2,POSITIVE,easy and worth the price.: these took about 2 ...,"rotate, install","price, Worth the price",,"contact, arrived",,positive,"install,price,$"


Aspect Category

In [None]:
new_senti_train_data = new_senti_train_data.reset_index(drop=True)


# Initialize aspects with empty strings to match the length of data_filtered
aspects = [''] * len(new_senti_train_data)

for i in range(len(new_senti_train_data)):
    aspect_values = []
    for col in ['usability', 'price', 'size', 'service', 'quality']:
        if pd.notna(new_senti_train_data.loc[i, col]) and new_senti_train_data.loc[i, col] != "NotFound":
            aspect_values.append(col)

    if aspect_values:
        aspect_values.sort()
        asp = ','.join(aspect_values)
        aspects[i] = asp  # Update the aspects list at the correct index

new_senti_train_data['Aspect_Category'] = aspects
new_senti_train_data.head(5)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,Snorkel_Sentiment_Labels,Aspect_Terms,Aspect_Category
0,POSITIVE,peek a fun!: we got these as a gift for our on...,"fun, use",,high,,tolerate,positive,"moves,tolerate","quality,size,usability"
1,POSITIVE,great for the money: the picture on this camer...,,"price, money, Money",,,,positive,"money,price",price
2,POSITIVE,easy and worth the price.: these took about 2 ...,"rotate, install","price, Worth the price",,"contact, arrived",,positive,"install,price,$","price,service,usability"
3,POSITIVE,awsome!!: really a life saver when it comes to...,work,"money, Money",,,,positive,"work,money","price,usability"
4,POSITIVE,"""super"" fun: rated for 3+ but my 2 and 3 yr. o...","using, making, playing, fun, play","cheap, Cheap",,box,"plastic, Cheap",positive,"playing,using,cheap,$,box,plastic","price,quality,service,usability"


In [None]:
new_senti_train_data.columns

Index(['Rating', 'Review_Text', 'usability', 'price', 'size', 'service',
       'quality', 'Snorkel_Sentiment_Labels', 'Aspect_Terms',
       'Aspect_Category'],
      dtype='object')

In [None]:
new_senti_train_data.head(3)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,Snorkel_Sentiment_Labels,Aspect_Terms,Aspect_Category
0,POSITIVE,peek a fun!: we got these as a gift for our on...,"fun, use",,high,,tolerate,positive,"moves,tolerate","quality,size,usability"
1,POSITIVE,great for the money: the picture on this camer...,,"price, money, Money",,,,positive,"money,price",price
2,POSITIVE,easy and worth the price.: these took about 2 ...,"rotate, install","price, Worth the price",,"contact, arrived",,positive,"install,price,$","price,service,usability"


Load saved data

In [None]:
!pip install snorkel



In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from transformers import pipeline

# Download necessary NLTK resources
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Constants for Sentiment Labels in lowercase
POSITIVE = 'positive'
NEGATIVE = 'negative'
NEUTRAL = 'neutral'

# Initialize VADER and BERT sentiment analyzer
sia = SentimentIntensityAnalyzer()
sentiment_analyzer = pipeline("sentiment-analysis")


# Function to get WordNet POS tag for SentiWordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    return None

# SentiWordNet-based Sentiment Scoring
def get_sentiwordnet_score(word, pos_tag):
    synsets = list(swn.senti_synsets(word, pos_tag))
    if not synsets:
        return 0  # Neutral if no matching synset
    pos_score = sum([syn.pos_score() for syn in synsets]) / len(synsets)
    neg_score = sum([syn.neg_score() for syn in synsets]) / len(synsets)
    return pos_score - neg_score  # Positive - Negative score

# Combined Labeling Function with Transformer Scoring and Tuned Thresholds
def combined_aspect_review_sentiment(x):
    review = x.get('Review_Text', '')
    rating = x.get('Rating', '').lower()  # Use lowercase for consistency

    # Initialize list for aspect-specific scores
    aspect_scores = []

    # Check each aspect and match with keywords
    for aspect, keywords in five_aspect_keywords.items():
        if pd.isna(x.get(aspect)):
            continue

        tokens = word_tokenize(review)
        tagged_tokens = pos_tag(tokens)

        # Check for presence of aspect keywords and get SentiWordNet scores
        for keyword in keywords:
            if keyword.lower() in review.lower():
                for word, tag in tagged_tokens:
                    if keyword.lower() in word.lower():
                        wn_tag = get_wordnet_pos(tag)
                        if wn_tag:
                            score = get_sentiwordnet_score(word, wn_tag)
                            aspect_scores.append(score)

    # Calculate average SentiWordNet score if any aspect terms matched
    avg_sentiwordnet_score = np.mean(aspect_scores) if aspect_scores else 0

    # VADER sentiment score for review
    vader_score = sia.polarity_scores(review)['compound']

    # BERT sentiment score
    bert_result = sentiment_analyzer(review)[0]
    bert_score = 1 if bert_result['label'] == 'POSITIVE' else -1

    # Combined sentiment score: a weighted combination of SentiWordNet, VADER, and BERT scores
    combined_score = (avg_sentiwordnet_score * 0.45) + (vader_score * 0.3) + (bert_score * 0.25)

       # Ensemble-based decision-making
    if combined_score > 0.25 or (vader_score > 0.5 and bert_score == 1):
        return POSITIVE
    elif combined_score < -0.25 or (vader_score < -0.5 and bert_score == -1):
        return NEGATIVE
    else:
        return NEUTRAL  # Default to NEUTRAL if no conditions are met

# Apply Labeling to Dataset
def apply_labeling(data):
    data['sentiment_label'] = data.apply(combined_aspect_review_sentiment, axis=1)
    return data

# Example usage with new_senti_train_data DataFrame
# Ensure new_senti_train_data contains columns ['Review_Text', 'Rating', 'price', 'quality', 'size', 'service', 'usability']
new_senti_train_data = apply_labeling(new_senti_train_data)


# Display the DataFrame with the new sentiment labels
print(new_senti_train_data.head(3))




[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is avai

     Rating                                        Review_Text  \
0  POSITIVE  peek a fun!: we got these as a gift for our on...   
1  POSITIVE  great for the money: the picture on this camer...   
2  POSITIVE  easy and worth the price.: these took about 2 ...   

         usability                   price  size           service   quality  \
0         fun, use                     NaN  high               NaN  tolerate   
1              NaN     price, money, Money   NaN               NaN       NaN   
2  rotate, install  price, Worth the price   NaN  contact, arrived       NaN   

  Snorkel_Sentiment_Labels      Aspect_Terms          Aspect_Category  \
0                 positive  moves,tolerate     quality,size,usability   
1                 positive       money,price                    price   
2                 positive   install,price,$  price,service,usability   

  sentiment_label  
0         neutral  
1        positive  
2         neutral  


In [None]:
print('unique value of newalgo:',new_senti_train_data['sentiment_label'].unique())
new_senti_train_data['Snorkel_Sentiment_Labels'].unique()
print('unique of snorkel:',new_senti_train_data['Snorkel_Sentiment_Labels'].unique())

unique value of newalgo: ['neutral' 'positive' 'negative']
unique of snorkel: ['positive' 'negative' 'neutral']


In [None]:
 new_senti_train_data.columns

Index(['Rating', 'Review_Text', 'usability', 'price', 'size', 'service',
       'quality', 'Snorkel_Sentiment_Labels', 'Aspect_Terms',
       'Aspect_Category', 'sentiment_label'],
      dtype='object')

In [None]:
train_tagged_data.head(3)

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category
0,1,"I am loving the size of this shirt ,fits me well",NotFound,NotFound,"size,fits",NotFound,NotFound,"size,fits,",POSITIVE,Size
1,0,"The quality of this toy is very bad,it is broken",NotFound,NotFound,NotFound,NotFound,"quality,broken","quality,broken",NEGATIVE,Quality
2,0,"The manual instructions are very bad ,wish to ...",NotFound,NotFound,NotFound,"manual,instructions,contact,seller",NotFound,"manual,instructions,contact,seller",NEGATIVE,Service


In [None]:
train_tagged_data['Snorkel_Sentiment_Labels'].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [None]:
num_differences = (new_senti_train_data['Snorkel_Sentiment_Labels'] != new_senti_train_data['sentiment_label']).sum()
print("Number of differences:", num_differences)

Number of differences: 49


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Calculate and print the number of differences
num_differences = (new_senti_train_data['sentiment_label'] != new_senti_train_data['Snorkel_Sentiment_Labels']).sum()
print("Number of differences:", num_differences)

# Extract the true and predicted labels
y_true = new_senti_train_data['sentiment_label']
y_pred = new_senti_train_data['Snorkel_Sentiment_Labels']

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Display detailed classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

Number of differences: 49
Accuracy: 0.85
Precision: 0.86
Recall: 0.85
F1 Score: 0.85

Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.90      0.93       118
     neutral       0.80      0.77      0.79       115
    positive       0.78      0.89      0.83        97

    accuracy                           0.85       330
   macro avg       0.85      0.85      0.85       330
weighted avg       0.86      0.85      0.85       330



In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from transformers import pipeline

# Download necessary NLTK resources
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Constants for Sentiment Labels in lowercase
POSITIVE = 'positive'
NEGATIVE = 'negative'
NEUTRAL = 'neutral'

# Initialize VADER and BERT sentiment analyzer
sia = SentimentIntensityAnalyzer()
sentiment_analyzer = pipeline("sentiment-analysis")

# Function to get WordNet POS tag for SentiWordNet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    return None

# SentiWordNet-based Sentiment Scoring
def get_sentiwordnet_score(word, pos_tag):
    synsets = list(swn.senti_synsets(word, pos_tag))
    if not synsets:
        return 0  # Neutral if no matching synset
    pos_score = sum([syn.pos_score() for syn in synsets]) / len(synsets)
    neg_score = sum([syn.neg_score() for syn in synsets]) / len(synsets)
    return pos_score - neg_score  # Positive - Negative score

# Improved Combined Labeling Function with Re-weighting and Voting
def combined_aspect_review_sentiment(x):
    review = x.get('Review_Text', '')
    rating = x.get('Rating', '').lower()  # Use lowercase for consistency

    # Initialize list for aspect-specific scores
    aspect_scores = []

    # Check each aspect and match with keywords
    for aspect, keywords in five_aspect_keywords.items():
        if pd.isna(x.get(aspect)):
            continue

        tokens = word_tokenize(review)
        tagged_tokens = pos_tag(tokens)

        # Check for presence of aspect keywords and get SentiWordNet scores
        for keyword in keywords:
            if keyword.lower() in review.lower():
                for word, tag in tagged_tokens:
                    if keyword.lower() in word.lower():
                        wn_tag = get_wordnet_pos(tag)
                        if wn_tag:
                            score = get_sentiwordnet_score(word, wn_tag)
                            aspect_scores.append(score)

    # Average SentiWordNet score for aspect matches
    avg_sentiwordnet_score = np.mean(aspect_scores) if aspect_scores else 0

    # VADER sentiment score for review
    vader_score = sia.polarity_scores(review)['compound']

    # BERT sentiment score
    bert_result = sentiment_analyzer(review)[0]
    bert_score = 1 if bert_result['label'] == 'POSITIVE' else -1

    # Re-weighted combined sentiment score
    combined_score = (avg_sentiwordnet_score * 0.4) + (vader_score * 0.35) + (bert_score * 0.25)

    # Ensemble-based decision-making
    if combined_score > 0.25 or (vader_score > 0.5 and bert_score == 1):
        return POSITIVE
    elif combined_score < -0.25 or (vader_score < -0.5 and bert_score == -1):
        return NEGATIVE
    else:
        return NEUTRAL  # Default to NEUTRAL if no conditions are met

# Apply Labeling to Dataset
def apply_labeling(data):
    data['sentiment_label1'] = data.apply(combined_aspect_review_sentiment, axis=1)
    return data

# Example usage with new_senti_train_data DataFrame
new_senti_train_data = apply_labeling(new_senti_train_data)

# Display the DataFrame with the new sentiment labels
print(new_senti_train_data.head(3))


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is avai

     Rating                                        Review_Text  \
0  POSITIVE  peek a fun!: we got these as a gift for our on...   
1  POSITIVE  great for the money: the picture on this camer...   
2  POSITIVE  easy and worth the price.: these took about 2 ...   

         usability                   price  size           service   quality  \
0         fun, use                     NaN  high               NaN  tolerate   
1              NaN     price, money, Money   NaN               NaN       NaN   
2  rotate, install  price, Worth the price   NaN  contact, arrived       NaN   

  Snorkel_Sentiment_Labels      Aspect_Terms          Aspect_Category  \
0                 positive  moves,tolerate     quality,size,usability   
1                 positive       money,price                    price   
2                 positive   install,price,$  price,service,usability   

  sentiment_label sentiment_label1  
0         neutral          neutral  
1        positive         positive  
2         

In [None]:
num_differences = (train_tagged_data['Snorkel_Sentiment_Labels'] != new_senti_train_data['sentiment_label1']).sum()
print("Number of differences:", num_differences)

Number of differences: 51


In [None]:
num_differences = (new_senti_train_data['Snorkel_Sentiment_Labels'] != new_senti_train_data['sentiment_label1']).sum()
print("Number of differences:", num_differences)

Number of differences: 51


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Calculate and print the number of differences
num_differences = (new_senti_train_data['sentiment_label1'] != train_tagged_data['Snorkel_Sentiment_Labels']).sum()
print("Number of differences:", num_differences)

# Extract the true and predicted labels
y_true = new_senti_train_data['sentiment_label1']
y_pred = train_tagged_data['Snorkel_Sentiment_Labels']

# Calculate evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
# Cohen's Kappa
kappa = cohen_kappa_score(new_senti_train_data['sentiment_label1'], train_tagged_data['Snorkel_Sentiment_Labels'])
print(f"Cohen's Kappa: {kappa}")
# Display detailed classification report
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

Number of differences: 51
Accuracy: 0.85
Precision: 0.85
Recall: 0.85
F1 Score: 0.85
Cohen's Kappa: 0.7681945898297615

Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.90      0.93       118
     neutral       0.78      0.76      0.77       114
    positive       0.78      0.88      0.83        98

    accuracy                           0.85       330
   macro avg       0.85      0.85      0.84       330
weighted avg       0.85      0.85      0.85       330



In [None]:
new_senti_train_data.columns

Index(['Rating', 'Review_Text', 'usability', 'price', 'size', 'service',
       'quality', 'Snorkel_Sentiment_Labels', 'Aspect_Terms',
       'Aspect_Category'],
      dtype='object')

In [None]:
new_senti_train_data['Rating'] = new_senti_train_data['Rating'].map({'POSITIVE': 1, 'NEGATIVE': 0})


In [None]:
new_senti_train_data['Rating'].unique()
print(new_senti_train_data.head(3))

   Rating                                        Review_Text        usability  \
0       1  peek a fun!: we got these as a gift for our on...         fun, use   
1       1  great for the money: the picture on this camer...              NaN   
2       1  easy and worth the price.: these took about 2 ...  rotate, install   

                    price  size           service   quality  \
0                     NaN  high               NaN  tolerate   
1     price, money, Money   NaN               NaN       NaN   
2  price, Worth the price   NaN  contact, arrived       NaN   

  Snorkel_Sentiment_Labels      Aspect_Terms          Aspect_Category  
0                 positive  moves,tolerate     quality,size,usability  
1                 positive       money,price                    price  
2                 positive   install,price,$  price,service,usability  


Reinforcement learning for weight with respect to aspect terms

In [None]:
new_senti_train_data.head(3)

Unnamed: 0,Rating,Review_Text,usability,price,size,service,quality,Snorkel_Sentiment_Labels,Aspect_Terms,Aspect_Category
0,1,peek a fun!: we got these as a gift for our on...,"fun, use",,high,,tolerate,positive,"moves,tolerate","quality,size,usability"
1,1,great for the money: the picture on this camer...,,"price, money, Money",,,,positive,"money,price",price
2,1,easy and worth the price.: these took about 2 ...,"rotate, install","price, Worth the price",,"contact, arrived",,positive,"install,price,$","price,service,usability"


Bi-LSTM Model

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Constants
max_review_length = 100  # Maximum length of review text
vocab_size = 5000        # Vocabulary size
embedding_dim = 100      # Dimension of embedding layer
aspect_lexicon_size = 10 # Number of features for aspect terms
hidden_units = 64        # Number of hidden units in BiLSTM layer

reviews = new_senti_train_data['Review_Text']
aspect_terms = new_senti_train_data['Aspect_Terms']
ratings = new_senti_train_data['Rating']
labels = new_senti_train_data['Snorkel_Sentiment_Labels']

# Step 1: Tokenize the 'Review_Text' column
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(reviews)
X_reviews = tokenizer.texts_to_sequences(reviews)
X_reviews = pad_sequences(X_reviews, maxlen=max_review_length)

# Step 2: Tokenize 'Aspect_Terms' (if necessary, or use them as features directly)
tokenizer_aspects = Tokenizer(num_words=vocab_size)
tokenizer_aspects.fit_on_texts(aspect_terms)
X_aspects = tokenizer_aspects.texts_to_sequences(aspect_terms)
X_aspects = pad_sequences(X_aspects, maxlen=aspect_lexicon_size)

# Step 3: Encode the labels (Snorkel_Sentiment_Labels)
label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(labels)
y_labels = tf.keras.utils.to_categorical(y_labels, num_classes=3)  # If 3 classes: positive, negative, neutral

# Step 4: Define Inputs for the BiLSTM Model
review_text_input = Input(shape=(max_review_length,), dtype='int32', name="review_text")
aspect_lexicon_input = Input(shape=(aspect_lexicon_size,), name="aspect_lexicon")
rating_input = Input(shape=(1,), name="rating")

# Embedding layer for review text
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_review_length)(review_text_input)

# BiLSTM layer
lstm_out = Bidirectional(LSTM(units=hidden_units, return_sequences=False))(embedding)

# Concatenate BiLSTM output with aspect lexicon and rating inputs
concatenated = Concatenate()([lstm_out, aspect_lexicon_input, rating_input])

# Dense layer for final sentiment classification
dense_output = Dense(units=3, activation='softmax')(concatenated)  # 3 classes for sentiment (positive, negative, neutral)

# Define model
model = Model(inputs=[review_text_input, aspect_lexicon_input, rating_input], outputs=dense_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping
model.fit([X_reviews, X_aspects, ratings], y_labels, epochs=15, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Model Summary
model.summary()





Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 review_text (InputLayer)    [(None, 100)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 100, 100)             500000    ['review_text[0][0]']         
                                                                                                  
 bidirectional_1 (Bidirecti  (None, 128)                  84480     ['embedding_1[0][0]']         
 onal)                                                                                            
                    

# TRANSFORMER BASED MODEL

In [None]:
pip install tensorflow transformers




Tensorflow model

In [None]:
test_tagged_data.head(5)

Unnamed: 0,Rating,Review_Text,Usability,Price,Size,Service,Quality,Aspect_Terms,Aspect_Sentiment,Aspect_Category
400,0,exercise outside: it was my mistake. i didn't ...,NotFound,NotFound,fit,"return,packing",NotFound,"fit,return,packing",NEGATIVE,"Service,Size"
401,1,very comfortable: it was a little difficult to...,comfortable,NotFound,sizing,NotFound,cotton,"comfortable,sizing,cotton",MIXED,"Quality,Size,Usability"
402,1,a little bit big..: i love soffe shorts and we...,wear,NotFound,"big,bigger,size",NotFound,NotFound,"wear,big,bigger,size",POSITIVE,"Size,Usability"
403,0,watch out for a fake!: if you are really serio...,NotFound,NotFound,NotFound,"fake,seller",NotFound,"$,fake,seller",NEGATIVE,Service
404,1,perfection!: i've always used stant as replace...,NotFound,price,NotFound,NotFound,NotFound,price,POSITIVE,Price


In [None]:
import numpy as np
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder

# Constants
max_review_length = 100  # Maximum length for BERT tokenizer
aspect_lexicon_size = 10  # Max length for aspect terms (if tokenized separately)
bert_model_name = "bert-base-uncased"

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = TFBertModel.from_pretrained(bert_model_name)

# Dataset columns
reviews = new_senti_train_data['Review_Text']
aspect_terms = new_senti_train_data['Aspect_Terms']
aspect_category = new_senti_train_data['Aspect_Category']
ratings = new_senti_train_data['Rating']  # Binary ratings
labels = new_senti_train_data['Snorkel_Sentiment_Labels']

# Step 1: Tokenize `Review_Text` for BERT
encoded_reviews = tokenizer(
    list(reviews),
    max_length=max_review_length,
    padding='max_length',
    truncation=True,
    return_tensors="tf"
)

# Step 2: Tokenize or Encode `Aspect_Terms`
aspect_terms_input = tokenizer(
    list(aspect_terms),
    max_length=aspect_lexicon_size,
    padding='max_length',
    truncation=True,
    return_tensors="tf"
)['input_ids']  # Use input IDs for aspect terms

# Step 3: Encode `Aspect_Category`
label_encoder_category = LabelEncoder()
aspect_category_encoded = label_encoder_category.fit_transform(aspect_category)

# Step 4: Encode Labels (`Snorkel_Sentiment_Labels`)
label_encoder_sentiment = LabelEncoder()
y_labels = label_encoder_sentiment.fit_transform(labels)
y_labels = tf.keras.utils.to_categorical(y_labels, num_classes=3)  # Positive, Neutral, Negative

# Step 5: Process `Ratings`
ratings = np.array(ratings).reshape(-1, 1)  # Reshape for input layer

# Inputs for the Model
review_input = Input(shape=(max_review_length,), dtype=tf.int32, name="review_input")
aspect_terms_input_layer = Input(shape=(aspect_lexicon_size,), dtype=tf.int32, name="aspect_terms_input")
aspect_category_input = Input(shape=(1,), dtype=tf.int32, name="aspect_category_input")
ratings_input = Input(shape=(1,), dtype=tf.float32, name="ratings_input")

# BERT Embeddings for `Review_Text`
bert_output = bert_model(review_input).pooler_output  # Use pooled output for classification

# Dense Layers for `Aspect_Terms` and `Aspect_Category`
aspect_terms_dense = Dense(32, activation='relu')(Flatten()(aspect_terms_input_layer))
aspect_category_dense = Dense(16, activation='relu')(aspect_category_input)

# Combine Features
concatenated = Concatenate()([bert_output, aspect_terms_dense, aspect_category_dense, ratings_input])

# Classification Head
dense_output = Dense(64, activation='relu')(concatenated)
final_output = Dense(3, activation='softmax', name="sentiment_output")(dense_output)

# Define Model
model = Model(
    inputs=[review_input, aspect_terms_input_layer, aspect_category_input, ratings_input],
    outputs=final_output
)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the Model
history = model.fit(
    [encoded_reviews['input_ids'], aspect_terms_input, aspect_category_encoded, ratings],
    y_labels,
    epochs=3,
    batch_size=16,
    validation_split=0.2
)

# Model Summary
model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

TypeError: Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).

Could not build a TypeSpec for name: "tf.debugging.assert_less_1/assert_less/Assert/Assert"
op: "Assert"
input: "tf.debugging.assert_less_1/assert_less/All"
input: "tf.debugging.assert_less_1/assert_less/Assert/Assert/data_0"
input: "tf.debugging.assert_less_1/assert_less/Assert/Assert/data_1"
input: "tf.debugging.assert_less_1/assert_less/Assert/Assert/data_2"
input: "Placeholder"
input: "tf.debugging.assert_less_1/assert_less/Assert/Assert/data_4"
input: "tf.debugging.assert_less_1/assert_less/y"
attr {
  key: "summarize"
  value {
    i: 3
  }
}
attr {
  key: "T"
  value {
    list {
      type: DT_STRING
      type: DT_STRING
      type: DT_STRING
      type: DT_INT32
      type: DT_STRING
      type: DT_INT32
    }
  }
}
 of unsupported type <class 'tensorflow.python.framework.ops.Operation'>.

Call arguments received by layer 'embeddings' (type TFBertEmbeddings):
  • input_ids=<KerasTensor: shape=(None, 100) dtype=int32 (created by layer 'review_input')>
  • position_ids=None
  • token_type_ids=<KerasTensor: shape=(None, 100) dtype=int32 (created by layer 'tf.fill_2')>
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

PREPROCESSING

Compile and train

Evaluation