# VADER

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('vader_lexicon')
%matplotlib inline
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer

warnings.filterwarnings("ignore")

import os

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [75]:
vader_df = pd.read_csv('lUDozITit6w_Hfiltered.csv', error_bad_lines=False)

In [76]:
vader_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,pixel a at after card discount so are you in...,309718_1
1,1,i am in group looking forward for sale price,965018_1
2,2,camera bar is not aluminium,20467_1
3,3,can somebody tell me if this phone has heating...,835314_1
4,4,group,741056_1
...,...,...,...
1016,1016,so you upload videos late night,410410_1
1017,1017,yo,812443_1
1018,1018,first,837435_1
1019,1019,first,140084_1


In [77]:
vader_df['Comments'] = vader_df['Comments'].astype(str)
vader_df['Comments'] = vader_df['Comments'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
vader_df['Comments'] = vader_df['Comments'].apply(lambda x:x.lower())

tokenized_tweet = vader_df['Comments'].apply(lambda x: x.split())

wnl = WordNetLemmatizer()

tokenized_tweet.apply(lambda x: [wnl.lemmatize(i) for i in x if i not in set(stopwords.words('english'))]) 
tokenized_tweet.head()

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    
    
vader_df['Comments'] = tokenized_tweet

sia = SentimentIntensityAnalyzer()
vader_df['Sentiment Scores'] = vader_df['Comments'].apply(lambda x:sia.polarity_scores(x)['compound'])
vader_df['Sentiment'] = vader_df['Sentiment Scores'].apply(lambda s : 'Positive' if s > 0 else ('Neutral' if s == 0 else 'Negative'))

In [78]:
vader_df.Sentiment.value_counts()

Neutral     531
Positive    359
Negative    131
Name: Sentiment, dtype: int64

In [79]:
vader_percentages = vader_df['Sentiment'].value_counts(normalize=True) * 100
vader_percentages

Neutral     52.007835
Positive    35.161606
Negative    12.830558
Name: Sentiment, dtype: float64

In [80]:
vader_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel after card discount group group,309718_1,0.0,Neutral
1,1,group looking forward sale price,965018_1,0.0,Neutral
2,2,camera aluminium,20467_1,0.0,Neutral
3,3,somebody tell this phone heating issues,835314_1,0.0,Neutral
4,4,group,741056_1,0.0,Neutral
...,...,...,...,...,...
1016,1016,upload videos late night,410410_1,0.0,Neutral
1017,1017,,812443_1,0.0,Neutral
1018,1018,first,837435_1,0.0,Neutral
1019,1019,first,140084_1,0.0,Neutral


# Text Blob

In [81]:
from textblob import TextBlob
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

textBlob_df = pd.read_csv('lUDozITit6w_Hfiltered.csv')
textBlob_df['Comments'] = textBlob_df['Comments'].astype(str)
textBlob_df['Sentiment Scores'] = ''
textBlob_df['Sentiment'] = ''

def preprocess_text(text):
    text = text.lower()    
    text = text.translate(str.maketrans('', '', string.punctuation))    
    tokens = text.split()

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

for index, row in textBlob_df.iterrows():
    comment = row['Comments']    
    preprocessed_comment = preprocess_text(comment)

    blob = TextBlob(preprocessed_comment)
    polarity = blob.sentiment.polarity

    textBlob_df.at[index, 'Sentiment Scores'] = polarity

    if polarity > 0:
        textBlob_df.at[index, 'Sentiment'] = 'Positive'
    elif polarity < 0:
        textBlob_df.at[index, 'Sentiment'] = 'Negative'
    else:
        textBlob_df.at[index, 'Sentiment'] = 'Neutral'


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
textBlob_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel a at after card discount so are you in...,309718_1,0.0,Neutral
1,1,i am in group looking forward for sale price,965018_1,0.0,Neutral
2,2,camera bar is not aluminium,20467_1,0.0,Neutral
3,3,can somebody tell me if this phone has heating...,835314_1,0.0,Neutral
4,4,group,741056_1,0.0,Neutral
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,-0.3,Negative
1017,1017,yo,812443_1,0.0,Neutral
1018,1018,first,837435_1,0.25,Positive
1019,1019,first,140084_1,0.25,Positive


In [83]:
textBlob_df.Sentiment.value_counts()

Neutral     587
Positive    325
Negative    109
Name: Sentiment, dtype: int64

In [84]:
textBlob_percentages = textBlob_df['Sentiment'].value_counts(normalize=True) * 100
textBlob_percentages

Neutral     57.492654
Positive    31.831538
Negative    10.675808
Name: Sentiment, dtype: float64

# AFINN Model

In [85]:
import pandas as pd
from afinn import Afinn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

afinn_df = pd.read_csv("lUDozITit6w_Hfiltered.csv")

afinn_df['Comments'] = afinn_df['Comments'].astype(str)
afinn = Afinn()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

def get_sentiment(text):
    sentiment_score = afinn.score(text)
    if sentiment_score > 0:
        return "Positive"
    elif sentiment_score < 0:
        return "Negative"
    else:
        return "Neutral"

afinn_df["Sentiment"] = afinn_df["Comments"].apply(preprocess_text).apply(get_sentiment)
afinn_df["Sentiment Scores"] = afinn_df["Comments"].apply(preprocess_text).apply(afinn.score)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
afinn_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment,Sentiment Scores
0,0,pixel a at after card discount so are you in...,309718_1,Neutral,0.0
1,1,i am in group looking forward for sale price,965018_1,Neutral,0.0
2,2,camera bar is not aluminium,20467_1,Neutral,0.0
3,3,can somebody tell me if this phone has heating...,835314_1,Neutral,0.0
4,4,group,741056_1,Neutral,0.0
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,Neutral,0.0
1017,1017,yo,812443_1,Neutral,0.0
1018,1018,first,837435_1,Neutral,0.0
1019,1019,first,140084_1,Neutral,0.0


In [87]:
afinn_df.Sentiment.value_counts()

Neutral     528
Positive    354
Negative    139
Name: Sentiment, dtype: int64

In [88]:
afinn_percentages = afinn_df['Sentiment'].value_counts(normalize=True) * 100
afinn_percentages

Neutral     51.714006
Positive    34.671890
Negative    13.614104
Name: Sentiment, dtype: float64

# RoBERTa Model New

In [147]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import string


def preprocess_text(text):
    text = text.lower()    
    text = text.translate(str.maketrans('', '', string.punctuation))    
    tokens = text.split()

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

task = 'sentiment'
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

robert_df = pd.read_csv("lUDozITit6w_Hfiltered.csv")
robert_df['Comments'] = robert_df['Comments'].astype(str)
robert_df['Sentiment Scores'] = ''
robert_df['Sentiment'] = ''

for index, row in robert_df.iterrows():
    comment = row['Comments']
    preprocessed_comment = preprocess_text(comment)

    encoded_input = tokenizer(preprocessed_comment, return_tensors='pt')
    output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)

    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        if i == 0:
            robert_df.at[index, 'Sentiment Scores'] = np.round(float(s), 4)
            robert_df.at[index, 'Sentiment'] = l.title()
            break

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [148]:
robert_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel a at after card discount so are you in...,309718_1,0.6674,Neutral
1,1,i am in group looking forward for sale price,965018_1,0.6605,Neutral
2,2,camera bar is not aluminium,20467_1,0.7872,Neutral
3,3,can somebody tell me if this phone has heating...,835314_1,0.5455,Negative
4,4,group,741056_1,0.567,Neutral
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,0.8773,Neutral
1017,1017,yo,812443_1,0.4953,Neutral
1018,1018,first,837435_1,0.5086,Neutral
1019,1019,first,140084_1,0.5086,Neutral


In [149]:
robert_df.Sentiment.value_counts()

Neutral     685
Positive    174
Negative    162
Name: Sentiment, dtype: int64

In [150]:
robert_percentages = robert_df['Sentiment'].value_counts(normalize=True) * 100
robert_percentages

Neutral     67.091087
Positive    17.042116
Negative    15.866797
Name: Sentiment, dtype: float64

# RoBERTa Model Old

In [184]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import pandas as pd
import string


def preprocess_text(text):
    text = text.lower()    
    text = text.translate(str.maketrans('', '', string.punctuation))    
    tokens = text.split()

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

task = 'sentiment'
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

robertold_df = pd.read_csv("lUDozITit6w_Hfiltered.csv")
robertold_df['Comments'] = robertold_df['Comments'].astype(str)
robertold_df['Sentiment Scores'] = ''
robertold_df['Sentiment'] = ''

for index, row in robert_df.iterrows():
    comment = row['Comments']
    preprocessed_comment = preprocess_text(comment)

    encoded_input = tokenizer(preprocessed_comment, return_tensors='pt')
    output = model(**encoded_input)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)

    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        if i == 0:
            robertold_df.at[index, 'Sentiment Scores'] = np.round(float(s), 4)
            robertold_df.at[index, 'Sentiment'] = l.title()
            break

In [185]:
robertold_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel a at after card discount so are you in...,309718_1,0.7834,Neutral
1,1,i am in group looking forward for sale price,965018_1,0.7834,Neutral
2,2,camera bar is not aluminium,20467_1,0.7834,Neutral
3,3,can somebody tell me if this phone has heating...,835314_1,0.7834,Neutral
4,4,group,741056_1,0.7834,Neutral
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,0.7834,Neutral
1017,1017,yo,812443_1,0.7834,Neutral
1018,1018,first,837435_1,0.7834,Neutral
1019,1019,first,140084_1,0.7834,Neutral


In [186]:
robertold_df.Sentiment.value_counts()

Neutral    1021
Name: Sentiment, dtype: int64

In [187]:
robert_percentages = robertold_df['Sentiment'].value_counts(normalize=True) * 100
robert_percentages

Neutral    100.0
Name: Sentiment, dtype: float64

# Voting and Finalising +ve,-ve and neutral comments

In [170]:
comments = pd.read_csv('lUDozITit6w_Hfiltered.csv', error_bad_lines=False)
comment_ids = list(comments['Comment ID'].to_list())

In [171]:
vote_df = pd.DataFrame()
for comment_id in comment_ids:
    
    pos_count = 0
    neg_count = 0
    neu_count = 0
    
    vader_sentimet_pred = vader_df.loc[vader_df['Comment ID'] == comment_id, 'Sentiment'].values 
    pos_count += sum(vader_sentimet_pred == 'Positive')
    neg_count += sum(vader_sentimet_pred == 'Negative')
    neu_count += sum(vader_sentimet_pred == 'Neutral')
    
    textblob_sentimet_pred = textBlob_df.loc[textBlob_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(textblob_sentimet_pred == 'Positive')
    neg_count += sum(textblob_sentimet_pred == 'Negative')
    neu_count += sum(textblob_sentimet_pred == 'Neutral')
    
    afinn_sentimet_pred = afinn_df.loc[afinn_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(afinn_sentimet_pred == 'Positive')
    neg_count += sum(afinn_sentimet_pred == 'Negative')
    neu_count += sum(afinn_sentimet_pred == 'Neutral')
    
    robert_sentimet_pred = robert_df.loc[robert_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(robert_sentimet_pred == 'Positive')
    neg_count += sum(robert_sentimet_pred == 'Negative')
    neu_count += sum(robert_sentimet_pred == 'Neutral')
    
    robertold_sentimet_pred = robertold_df.loc[robertold_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(robertold_sentimet_pred == 'Positive')
    neg_count += sum(robertold_sentimet_pred == 'Negative')
    neu_count += sum(robertold_sentimet_pred == 'Neutral')
    
    vote_df = vote_df.append({'Comment ID': comment_id, 'Positive': pos_count, 'Negative': neg_count, 'Neutral': neu_count}, ignore_index=True)

vote_df = vote_df.fillna(0)

vote_df['Positive'] = vote_df['Positive'].astype(int)
vote_df['Negative'] = vote_df['Negative'].astype(int)
vote_df['Neutral'] = vote_df['Neutral'].astype(int)


In [172]:
vote_df

Unnamed: 0,Comment ID,Positive,Negative,Neutral
0,309718_1,0,0,5
1,965018_1,1,0,4
2,20467_1,0,0,5
3,835314_1,0,2,3
4,741056_1,0,0,5
...,...,...,...,...
1016,410410_1,0,1,4
1017,812443_1,0,0,5
1018,837435_1,1,0,4
1019,140084_1,1,0,4


In [173]:
max_sentiment = vote_df[['Positive', 'Negative', 'Neutral']].idxmax(axis=1)

# Split the dataframe into separate dataframes based on the maximum sentiment
positive_df = vote_df[max_sentiment == 'Positive'][['Comment ID']]
negative_df = vote_df[max_sentiment == 'Negative'][['Comment ID']]
neutral_df = vote_df[max_sentiment == 'Neutral'][['Comment ID']]

In [174]:
positive_df

Unnamed: 0,Comment ID
6,143282_1
12,257319_1
14,324978_1
16,229423_1
21,537090_1
...,...
1005,281460_1
1007,435537_1
1008,741909_1
1012,145288_1


In [175]:
neutral_df

Unnamed: 0,Comment ID
0,309718_1
1,965018_1
2,20467_1
3,835314_1
4,741056_1
...,...
1016,410410_1
1017,812443_1
1018,837435_1
1019,140084_1


In [176]:
negative_df

Unnamed: 0,Comment ID
10,111840_1
22,580487_1
25,881486_1
26,381478_1
27,632635_1
...,...
960,219172_1
962,845508_1
979,308918_1
982,424473_1


# Mapping 

In [177]:
master = pd.read_csv("HighLevel_lUDozITit6w_master.csv", encoding='utf-8')

In [178]:
master

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",309718_1
1,1,I am in group 1.5 Looking forward for sale price,965018_1
2,2,Camera bar is not aluminium,20467_1
3,3,Can somebody tell me if this phone has heating...,835314_1
4,4,Group 2,741056_1
...,...,...,...
1016,1016,So you upload videos late night,410410_1
1017,1017,Yo 1,812443_1
1018,1018,First,837435_1
1019,1019,First,140084_1


In [179]:
postive_comments = pd.merge(master, positive_df, on='Comment ID', how='inner')
postive_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,6,Because it's not true.,143282_1
1,12,I wanna buy a new phone. But my vivo that i bo...,257319_1
2,14,costs at least 15k more than my current almost...,324978_1
3,16,Here im waiting for BBD untill then i am lovin...,229423_1
4,21,"I would buy it in BBD , honestly now I see no ...",537090_1
...,...,...,...
272,1005,Nice video sir,281460_1
273,1007,Beebom also fixed their videos and are finally...,435537_1
274,1008,Nice improvement 😸..but little expensive,741909_1
275,1012,Pixel is perfect ❤,145288_1


In [180]:
negative_comments = pd.merge(master, negative_df, on='Comment ID', how='inner')
negative_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,10,"Hey Rupesh, \r\nI own a OnePlus 8t and had a v...",111840_1
1,22,5:29 wireless charging is limited to 7.5 watts...,580487_1
2,25,"ghanta still heating issue, bad performanc, ba...",881486_1
3,26,Shame no SD card slot 💳 😭,381478_1
4,27,"Overheating phone guys , don't believe these b...",632635_1
...,...,...,...
97,960,Pixel 7 is already at 44k on Amazon 🤣\nWhat is...,219172_1
98,962,I would be stupid big time to buy this at 45k ...,845508_1
99,979,"""Google is the worst company on Earth at keepi...",308918_1
100,982,What is this stupid obsession with glass backs...,424473_1


In [181]:
neutral_comments = pd.merge(master, neutral_df, on='Comment ID', how='inner')
neutral_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",309718_1
1,1,I am in group 1.5 Looking forward for sale price,965018_1
2,2,Camera bar is not aluminium,20467_1
3,3,Can somebody tell me if this phone has heating...,835314_1
4,4,Group 2,741056_1
...,...,...,...
637,1016,So you upload videos late night,410410_1
638,1017,Yo 1,812443_1
639,1018,First,837435_1
640,1019,First,140084_1


In [182]:
import pandas as pd


sentiment_df = vote_df[['Comment ID', 'Positive', 'Negative', 'Neutral']].copy()
sentiment_df['Sentiment'] = sentiment_df[['Positive', 'Negative', 'Neutral']].idxmax(axis=1)

merged_df = pd.merge(master, sentiment_df, left_on='Comment ID', right_on='Comment ID', how='left')
new_df = merged_df[['Comment ID', 'Comments', 'Sentiment']].copy()


In [183]:
new_df.Sentiment.value_counts()

Neutral     642
Positive    277
Negative    102
Name: Sentiment, dtype: int64