# VADER

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('vader_lexicon')
%matplotlib inline
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer

warnings.filterwarnings("ignore")

import os

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [71]:
vader_df = pd.read_csv('lUDozITit6w_Hfiltered.csv', error_bad_lines=False)

In [72]:
vader_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,pixel a at after card discount so are you in...,309718_1
1,1,i am in group looking forward for sale price,965018_1
2,2,camera bar is not aluminium,20467_1
3,3,can somebody tell me if this phone has heating...,835314_1
4,4,group,741056_1
...,...,...,...
1016,1016,so you upload videos late night,410410_1
1017,1017,yo,812443_1
1018,1018,first,837435_1
1019,1019,first,140084_1


In [73]:
vader_df['Comments'] = vader_df['Comments'].astype(str)
vader_df['Comments'] = vader_df['Comments'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
vader_df['Comments'] = vader_df['Comments'].apply(lambda x:x.lower())

tokenized_tweet = vader_df['Comments'].apply(lambda x: x.split())

wnl = WordNetLemmatizer()

tokenized_tweet.apply(lambda x: [wnl.lemmatize(i) for i in x if i not in set(stopwords.words('english'))]) 
tokenized_tweet.head()

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    
    
vader_df['Comments'] = tokenized_tweet

sia = SentimentIntensityAnalyzer()
vader_df['Sentiment Scores'] = vader_df['Comments'].apply(lambda x:sia.polarity_scores(x)['compound'])
vader_df['Sentiment'] = vader_df['Sentiment Scores'].apply(lambda s : 'Positive' if s > 0 else ('Neutral' if s == 0 else 'Negative'))

In [74]:
vader_df.Sentiment.value_counts()

Neutral     531
Positive    359
Negative    131
Name: Sentiment, dtype: int64

In [75]:
vader_percentages = vader_df['Sentiment'].value_counts(normalize=True) * 100
vader_percentages

Neutral     52.007835
Positive    35.161606
Negative    12.830558
Name: Sentiment, dtype: float64

In [76]:
vader_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel after card discount group group,309718_1,0.0,Neutral
1,1,group looking forward sale price,965018_1,0.0,Neutral
2,2,camera aluminium,20467_1,0.0,Neutral
3,3,somebody tell this phone heating issues,835314_1,0.0,Neutral
4,4,group,741056_1,0.0,Neutral
...,...,...,...,...,...
1016,1016,upload videos late night,410410_1,0.0,Neutral
1017,1017,,812443_1,0.0,Neutral
1018,1018,first,837435_1,0.0,Neutral
1019,1019,first,140084_1,0.0,Neutral


# Text Blob

In [101]:
from textblob import TextBlob
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

textBlob_df = pd.read_csv('lUDozITit6w_Hfiltered.csv')
textBlob_df['Comments'] = textBlob_df['Comments'].astype(str)
textBlob_df['Sentiment Scores'] = ''
textBlob_df['Sentiment'] = ''

def preprocess_text(text):
    text = text.lower()    
    text = text.translate(str.maketrans('', '', string.punctuation))    
    tokens = text.split()

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

for index, row in textBlob_df.iterrows():
    comment = row['Comments']    
    preprocessed_comment = preprocess_text(comment)

    blob = TextBlob(preprocessed_comment)
    polarity = blob.sentiment.polarity

    textBlob_df.at[index, 'Sentiment Scores'] = polarity

    if polarity > 0:
        textBlob_df.at[index, 'Sentiment'] = 'Positive'
    elif polarity < 0:
        textBlob_df.at[index, 'Sentiment'] = 'Negative'
    else:
        textBlob_df.at[index, 'Sentiment'] = 'Neutral'


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
textBlob_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel a at after card discount so are you in...,309718_1,0.0,Neutral
1,1,i am in group looking forward for sale price,965018_1,0.0,Neutral
2,2,camera bar is not aluminium,20467_1,0.0,Neutral
3,3,can somebody tell me if this phone has heating...,835314_1,0.0,Neutral
4,4,group,741056_1,0.0,Neutral
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,-0.3,Negative
1017,1017,yo,812443_1,0.0,Neutral
1018,1018,first,837435_1,0.25,Positive
1019,1019,first,140084_1,0.25,Positive


In [103]:
textBlob_df.Sentiment.value_counts()

Neutral     587
Positive    325
Negative    109
Name: Sentiment, dtype: int64

In [104]:
textBlob_percentages = textBlob_df['Sentiment'].value_counts(normalize=True) * 100
textBlob_percentages

Neutral     57.492654
Positive    31.831538
Negative    10.675808
Name: Sentiment, dtype: float64

# AFINN Model

In [81]:
import pandas as pd
from afinn import Afinn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

afinn_df = pd.read_csv("lUDozITit6w_Hfiltered.csv")

afinn_df['Comments'] = afinn_df['Comments'].astype(str)
afinn = Afinn()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

def get_sentiment(text):
    sentiment_score = afinn.score(text)
    if sentiment_score > 0:
        return "Positive"
    elif sentiment_score < 0:
        return "Negative"
    else:
        return "Neutral"

afinn_df["Sentiment"] = afinn_df["Comments"].apply(preprocess_text).apply(get_sentiment)
afinn_df["Sentiment Scores"] = afinn_df["Comments"].apply(preprocess_text).apply(afinn.score)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [82]:
afinn_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment,Sentiment Scores
0,0,pixel a at after card discount so are you in...,309718_1,Neutral,0.0
1,1,i am in group looking forward for sale price,965018_1,Neutral,0.0
2,2,camera bar is not aluminium,20467_1,Neutral,0.0
3,3,can somebody tell me if this phone has heating...,835314_1,Neutral,0.0
4,4,group,741056_1,Neutral,0.0
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,Neutral,0.0
1017,1017,yo,812443_1,Neutral,0.0
1018,1018,first,837435_1,Neutral,0.0
1019,1019,first,140084_1,Neutral,0.0


In [83]:
afinn_df.Sentiment.value_counts()

Neutral     528
Positive    354
Negative    139
Name: Sentiment, dtype: int64

In [84]:
afinn_percentages = afinn_df['Sentiment'].value_counts(normalize=True) * 100
afinn_percentages

Neutral     51.714006
Positive    34.671890
Negative    13.614104
Name: Sentiment, dtype: float64

In [85]:
afinn_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment,Sentiment Scores
0,0,pixel a at after card discount so are you in...,309718_1,Neutral,0.0
1,1,i am in group looking forward for sale price,965018_1,Neutral,0.0
2,2,camera bar is not aluminium,20467_1,Neutral,0.0
3,3,can somebody tell me if this phone has heating...,835314_1,Neutral,0.0
4,4,group,741056_1,Neutral,0.0
...,...,...,...,...,...
1016,1016,so you upload videos late night,410410_1,Neutral,0.0
1017,1017,yo,812443_1,Neutral,0.0
1018,1018,first,837435_1,Neutral,0.0
1019,1019,first,140084_1,Neutral,0.0


# Voting and Finalising +ve,-ve and neutral comments

In [86]:
comments = pd.read_csv('lUDozITit6w_Hfiltered.csv', error_bad_lines=False)
comment_ids = list(comments['Comment ID'].to_list())

In [87]:
vote_df = pd.DataFrame()
for comment_id in comment_ids:
    
    pos_count = 0
    neg_count = 0
    neu_count = 0
    
    vader_sentimet_pred = vader_df.loc[vader_df['Comment ID'] == comment_id, 'Sentiment'].values 
    pos_count += sum(vader_sentimet_pred == 'Positive')
    neg_count += sum(vader_sentimet_pred == 'Negative')
    neu_count += sum(vader_sentimet_pred == 'Neutral')
    
    textblob_sentimet_pred = textBlob_df.loc[textBlob_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(textblob_sentimet_pred == 'Positive')
    neg_count += sum(textblob_sentimet_pred == 'Negative')
    neu_count += sum(textblob_sentimet_pred == 'Neutral')
    
    afinn_sentimet_pred = afinn_df.loc[afinn_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(afinn_sentimet_pred == 'Positive')
    neg_count += sum(afinn_sentimet_pred == 'Negative')
    neu_count += sum(afinn_sentimet_pred == 'Neutral')
    
    vote_df = vote_df.append({'Comment ID': comment_id, 'Positive': pos_count, 'Negative': neg_count, 'Neutral': neu_count}, ignore_index=True)

vote_df = vote_df.fillna(0)

vote_df['Positive'] = vote_df['Positive'].astype(int)
vote_df['Negative'] = vote_df['Negative'].astype(int)
vote_df['Neutral'] = vote_df['Neutral'].astype(int)


In [88]:
vote_df

Unnamed: 0,Comment ID,Positive,Negative,Neutral
0,309718_1,0,0,3
1,965018_1,0,0,3
2,20467_1,0,0,3
3,835314_1,0,0,3
4,741056_1,0,0,3
...,...,...,...,...
1016,410410_1,0,1,2
1017,812443_1,0,0,3
1018,837435_1,1,0,2
1019,140084_1,1,0,2


In [89]:
max_sentiment = vote_df[['Positive', 'Negative', 'Neutral']].idxmax(axis=1)

# Split the dataframe into separate dataframes based on the maximum sentiment
positive_df = vote_df[max_sentiment == 'Positive'][['Comment ID']]
negative_df = vote_df[max_sentiment == 'Negative'][['Comment ID']]
neutral_df = vote_df[max_sentiment == 'Neutral'][['Comment ID']]

In [90]:
positive_df

Unnamed: 0,Comment ID
6,143282_1
9,588437_1
10,111840_1
12,257319_1
14,324978_1
...,...
1005,281460_1
1007,435537_1
1008,741909_1
1012,145288_1


In [91]:
neutral_df

Unnamed: 0,Comment ID
0,309718_1
1,965018_1
2,20467_1
3,835314_1
4,741056_1
...,...
1016,410410_1
1017,812443_1
1018,837435_1
1019,140084_1


In [92]:
negative_df

Unnamed: 0,Comment ID
22,580487_1
25,881486_1
26,381478_1
27,632635_1
51,267487_1
...,...
965,631779_1
977,56872_1
979,308918_1
982,424473_1


# Mapping 

In [93]:
master = pd.read_csv("HighLevel_lUDozITit6w_master.csv", encoding='utf-8')

In [94]:
master

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",309718_1
1,1,I am in group 1.5 Looking forward for sale price,965018_1
2,2,Camera bar is not aluminium,20467_1
3,3,Can somebody tell me if this phone has heating...,835314_1
4,4,Group 2,741056_1
...,...,...,...
1016,1016,So you upload videos late night,410410_1
1017,1017,Yo 1,812443_1
1018,1018,First,837435_1
1019,1019,First,140084_1


In [95]:
postive_comments = pd.merge(master, positive_df, on='Comment ID', how='inner')
postive_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,6,Because it's not true.,143282_1
1,9,"Kindly make videos about their service, feedba...",588437_1
2,10,"Hey Rupesh, \r\nI own a OnePlus 8t and had a v...",111840_1
3,12,I wanna buy a new phone. But my vivo that i bo...,257319_1
4,14,costs at least 15k more than my current almost...,324978_1
...,...,...,...
361,1005,Nice video sir,281460_1
362,1007,Beebom also fixed their videos and are finally...,435537_1
363,1008,Nice improvement 😸..but little expensive,741909_1
364,1012,Pixel is perfect ❤,145288_1


In [96]:
negative_comments = pd.merge(master, negative_df, on='Comment ID', how='inner')
negative_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,22,5:29 wireless charging is limited to 7.5 watts...,580487_1
1,25,"ghanta still heating issue, bad performanc, ba...",881486_1
2,26,Shame no SD card slot 💳 😭,381478_1
3,27,"Overheating phone guys , don't believe these b...",632635_1
4,51,"Poor performance, poor efficiency. bad device ...",267487_1
...,...,...,...
119,965,i am neutral bcoz I'll wait for big billion sales,631779_1
120,977,"I'm in group 1 until BBD, then I'll be in group 2",56872_1
121,979,"""Google is the worst company on Earth at keepi...",308918_1
122,982,What is this stupid obsession with glass backs...,424473_1


In [97]:
neutral_comments = pd.merge(master, neutral_df, on='Comment ID', how='inner')
neutral_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",309718_1
1,1,I am in group 1.5 Looking forward for sale price,965018_1
2,2,Camera bar is not aluminium,20467_1
3,3,Can somebody tell me if this phone has heating...,835314_1
4,4,Group 2,741056_1
...,...,...,...
526,1016,So you upload videos late night,410410_1
527,1017,Yo 1,812443_1
528,1018,First,837435_1
529,1019,First,140084_1


In [98]:
vote_df

Unnamed: 0,Comment ID,Positive,Negative,Neutral
0,309718_1,0,0,3
1,965018_1,0,0,3
2,20467_1,0,0,3
3,835314_1,0,0,3
4,741056_1,0,0,3
...,...,...,...,...
1016,410410_1,0,1,2
1017,812443_1,0,0,3
1018,837435_1,1,0,2
1019,140084_1,1,0,2


In [99]:
import pandas as pd


sentiment_df = vote_df[['Comment ID', 'Positive', 'Negative', 'Neutral']].copy()
sentiment_df['Sentiment'] = sentiment_df[['Positive', 'Negative', 'Neutral']].idxmax(axis=1)

merged_df = pd.merge(master, sentiment_df, left_on='Comment ID', right_on='Comment ID', how='left')
new_df = merged_df[['Comment ID', 'Comments', 'Sentiment']].copy()


In [100]:
new_df.Sentiment.value_counts()

Neutral     531
Positive    366
Negative    124
Name: Sentiment, dtype: int64