# VADER

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('vader_lexicon')
%matplotlib inline
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer

warnings.filterwarnings("ignore")

import os

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [70]:
vader_df = pd.read_csv('lUDozITit6w_Hfiltered.csv', error_bad_lines=False)

In [71]:
vader_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,pixel a at after card discount so are you in g...,460911_1
1,1,camera bar is not aluminium,520609_1
2,2,can somebody tell me if this phone has heating...,595046_1
3,3,group,876569_1
4,4,waiting for it to come down to or lower,801723_1
...,...,...,...
1007,1015,so you upload videos late night,263817_1
1008,1016,yo,131094_1
1009,1017,first,985076_1
1010,1018,first,118681_1


In [72]:
vader_df['Comments'] = vader_df['Comments'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
vader_df['Comments'] = vader_df['Comments'].apply(lambda x:x.lower())

tokenized_tweet = vader_df['Comments'].apply(lambda x: x.split())

wnl = WordNetLemmatizer()

tokenized_tweet.apply(lambda x: [wnl.lemmatize(i) for i in x if i not in set(stopwords.words('english'))]) 
tokenized_tweet.head()

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    
    
vader_df['Comments'] = tokenized_tweet

sia = SentimentIntensityAnalyzer()
vader_df['Sentiment Scores'] = vader_df['Comments'].apply(lambda x:sia.polarity_scores(x)['compound'])
vader_df['Sentiment'] = vader_df['Sentiment Scores'].apply(lambda s : 'Positive' if s > 0 else ('Neutral' if s == 0 else 'Negative'))

In [73]:
vader_df.Sentiment.value_counts()

Neutral     522
Positive    366
Negative    124
Name: Sentiment, dtype: int64

In [74]:
vader_percentages = vader_df['Sentiment'].value_counts(normalize=True) * 100
vader_percentages

Neutral     51.581028
Positive    36.166008
Negative    12.252964
Name: Sentiment, dtype: float64

In [75]:
vader_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel after card discount group group,460911_1,0.000,Neutral
1,1,camera aluminium,520609_1,0.000,Neutral
2,2,somebody tell this phone heating issues,595046_1,0.000,Neutral
3,3,group,876569_1,0.000,Neutral
4,4,waiting come down lower,801723_1,-0.296,Negative
...,...,...,...,...,...
1007,1015,upload videos late night,263817_1,0.000,Neutral
1008,1016,,131094_1,0.000,Neutral
1009,1017,first,985076_1,0.000,Neutral
1010,1018,first,118681_1,0.000,Neutral


# Text Blob

In [76]:
from textblob import TextBlob
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

textBlob_df = pd.read_csv('lUDozITit6w_Hfiltered.csv')
textBlob_df['Sentiment Scores'] = ''
textBlob_df['Sentiment'] = ''

def preprocess_text(text):
    text = text.lower()    
    text = text.translate(str.maketrans('', '', string.punctuation))    
    tokens = text.split()

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

for index, row in textBlob_df.iterrows():
    comment = row['Comments']    
    preprocessed_comment = preprocess_text(comment)

    blob = TextBlob(preprocessed_comment)
    polarity = blob.sentiment.polarity

    textBlob_df.at[index, 'Sentiment Scores'] = polarity

    if polarity > 0:
        textBlob_df.at[index, 'Sentiment'] = 'Positive'
    elif polarity < 0:
        textBlob_df.at[index, 'Sentiment'] = 'Negative'
    else:
        textBlob_df.at[index, 'Sentiment'] = 'Neutral'


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
textBlob_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment Scores,Sentiment
0,0,pixel a at after card discount so are you in g...,460911_1,0.0,Neutral
1,1,camera bar is not aluminium,520609_1,0.0,Neutral
2,2,can somebody tell me if this phone has heating...,595046_1,0.0,Neutral
3,3,group,876569_1,0.0,Neutral
4,4,waiting for it to come down to or lower,801723_1,0.0,Neutral
...,...,...,...,...,...
1007,1015,so you upload videos late night,263817_1,-0.3,Negative
1008,1016,yo,131094_1,0.0,Neutral
1009,1017,first,985076_1,0.25,Positive
1010,1018,first,118681_1,0.25,Positive


In [78]:
textBlob_df.Sentiment.value_counts()

Neutral     583
Positive    326
Negative    103
Name: Sentiment, dtype: int64

In [79]:
textBlob_percentages = textBlob_df['Sentiment'].value_counts(normalize=True) * 100
textBlob_percentages

Neutral     57.608696
Positive    32.213439
Negative    10.177866
Name: Sentiment, dtype: float64

# AFINN Model

In [80]:
import pandas as pd
from afinn import Afinn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

afinn_df = pd.read_csv("lUDozITit6w_Hfiltered.csv")

afinn = Afinn()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

def get_sentiment(text):
    sentiment_score = afinn.score(text)
    if sentiment_score > 0:
        return "Positive"
    elif sentiment_score < 0:
        return "Negative"
    else:
        return "Neutral"

afinn_df["Sentiment"] = afinn_df["Comments"].apply(preprocess_text).apply(get_sentiment)
afinn_df["Sentiment Scores"] = afinn_df["Comments"].apply(preprocess_text).apply(afinn.score)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/satyasasivatsal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [81]:
afinn_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment,Sentiment Scores
0,0,pixel a at after card discount so are you in g...,460911_1,Neutral,0.0
1,1,camera bar is not aluminium,520609_1,Neutral,0.0
2,2,can somebody tell me if this phone has heating...,595046_1,Neutral,0.0
3,3,group,876569_1,Neutral,0.0
4,4,waiting for it to come down to or lower,801723_1,Neutral,0.0
...,...,...,...,...,...
1007,1015,so you upload videos late night,263817_1,Neutral,0.0
1008,1016,yo,131094_1,Neutral,0.0
1009,1017,first,985076_1,Neutral,0.0
1010,1018,first,118681_1,Neutral,0.0


In [82]:
afinn_df.Sentiment.value_counts()

Neutral     523
Positive    357
Negative    132
Name: Sentiment, dtype: int64

In [83]:
afinn_percentages = afinn_df['Sentiment'].value_counts(normalize=True) * 100
afinn_percentages

Neutral     51.679842
Positive    35.276680
Negative    13.043478
Name: Sentiment, dtype: float64

In [84]:
afinn_df

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID,Sentiment,Sentiment Scores
0,0,pixel a at after card discount so are you in g...,460911_1,Neutral,0.0
1,1,camera bar is not aluminium,520609_1,Neutral,0.0
2,2,can somebody tell me if this phone has heating...,595046_1,Neutral,0.0
3,3,group,876569_1,Neutral,0.0
4,4,waiting for it to come down to or lower,801723_1,Neutral,0.0
...,...,...,...,...,...
1007,1015,so you upload videos late night,263817_1,Neutral,0.0
1008,1016,yo,131094_1,Neutral,0.0
1009,1017,first,985076_1,Neutral,0.0
1010,1018,first,118681_1,Neutral,0.0


# Voting and Finalising +ve,-ve and neutral comments

In [85]:
comments = pd.read_csv('lUDozITit6w_Hfiltered.csv', error_bad_lines=False)
comment_ids = list(comments['Comment ID'].to_list())

In [86]:
vote_df = pd.DataFrame()
for comment_id in comment_ids:
    
    pos_count = 0
    neg_count = 0
    neu_count = 0
    
    vader_sentimet_pred = vader_df.loc[vader_df['Comment ID'] == comment_id, 'Sentiment'].values 
    pos_count += sum(vader_sentimet_pred == 'Positive')
    neg_count += sum(vader_sentimet_pred == 'Negative')
    neu_count += sum(vader_sentimet_pred == 'Neutral')
    
    textblob_sentimet_pred = textBlob_df.loc[textBlob_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(textblob_sentimet_pred == 'Positive')
    neg_count += sum(textblob_sentimet_pred == 'Negative')
    neu_count += sum(textblob_sentimet_pred == 'Neutral')
    
    afinn_sentimet_pred = afinn_df.loc[afinn_df['Comment ID'] == comment_id, 'Sentiment'].values
    pos_count += sum(afinn_sentimet_pred == 'Positive')
    neg_count += sum(afinn_sentimet_pred == 'Negative')
    neu_count += sum(afinn_sentimet_pred == 'Neutral')
    
    vote_df = vote_df.append({'Comment ID': comment_id, 'Positive': pos_count, 'Negative': neg_count, 'Neutral': neu_count}, ignore_index=True)

vote_df = vote_df.fillna(0)

vote_df['Positive'] = vote_df['Positive'].astype(int)
vote_df['Negative'] = vote_df['Negative'].astype(int)
vote_df['Neutral'] = vote_df['Neutral'].astype(int)


In [87]:
vote_df

Unnamed: 0,Comment ID,Positive,Negative,Neutral
0,460911_1,0,0,3
1,520609_1,0,0,3
2,595046_1,0,0,3
3,876569_1,0,0,3
4,801723_1,0,1,2
...,...,...,...,...
1007,263817_1,0,1,2
1008,131094_1,0,0,3
1009,985076_1,1,0,2
1010,118681_1,1,0,2


In [88]:
max_sentiment = vote_df[['Positive', 'Negative', 'Neutral']].idxmax(axis=1)

# Split the dataframe into separate dataframes based on the maximum sentiment
positive_df = vote_df[max_sentiment == 'Positive'][['Comment ID']]
negative_df = vote_df[max_sentiment == 'Negative'][['Comment ID']]
neutral_df = vote_df[max_sentiment == 'Neutral'][['Comment ID']]

In [89]:
positive_df

Unnamed: 0,Comment ID
5,630401_1
8,660092_1
9,690546_1
11,508923_1
13,403326_1
...,...
996,992176_1
998,409132_1
999,477558_1
1003,36692_1


In [90]:
neutral_df

Unnamed: 0,Comment ID
0,460911_1
1,520609_1
2,595046_1
3,876569_1
4,801723_1
...,...
1007,263817_1
1008,131094_1
1009,985076_1
1010,118681_1


In [91]:
negative_df

Unnamed: 0,Comment ID
16,610306_1
21,72677_1
24,257863_1
25,764591_1
26,23951_1
...,...
952,211660_1
954,2021_1
971,960588_1
974,869643_1


# Mapping 

In [92]:
master = pd.read_csv("HighLevel_lUDozITit6w_master.csv", encoding='utf-8')

In [93]:
master

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",460911_1
1,1,Camera bar is not aluminium,520609_1
2,2,Can somebody tell me if this phone has heating...,595046_1
3,3,Group 2,876569_1
4,4,Waiting for it to come down to 29999 or lower 😅,801723_1
...,...,...,...
1015,1015,So you upload videos late night,263817_1
1016,1016,Yo 1,131094_1
1017,1017,First,985076_1
1018,1018,First,118681_1


In [94]:
postive_comments = pd.merge(master, positive_df, on='Comment ID', how='inner')
postive_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,5,Because it's not true.,630401_1
1,8,"Kindly make videos about their service, feedba...",660092_1
2,9,"Hey Rupesh, \r\nI own a OnePlus 8t and had a v...",690546_1
3,11,I wanna buy a new phone. But my vivo that i bo...,508923_1
4,13,costs at least 15k more than my current almost...,403326_1
...,...,...,...
360,1004,Nice video sir,992176_1
361,1006,Beebom also fixed their videos and are finally...,409132_1
362,1007,Nice improvement 😸..but little expensive,477558_1
363,1011,Pixel is perfect ❤,36692_1


In [95]:
negative_comments = pd.merge(master, negative_df, on='Comment ID', how='inner')
negative_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,16,I am in the i exchanged my 6a and brought an 7...,610306_1
1,21,5:29 wireless charging is limited to 7.5 watts...,72677_1
2,24,"ghanta still heating issue, bad performanc, ba...",257863_1
3,25,Shame no SD card slot 💳 😭,764591_1
4,26,"Overheating phone guys , don't believe these b...",23951_1
...,...,...,...
115,959,Pixel 7 is already at 44k on Amazon 🤣\nWhat is...,211660_1
116,961,I would be stupid big time to buy this at 45k ...,2021_1
117,978,"""Google is the worst company on Earth at keepi...",960588_1
118,981,What is this stupid obsession with glass backs...,869643_1


In [96]:
neutral_comments = pd.merge(master, neutral_df, on='Comment ID', how='inner')
neutral_comments

Unnamed: 0.1,Unnamed: 0,Comments,Comment ID
0,0,"Pixel 7a at ₹43,999 (₹39,999 after card discou...",460911_1
1,1,Camera bar is not aluminium,520609_1
2,2,Can somebody tell me if this phone has heating...,595046_1
3,3,Group 2,876569_1
4,4,Waiting for it to come down to 29999 or lower 😅,801723_1
...,...,...,...
522,1015,So you upload videos late night,263817_1
523,1016,Yo 1,131094_1
524,1017,First,985076_1
525,1018,First,118681_1


In [97]:
vote_df

Unnamed: 0,Comment ID,Positive,Negative,Neutral
0,460911_1,0,0,3
1,520609_1,0,0,3
2,595046_1,0,0,3
3,876569_1,0,0,3
4,801723_1,0,1,2
...,...,...,...,...
1007,263817_1,0,1,2
1008,131094_1,0,0,3
1009,985076_1,1,0,2
1010,118681_1,1,0,2


In [98]:
import pandas as pd


sentiment_df = vote_df[['Comment ID', 'Positive', 'Negative', 'Neutral']].copy()
sentiment_df['Sentiment'] = sentiment_df[['Positive', 'Negative', 'Neutral']].idxmax(axis=1)

merged_df = pd.merge(master, sentiment_df, left_on='Comment ID', right_on='Comment ID', how='left')
new_df = merged_df[['Comment ID', 'Comments', 'Sentiment']].copy()


In [99]:
new_df.Sentiment.value_counts()

Neutral     527
Positive    365
Negative    120
Name: Sentiment, dtype: int64