# Amazon Reviews sentiment classification

In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk
# Fetch the VADER lexicon resource; nltk reports it as up-to-date when it is
# already present locally.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\91731\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Import VADER's SentimentIntensityAnalyzer and create one analyzer instance
# (`sia`) that the following cells use to score review text.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

VADER's `SentimentIntensityAnalyzer.polarity_scores()` method takes in a string and returns a dictionary of scores in each of four categories:
* negative
* neutral
* positive
* compound *(the sum of the rule-adjusted word valence scores, normalized to lie between -1 and +1)*

In [4]:
# Sanity check: score one hand-written sentence and display the resulting
# neg / neu / pos / compound dictionary.
any_random_review = 'That was a great movie!!!!!!!'
sia.polarity_scores(any_random_review)

{'neg': 0.0, 'neu': 0.363, 'pos': 0.637, 'compound': 0.7405}

In [5]:
# Sentiment analysis on Amazon reviews: load the tab-separated review file
# into a DataFrame and preview its first rows.
df = pd.read_csv('UPDATED_NLP_COURSE/TextFiles/amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [6]:
# Count how many reviews carry each ground-truth label.
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [7]:
# Collect the index labels of reviews that consist solely of whitespace.
# Each itertuples() row unpacks to (index, label, review); the isinstance check
# skips non-string entries, which have no .isspace() method.
# Fix: the original appended to an undefined name `blanks`, which raised a
# NameError as soon as the first whitespace-only review was encountered.
blank = [i for i, lb, rv in df.itertuples() if isinstance(rv, str) and rv.isspace()]

# Remove those rows so that empty reviews are not scored later.
df.drop(blank, inplace=True)

In [8]:
# Re-count the labels after the whitespace-only reviews have been dropped.
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [9]:
# Score every review with VADER; each cell of `scores` holds the full
# polarity dictionary returned for that review.
df['scores'] = df['review'].apply(sia.polarity_scores)

# Show the score dictionary of the row labelled 1.
df.loc[1, 'scores']

{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'compound': 0.8957}

In [10]:
# Pull the 'compound' value out of each score dictionary into its own column.
df['compound'] = [entry['compound'] for entry in df['scores']]
df['compound'].head()

0    0.9454
1    0.8957
2    0.9858
3    0.9814
4    0.9781
Name: compound, dtype: float64

In [11]:
# Turn the compound score into a predicted label: non-negative -> 'pos',
# anything else -> 'neg'.
df['comp_score'] = np.where(df['compound'] >= 0, 'pos', 'neg')
df['comp_score'].head()

0    pos
1    pos
2    pos
3    pos
4    pos
Name: comp_score, dtype: object

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Compare the VADER-derived labels against the ground-truth labels.
print(classification_report(y_true=df['label'], y_pred=df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [13]:
# Confusion matrix of ground-truth labels versus VADER-derived labels
# (per scikit-learn: rows are true labels, columns are predicted labels).
print(confusion_matrix(y_true=df['label'], y_pred=df['comp_score']))

[[2629 2468]
 [ 435 4468]]


# Movie Reviews sentiment classification

In [14]:
# Sentiment analysis on movie reviews: load the tab-separated review file.
# NOTE: this rebinds `df`, replacing the Amazon DataFrame used in the cells above.
df = pd.read_csv('UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [15]:
# Inspect dtypes and non-null counts to reveal missing values per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [16]:
# Count how many reviews carry each ground-truth label before any cleaning.
df['label'].value_counts()

pos    1000
neg    1000
Name: label, dtype: int64

In [17]:
# Discard, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

In [18]:
# Gather the index labels of rows whose review text is nothing but whitespace.
# itertuples() yields (index, label, review); only `str` values are tested.
blank = [
    idx
    for idx, _label, text in df.itertuples()
    if type(text) is str and text.isspace()
]
blank

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [19]:
# Remove the whitespace-only reviews (row labels collected in `blank`) in place.
df.drop(index=blank, inplace=True)

In [20]:
# Re-count the labels after missing and whitespace-only reviews were removed.
df['label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [21]:
# NOTE(review): these imports and the analyzer instance repeat what the notebook
# already set up near the top; this cell only rebinds `sia` to a fresh
# SentimentIntensityAnalyzer.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [22]:
# Preview the first rows of the current DataFrame before scoring.
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [23]:
# Score every review with VADER; each cell of `scores` holds the full
# polarity dictionary returned for that review.
df['scores'] = df['review'].apply(sia.polarity_scores)
df.head()

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [24]:
# Show the score dictionary of the row labelled 0.
df.loc[0, 'scores']

{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'compound': -0.9125}

In [25]:
# Pull the 'compound' value out of each score dictionary into its own column.
df['compound'] = [entry['compound'] for entry in df['scores']]
df['compound'].head()

0   -0.9125
1   -0.8618
2    0.9951
3    0.9972
4   -0.2484
Name: compound, dtype: float64

In [26]:
# Turn the compound score into a predicted label: non-negative -> 'pos',
# anything else -> 'neg'.
df['comp_score'] = ['pos' if value >= 0 else 'neg' for value in df['compound']]
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [27]:
# Compare the VADER-derived labels against the ground-truth labels.
# Relies on `classification_report` imported in an earlier cell of this notebook.
print(classification_report(y_true=df['label'], y_pred=df['comp_score']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938

