<a href="https://colab.research.google.com/github/souparnabose99/Sentiment-Analysis-NLTK/blob/main/Movie_Sentiment_Analysis_NLTK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Dataset & Libraries:

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

!wget https://raw.githubusercontent.com/souparnabose99/Sentiment-Analysis-NLTK/main/moviereviews.tsv

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
--2021-06-26 08:52:49--  https://raw.githubusercontent.com/souparnabose99/Sentiment-Analysis-NLTK/main/moviereviews.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 



200 OK
Length: 7571363 (7.2M) [text/plain]
Saving to: ‘moviereviews.tsv’


2021-06-26 08:52:49 (19.1 MB/s) - ‘moviereviews.tsv’ saved [7571363/7571363]



In [2]:
# Lift the cap on how many columns are shown when a frame is displayed.
# The canonical (lowercase) option key is used so the lookup matches the
# registered option directly instead of relying on pattern matching.
pd.set_option('display.max_columns', None)
# The dataset is tab-separated, hence sep='\t'.
df = pd.read_csv('moviereviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
# Column dtypes and non-null counts; a non-null count below the row count
# signals missing values in that column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1965 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [4]:
# Per-column summary (count / unique / top / freq for these object columns).
df.describe()

Unnamed: 0,label,review
count,2000,1965.0
unique,2,1939.0
top,pos,
freq,1000,27.0


### Check for blanks & missing values:

In [5]:
# Number of missing entries in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [6]:
# (rows, columns) before any rows are removed.
df.shape

(2000, 2)

In [7]:
# Discard every row that contains a missing value, then confirm the new size.
df = df.dropna(axis='index')
df.shape

(1965, 2)

In [8]:
# Collect the index labels of reviews that consist solely of whitespace.
# Scanning the 'review' column directly (rather than unpacking each row tuple
# into a fixed number of names) keeps this cell runnable even after extra
# columns have been added to df further down the notebook.
is_blank = df['review'].map(
    lambda rev: isinstance(rev, str) and rev.isspace()
).astype(bool)
blanks = df.index[is_blank].tolist()

In [9]:
# Index labels of the whitespace-only reviews collected in the previous cell.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [10]:
# Class balance before the whitespace-only reviews are removed.
df['label'].value_counts()

neg    983
pos    982
Name: label, dtype: int64

In [11]:
# Remove the whitespace-only reviews listed in `blanks`. Filtering by index
# membership makes the cell safe to re-run: df.drop(blanks) raises KeyError
# once those labels have already been removed. .copy() gives an independent
# frame so later column assignments do not act on a view of the old one.
df = df.loc[~df.index.isin(blanks)].copy()
df['label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [12]:
# Single VADER analyzer instance, reused for every review scored below.
sia = SentimentIntensityAnalyzer()

### Create scores & compound scores:

In [13]:
# Score each review with VADER; every entry of 'scores' is the dict returned
# by polarity_scores for that review.
df['scores'] = df['review'].apply(sia.polarity_scores)
df.head()

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com..."


In [14]:
# Pull the 'compound' value out of each score dict into its own column.
df['compound'] = [score_dict.get('compound') for score_dict in df['scores']]
df.head()

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264


In [15]:
# Turn the compound score into a binary label: scores of 0 or above count as
# 'pos', everything else as 'neg'.
is_non_negative = df['compound'].ge(0)
df['compound_label'] = is_non_negative.map({True: 'pos', False: 'neg'})
df.head()

Unnamed: 0,label,review,scores,compound,compound_label
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Share of reviews whose VADER-derived label equals the ground-truth label.
accuracy_score(df['label'], df['compound_label'])

0.6367389060887513

In [17]:
# Per-class precision / recall / F1; first argument is the true label,
# second the predicted one. print() renders the report's line breaks.
print(classification_report(df['label'], df['compound_label']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [18]:
# Rows are true labels and columns are predicted labels, both ordered
# neg, pos (consistent with the recall/precision figures reported above).
print(confusion_matrix(df['label'], df['compound_label']))

[[427 542]
 [162 807]]
