# **Movie Review Project - Sentiment Analysis**
---
---

## **Import Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np

### **Mounting Google Drive**

In [30]:
from google.colab import drive

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset stored under MyDrive can be read by the load cell below.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Load Data**

In [4]:
# Read the tab-separated movie-review dataset from the mounted Drive folder.
df = pd.read_csv(
    "/content/drive/MyDrive/Jose Potilla | NLP/Data/moviereviews.tsv",
    sep="\t",
)

In [5]:
# Preview the first rows to confirm the label/review columns parsed correctly.
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


### **Preliminary Data Analysis (PDA)**

In [6]:
# Check the shape (rows, columns) of the freshly loaded dataset.

df.shape

(2000, 2)

In [7]:
# Drop every row that contains a missing (NaN) value, modifying df in place.
df.dropna(inplace = True)

In [8]:
# Re-inspect the first rows after the rows with missing values were removed.
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [9]:
# Confirm how many rows remain once the NaN rows are gone.
df.shape

(1965, 2)

**Remove the rows whose review is blank (contains only whitespace)**

In [10]:


# Collect the index labels of reviews that consist solely of whitespace.
# Iterating over the 'review' column alone (instead of unpacking
# df.itertuples() into exactly three names) keeps this cell re-runnable after
# later cells add more columns to df; isinstance() skips any non-string values.
blanks = [
    idx
    for idx, rev in df['review'].items()
    if isinstance(rev, str) and rev.isspace()
]

In [11]:
# Display the index labels that were flagged as whitespace-only reviews.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [12]:
# Example: show the first whitespace-only review that was flagged.
# `blanks` stores index *labels*, and dropna() has already removed rows, so
# positions no longer line up with labels; label-based .loc is therefore used
# instead of positional .iloc. Slicing with blanks[:1] avoids a hard-coded
# label and yields an empty frame (rather than an error) if nothing was flagged.

df.loc[blanks[:1]]

Unnamed: 0,label,review
57,neg,


In [13]:
# Remove the whitespace-only reviews by index label, modifying df in place.
# errors="ignore" makes the cell idempotent: re-running it after the rows are
# already gone no longer raises KeyError.
df.drop(blanks, inplace=True, errors="ignore")

In [14]:
# Confirm the row count after the blank reviews were dropped.
df.shape

(1938, 2)

In [15]:
# Check the class balance of the remaining reviews (count per label).
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

## **Sentiment Analysis**

In [16]:
import nltk

In [17]:
# Fetch the 'vader_lexicon' resource used by the VADER analyzer created below.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### **VADER**

In [18]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [19]:
# Create one VADER analyzer instance; it is reused to score every review.
sid = SentimentIntensityAnalyzer()

In [20]:
# Score every review with VADER. Each entry of 'scores' is the dict returned
# by polarity_scores() (keys such as 'neg', 'neu', 'pos' and 'compound').
df['scores'] = df['review'].apply(sid.polarity_scores)

In [21]:
# Preview the frame with the new 'scores' column.
df.head()

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [22]:
# Pull the 'compound' value out of each score dict into its own column.
df['compound'] = df['scores'].map(lambda score_dict: score_dict['compound'])

In [23]:
# Preview the frame with the new 'compound' column.
df.head()

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484


In [24]:
# Turn the compound score into a predicted label:
# compound >= 0 -> 'pos', anything else -> 'neg'.
df['comp_score'] = (df['compound'] >= 0).map({True: 'pos', False: 'neg'})

In [25]:
# Preview the frame with the predicted 'comp_score' label next to 'label'.
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


## **Evaluation**

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
# Fraction of reviews whose VADER-derived label matches the true label.
accuracy_score(df['label'], df['comp_score'])

0.6357069143446853

In [28]:
# Per-class precision, recall and F1 (true labels first, predictions second).
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [29]:
# Confusion matrix: rows are the true labels and columns the predicted labels,
# both in the order ('neg', 'pos').
confusion_matrix(df['label'], df['comp_score'])

array([[427, 542],
       [164, 805]])