In [1]:
#Import pandas library
import pandas as pd
import numpy as np
import re

#Import TextBlob library
from textblob import TextBlob

# Import VADER sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Import the movie review data as a data frame and ensure that the data is loaded properly.

In [2]:
# For train
df_review = pd.read_csv('labeledTrainData.tsv',sep='\t')

In [3]:
df_review.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


# How many of each positive and negative reviews are there?

In [5]:
df_review.describe()

Unnamed: 0,sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


By observing the mean, we can conclude that we have 50% positive reviews and 50% negative reviews

# Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

In [6]:
# Get sentiment score for each review
df_review['review_scores'] = df_review['review'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [7]:
# Predict sentiment label for each review
df_review['review_predictivity'] = df_review['review_scores'].apply(lambda x: 1 if x >=0 else 0)

In [8]:
df_review

Unnamed: 0,id,sentiment,review,review_scores,review_predictivity
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0
...,...,...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...,0.102083,1
24996,5064_1,0,I don't believe they made this film. Completel...,0.090813,1
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil...",0.145256,1
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...,0.065625,1


# Check the accuracy of this model. Is this model better than random guessing?

In [9]:
# To check if the model is accurate, we need to compare between sentiment and review_preditivity.

# Import accuracy_score to check performance
from sklearn.metrics import accuracy_score

accuracy_score(df_review['sentiment'],df_review['review_predictivity'])

0.68524

# For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).

In [None]:
# Get sentiment score for each review
vader_sentiment = SentimentIntensityAnalyzer()
df_review['VADER_review_scores'] = df_review['review'].apply(lambda x: vader_sentiment.polarity_scores(x)['compound'])

In [None]:
# Predict sentiment label for each review
df_review['VADER_review_prediction'] = df_review['VADER_review_scores'].apply(lambda x: 1 if x >=0.05 else 0)

In [16]:
df_review['VADER_review_prediction']

0        0
1        1
2        0
3        0
4        1
        ..
24995    1
24996    1
24997    1
24998    0
24999    1
Name: VADER_review_prediction, Length: 25000, dtype: int64

In [17]:
df_review.head()

Unnamed: 0,id,sentiment,review,review_scores,review_predictivity,VADER_review_scores,VADER_review_prediction
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,-0.8879,0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,0.9736,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,-0.9883,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,-0.1202,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.6115,1


In [15]:
# Compare Actual score and Predicted VADER score
accuracy_score(df_review['sentiment'],df_review['VADER_review_prediction'])

0.69556

Accuracy of VADER and TextBlob is 69.56% and 68.52% respectively