# **Sentiment Analysis on Movie Reviews**
##Movie review dataset of a popular review site

In [60]:
#importing libraries
import pandas as pd
import  numpy as np

In [61]:
#from google.colab import drive
#drive.mount('/content/drive')

In [62]:
#Reading the dataset
data = pd.read_csv("labeledTrainData.tsv", sep = "\t")

In [63]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [64]:
data.shape

(25000, 3)

The data set consist of 25000 rows and 3 columns

In [65]:
data.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

The Label 1 stands for Positive Reviews and label 0 stands for negative reviews.
If the movie rating is  less than 5, it is considered as a negative review (0) and if the movie rating is greater than or equal to 7,  it is considered as a positive review (1).
Here, we have 12500 movies with negative review and 12500 movies with positive review.

In [66]:
data.columns

Index(['id', 'sentiment', 'review'], dtype='object')

**Data Preprocessing:**

In [67]:
#Natural Language Tool Kit and Stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
#To remove HTML and XML strips
from bs4 import BeautifulSoup
import re

In [69]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [70]:
#Apply function on review column
data['review'] = data['review'].apply(denoise_text)

  soup = BeautifulSoup(text, "html.parser")


In [71]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


Removing special characters from the dataset:

In [72]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^ ^", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [73]:
#Apply function on review column
data['review'] = data['review'].apply(remove_special_characters)

In [74]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,The Classic War of the Worlds by Timothy Hine...
2,7759_3,0,The film starts with a manager Nicholas Bell g...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


Thus, the special characters are removed.Now we need to do stemming - to minimize the confusion around words that have similar meanings

In [75]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

In [76]:
#Apply function on review column
data['review']= data['review'].apply(simple_stemmer)

In [77]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

{'me', 'wouldn', 'ma', 'or', 'yourself', 'in', 'why', 've', 'll', 's', 'themselves', 'between', 'own', 'who', 'so', 'they', "shan't", 'about', 'only', 'because', 'but', 'below', 'off', 'nor', 'shouldn', 'this', 'be', "hadn't", "it's", 'to', 'once', 'at', 'some', 'what', 'i', 'these', 'their', 'such', 'both', 'him', 'myself', 'did', "mustn't", 'all', 'as', 'here', 'yourselves', 'you', 'am', 'being', "you'd", 'doesn', 'couldn', 'were', 'ain', 'which', 'd', 'the', 'didn', 'needn', "wasn't", "wouldn't", 'over', 'y', "should've", 'on', 'other', 'for', 'isn', 'through', "haven't", 'down', 'too', 'won', "don't", 'o', 'of', "she's", "aren't", 'wasn', 'has', 'theirs', 'up', 'don', 'aren', 'during', 'whom', 'your', 'with', 'most', 'it', 'our', 'mustn', 'that', 'had', 'hers', "needn't", 'itself', 'been', 'now', 'when', 'ours', "you've", 'we', 'was', 'them', 'ourselves', "didn't", 'he', 'doing', "isn't", 'against', 'weren', 'can', 'out', 'yours', 'an', 'than', 'she', 'any', 'each', 'and', 'my', 't

The above are the stop words found in the dataset.

**Tokenizing the text:**

In [79]:
from nltk.tokenize.toktok import ToktokTokenizer

In [80]:
tokenizer1=ToktokTokenizer()

In [81]:
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [82]:
#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer1.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [83]:
#Apply function on review column
data['review'] = data['review'].apply(remove_stopwords)

The stop words are also removed.

Now, the cleaning and preprocessing part got completed

**Feature engineering in text:**


In [84]:
data['review']

0        thi stuff go moment mj ive start listen hi mus...
1        classic war world timothi hine veri entertain ...
2        film start manag nichola bell give welcom inve...
3        must assum prais thi film greatest film opera ...
4        superbl trashi wondrous unpretenti 80 exploit ...
                               ...                        
24995    seem like consider ha gone imdb review thi fil...
24996    dont believ made thi film complet unnecessari ...
24997    guy loser cant get girl need build pick strong...
24998    thi 30 minut documentari buuel made earli 1930...
24999    saw thi movi child broke heart stori unfinish ...
Name: review, Length: 25000, dtype: object

In [85]:
all_words = " ".join(data['review'])

In [86]:
nltk.download("punkt")
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [87]:
all_words = word_tokenize(all_words)

In [88]:
#To find frequency of words
from nltk import FreqDist

In [89]:
dist = FreqDist(all_words)

In [90]:
dist

FreqDist({'thi': 73238, 'movi': 49626, 'wa': 47999, 'film': 46326, 'hi': 29240, 'one': 26324, 'like': 22075, 'ha': 16840, 'time': 15092, 'good': 14720, ...})

In [91]:
num_unique_word = len(dist)

In [92]:
num_unique_word

110996

Thus, there are 110996 unique words in the dataset

In [93]:
r_len = []

for text in data['review']:
  word = word_tokenize(text)
  l = len(word)
  r_len.append(l)

In [94]:
import numpy as np
MAX_REVIEW_LEN = np.max(r_len)

In [95]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [96]:
tokenizer = Tokenizer(num_words = num_unique_word)

In [97]:
### fit_on_texts
tokenizer.fit_on_texts(list(data['review']))

In [98]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,thi stuff go moment mj ive start listen hi mus...
1,2381_9,1,classic war world timothi hine veri entertain ...
2,7759_3,0,film start manag nichola bell give welcom inve...
3,3630_4,0,must assum prais thi film greatest film opera ...
4,9495_8,1,superbl trashi wondrous unpretenti 80 exploit ...


In [99]:
data["review"] = tokenizer.texts_to_sequences(data['review'])

In [100]:
data

Unnamed: 0,id,sentiment,review
0,5814_8,1,"[1, 491, 29, 183, 6843, 137, 95, 913, 5, 98, 1..."
1,2381_9,1,"[249, 232, 109, 3165, 5742, 13, 182, 4, 481, 2..."
2,7759_3,0,"[4, 95, 362, 3988, 2322, 65, 1826, 11819, 514,..."
3,3630_4,0,"[155, 1200, 1680, 1, 4, 743, 4, 1305, 64, 86, ..."
4,9495_8,1,"[3029, 3492, 29452, 8521, 687, 1094, 10314, 19..."
...,...,...,...
24995,3453_3,0,"[47, 7, 1924, 8, 760, 865, 242, 1, 4, 381, 110..."
24996,5064_1,0,"[40, 106, 42, 1, 4, 159, 1603, 36, 4, 3, 869, ..."
24997,10905_3,0,"[91, 2027, 110, 12, 105, 131, 641, 522, 2889, ..."
24998,10194_3,0,"[1, 896, 121, 540, 15600, 42, 356, 2113, 6, 40..."


In [101]:
from tensorflow.keras.preprocessing import sequence

In [102]:
#Defining X and y
X = sequence.pad_sequences(data['review'], 38)
y = data['sentiment']

In [103]:
from sklearn.model_selection import train_test_split

In [104]:
#Splitting to Train and Test data
x_train, x_test, y_train , y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

**Building Model:**

In [105]:
from tensorflow.keras.models import Sequential

In [106]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding

In [107]:
model = Sequential()
model.add(Embedding(input_dim = 110996, output_dim = 150, input_length = 38))

model.add(LSTM(128, dropout = 0.2 ))

model.add(Dense(1, activation = "sigmoid"))

In [108]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 38, 150)           16649400  
                                                                 
 lstm_1 (LSTM)               (None, 128)               142848    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 16,792,377
Trainable params: 16,792,377
Non-trainable params: 0
_________________________________________________________________


In [109]:
model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])

In [110]:
history1 = model.fit(x_train, y_train, epochs = 3, batch_size = 32)

Epoch 1/3
Epoch 2/3
Epoch 3/3


**Model Evaluation:**

In [111]:
model.evaluate(x_test, y_test)



[0.5374519228935242, 0.8055999875068665]

# **Thus, the accuracy of the model is seen as 80.56%**

**Model 2:**

In [112]:
model2 = Sequential()
model2.add(Embedding(input_dim = 110996, output_dim = 150, input_length = 38))

model2.add(LSTM(128, dropout = 0.2 ))

model2.add(Dense(1, activation = "sigmoid"))

In [113]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 38, 150)           16649400  
                                                                 
 lstm_2 (LSTM)               (None, 128)               142848    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 16,792,377
Trainable params: 16,792,377
Non-trainable params: 0
_________________________________________________________________


In [117]:
model2.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])

In [118]:
history2 = model2.fit(x_train, y_train, epochs = 20, batch_size = 32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [119]:
model2.evaluate(x_test, y_test)



[1.4984177350997925, 0.7802000045776367]

# The accuracy of the model "**model2**" is found to be 78.02%

**Model 3:**

In [121]:
model3 = Sequential()
model3.add(Embedding(input_dim = 110996, output_dim = 150, input_length = 38))

model3.add(LSTM(128, dropout = 0.2 ))

model3.add(Dense(1, activation = "relu"))

In [122]:
model3.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 38, 150)           16649400  
                                                                 
 lstm_4 (LSTM)               (None, 128)               142848    
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 16,792,377
Trainable params: 16,792,377
Non-trainable params: 0
_________________________________________________________________


In [123]:
model3.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])

In [125]:
history3 = model3.fit(x_train, y_train, epochs = 10, batch_size = 32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [126]:
model3.evaluate(x_test, y_test)



[1.9930142164230347, 0.8022000193595886]

# The accuracy of the model "**model3**" is found to be 80.22%

# Among the above 3 models, the one with name "**model**" shows the highest accuracy as 80.56% and this model can be considered