# Rotten Tomatoes movie review classifier using Keras and TensorFlow

## Author: [Dr. Rahul Remanan](https://www.linkedin.com/in/rahulremanan) {rahul@moad.computer}




*   [Kaggle Rotten Tomatoes datasets](https://www.kaggle.com/c/movie-review-sentiment-analysis-kernels-only/data)
* [This is a modified fork of the Kaggle kernel here](https://www.kaggle.com/nafisur/keras-models-lstm-cnn-gru-bidirectional-glove)


The dataset consists of tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each sentence has been parsed into many phrases by the Stanford parser. Each phrase has a `PhraseId`, and each sentence has a `SentenceId`. Phrases that are repeated (such as short/common words) are only included once in the data.

## [Open this notebook in Google CoLab](https://colab.research.google.com/github/rahulremanan/python_tutorial/blob/master/NLP/10-Sentiment_analysis/notebook/Rotten_Tomatoes_movie_reviews_classifier.ipynb)



## Upload Kaggle authentication token

Before downloading the data, ensure that you have accepted the [terms of the competition](https://www.kaggle.com/c/movie-review-sentiment-analysis-kernels-only/rules).

In [0]:
import os

In [0]:
# Run-time switches: colab_mode and download_rawData gate the Kaggle token
# upload/installation cells; setup gates the `pip install kaggle` cell.
colab_mode, download_rawData, setup = True, True, True

# Where the trained LSTM weights are written to and read back from.
ROOT_DIR = '/content/'
WEIGHTS_FILENAME = 'RT_LSTM.h5'
WEIGHTS_FILE = os.path.join(ROOT_DIR, WEIGHTS_FILENAME)

In [0]:
from google.colab import files

In [0]:
# Open Colab's file-upload widget so the Kaggle API token (kaggle.json) can be
# uploaded; skipped unless both colab_mode and download_rawData are set.
if colab_mode and download_rawData:
  files.upload()

In [0]:
if colab_mode and download_rawData:
  ! mkdir /root/.kaggle/
  ! mv /content/kaggle.json /root/.kaggle/

In [0]:
# Install the Kaggle command-line client (only when the setup switch is on);
# the dataset download cell relies on it.
if setup:
  ! pip install kaggle

## Download the Rotten Tomatoes movie reviews dataset

In [0]:
# Fetch the files of the movie-review-sentiment-analysis-kernels-only
# competition with the Kaggle CLI (requires the API token installed above).
! kaggle competitions download -c movie-review-sentiment-analysis-kernels-only

In [0]:
# Delete the Kaggle API token from /root/.kaggle/ once the download has run.
! rm /root/.kaggle/kaggle.json

In [0]:
! unzip -q /content/train.tsv.zip
! unzip -q /content/test.tsv.zip

## Import dependencies

In [0]:
import nltk
import os
import gc
import warnings
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")
# Never truncate DataFrame cell contents so whole phrases stay visible.
# None is the supported "no limit" value; negative widths are rejected by
# current pandas releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here; -1 meant "no limit".
    pd.set_option('display.max_colwidth', -1)

## Read the train data file

In [0]:
# Read the tab-separated training file, then show its shape and first rows.
train=pd.read_csv('/content/train.tsv',sep='\t')
print(train.shape)
train.head()

## Summarize the training data

### Get the [unique label values](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.unique.html) in the training data

The sentiment labels are:

* 0 - negative
* 1 - somewhat negative
* 2 - neutral
* 3 - somewhat positive
* 4 - positive

In [0]:
# Distinct values present in the Sentiment column.
train['Sentiment'].unique()

### Count the total number of training items

In [0]:
# Number of rows in the training set.
len(train['Sentiment'])

### Summarize the distribution of the sentiment classes

In [0]:
# Number of distinct PhraseId values per Sentiment class.
train.groupby('Sentiment')['PhraseId'].nunique()

## Load test data

In [0]:
# Read the tab-separated test file, then show its shape and first rows.
test=pd.read_csv('/content/test.tsv',sep='\t')
print(test.shape)
test.head()

## Load the sample submission file

In [0]:
# Read the sample submission CSV and preview its first rows.
sub=pd.read_csv('/content/sampleSubmission.csv')
sub.head()

## Create sentiment column in the test dataset

In [0]:
# Give every test row a placeholder Sentiment of -999; later cells use this
# sentinel to tell test rows from training rows after the two are combined.
test['Sentiment']=-999
test.head()

## Create a dataframe to store both train and test data

In [0]:
# Stack train and test rows into one frame; ignore_index=True renumbers the
# rows instead of keeping each frame's own index.
df=pd.concat([train,
              test], ignore_index=True)
print(df.shape)
df.tail()

In [0]:
# The separate frames are no longer needed now that df holds both; drop the
# names and run a garbage-collection pass.
del train,test
gc.collect()

## Pre-process the movie review string

In [0]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()
from string import punctuation
import re

### Download NLTK datasets

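The pre-processing below needs the NLTK `punkt` tokenizer models (for `word_tokenize`) and the `wordnet` corpus (for `WordNetLemmatizer`). Downloading `all` also works, but fetches every NLTK package.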

In [0]:
# Fetch only what this notebook uses, by name, so the cell runs unattended
# (calling nltk.download() with no argument opens the interactive downloader).
# 'punkt' backs word_tokenize and 'wordnet' backs WordNetLemmatizer.
# NOTE(review): newer NLTK releases may need extra packages (e.g. 'punkt_tab',
# 'omw-1.4') - confirm against the installed version.
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

In [0]:
def clean_review(review_col):
    """Normalize raw review phrases ahead of tokenization.

    Each entry is converted to ``str``, every character that is not an ASCII
    letter is replaced by a space, the text is lower-cased, split with
    ``word_tokenize`` and every token is lemmatized with the module-level
    ``lemma`` lemmatizer.

    Args:
        review_col: Iterable of phrases (list, NumPy array, pandas Series, ...).

    Returns:
        A list with one cleaned string per input phrase; its tokens are
        joined by single spaces.
    """
    non_letters = re.compile('[^a-zA-Z]')
    review_corpus = []
    # Iterate over the values themselves rather than indexing by position:
    # review_col[i] is a label lookup on a pandas Series and fails when its
    # index is not 0..n-1.
    for raw_review in review_col:
        letters_only = non_letters.sub(' ', str(raw_review))
        tokens = word_tokenize(letters_only.lower())
        review_corpus.append(' '.join(lemma.lemmatize(token) for token in tokens))
    return review_corpus

In [0]:
# Store the cleaned form of every phrase (train and test rows alike) in a new
# clean_review column.
df['clean_review']=clean_review(df.Phrase.values)
df.head()

In [0]:
# Training rows are the ones whose Sentiment is not the -999 placeholder.
df_train=df[df.Sentiment!=-999]
print (df_train.shape)
df_train.head()

In [0]:
# Test rows carry the -999 placeholder; the placeholder column itself is not
# needed afterwards. drop() returns a new frame here, so the filtered slice of
# df is never modified in place (which pandas flags as chained assignment).
df_test = df[df.Sentiment == -999].drop('Sentiment', axis=1)
print(df_test.shape)
df_test.head()

In [0]:
# df has been split into df_train / df_test; drop the name and run a
# garbage-collection pass.
del df
gc.collect()

In [0]:
# Pull the cleaned text of each split and the training labels out of the
# frames as arrays.
train_text=df_train.clean_review.values
test_text=df_test.clean_review.values
target=df_train.Sentiment.values

## Convert labels to categorical variables

In [0]:
# One-hot encode the integer sentiment labels; y.shape[1] is later used as the
# number of classes.
y=to_categorical(target)
print(train_text.shape,target.shape,y.shape)

## Create train-validation split for training the model

In [0]:
# Hold out 20% of the labelled phrases for validation, stratified on the
# labels, with a fixed seed so the split is reproducible.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    train_text,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)
print(X_train_text.shape, y_train.shape)
print(X_val_text.shape, y_val.shape)

In [0]:
# Tokenize the whole training corpus in one pass and count its distinct
# tokens; the count is used to size the vocabulary.
all_words = word_tokenize(' '.join(X_train_text))
dist = FreqDist(all_words)
num_unique_word = len(dist)
num_unique_word

## Finding the maximum length of the review in the training corpus

In [0]:
# Token count of every training review; the largest one becomes the fixed
# sequence length used for padding.
# The loop variable is deliberately not named `text`: that would overwrite the
# `text` module imported from keras.preprocessing.
r_len = [len(word_tokenize(review)) for review in X_train_text]

MAX_REVIEW_LEN = np.max(r_len)
MAX_REVIEW_LEN

In [0]:
# Vocabulary size passed to Tokenizer(num_words=...) and Embedding(input_dim=...).
# Keras' Tokenizer numbers words from 1 and only keeps indices below
# num_words, while index 0 is the padding value, so one extra slot is needed
# to keep every counted word.
max_features = num_unique_word + 1
# Every sequence is padded/truncated to the longest training review.
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
# One output unit per one-hot label column.
num_classes = y.shape[1]
print ('Total number of sentiment classes: {} ...'.format(num_classes))

## Tokenize the input text

Tokenize using the [Keras text pre-processor](https://keras.io/preprocessing/text/). This class vectorizes a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector whose coefficient for each token can be binary, based on word count, or based on tf-idf.

In [0]:
# Build the word index from the training split only, then map both splits to
# integer sequences with that same index.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train, X_val = (
    tokenizer.texts_to_sequences(split_text)
    for split_text in (X_train_text, X_val_text)
)

## Padding the input text for a fixed input length

In [0]:
# Bring every sequence to the same length (max_words) so the splits become
# rectangular arrays.
X_train, X_val = (
    sequence.pad_sequences(token_ids, maxlen=max_words)
    for token_ids in (X_train, X_val)
)
print(X_train.shape, X_val.shape)

## The role of [embedding layer in a neural network](https://towardsdatascience.com/deep-learning-4-embedding-layers-f9a02d55ac12)




1.  One-hot encoded vectors are high-dimensional and sparse. Let’s assume that we are doing Natural Language Processing (NLP) and have a dictionary of 2000 words. This means that, when using one-hot encoding, each word will be represented by a vector containing 2000 integers. And 1999 of these integers are zeros. In a big dataset this approach is not computationally efficient.

2.   The vectors of each embedding get updated while training the neural network. The image at the top of the linked post shows how similarities between words can be found in a multi-dimensional space. This allows us to visualize relationships between words, but also between everything that can be turned into a vector through an embedding layer.

[Read more about keras embedding layer](https://keras.io/layers/embeddings/#embedding)




## Create a recurrent neural network model

In [0]:
def model_LSTM(vocab_size=None, n_classes=None, embedding_dim=100):
    """Build the (uncompiled) stacked-LSTM sentiment classifier.

    Layers: an embedding with ``mask_zero=True``, an LSTM of 64 units that
    returns the full sequence, an LSTM of 32 units that returns only its last
    output, a 4096-unit tanh dense layer and a softmax output layer.

    Args:
        vocab_size: Input dimension of the embedding. Defaults to the
            module-level ``max_features``.
        n_classes: Number of softmax outputs. Defaults to the module-level
            ``num_classes``.
        embedding_dim: Size of each embedding vector.

    Returns:
        A ``Sequential`` model; the caller is expected to compile it.
    """
    # Fall back to the notebook globals so existing model_LSTM() calls keep
    # working unchanged.
    if vocab_size is None:
        vocab_size = max_features
    if n_classes is None:
        n_classes = num_classes

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, mask_zero=True))
    # The first LSTM must emit the whole sequence for the second one to consume.
    model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
    model.add(LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
    model.add(Dense(4096, activation='tanh'))
    model.add(Dense(n_classes, activation='softmax'))
    return model

In [0]:
# Instantiate the LSTM classifier defined above (still uncompiled).
model = model_LSTM()

In [0]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked
# during training.
# NOTE(review): newer Keras releases name this argument `learning_rate` -
# confirm against the installed version.
adam = Adam(lr=0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [0]:
%%time
# Train on the padded training sequences and evaluate on the validation split
# after every epoch; the returned History object is kept in history1.
# (%%time has to stay on the first line of the cell.)
history1=model.fit(X_train, 
                    y_train, 
                    validation_data=(X_val, y_val),
                    epochs=epochs, 
                    batch_size=batch_size, 
                    verbose=1)

## Save the model weights

In [0]:
# Write the trained weights to WEIGHTS_FILE.
model.save_weights(WEIGHTS_FILE)

In [0]:
# Send the weights file from the Colab runtime to the browser as a download.
files.download(WEIGHTS_FILE)

## Load model weights from weights file

In [0]:
# Restore previously saved weights when the file exists. A missing file and a
# failed load are reported separately: a bare `except:` would label every
# error (including an interrupt) as "file not found".
if os.path.isfile(WEIGHTS_FILE):
  try:
    model.load_weights(WEIGHTS_FILE)
  except Exception as err:
    # Stay best-effort (no crash), but say what actually went wrong.
    print ('Failed to load model weights from: {} ({}) ...'.format(WEIGHTS_FILE, err))
  else:
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
else:
  print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))

## Running model inference on the test data

In [0]:
# Encode the cleaned test phrases with the fitted tokenizer and pad them to
# the same length as the training sequences.
X_test = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_text),
    maxlen=max_words,
)

In [0]:
# Classify the test phrase at row 1: add a leading batch axis so the model
# receives a batch of one, then print the index of the most probable class.
input_sequence = np.expand_dims(X_test[1], axis=0)
y_pred_LSTM = model.predict(input_sequence, verbose=1)
print ('Input string: {} ...'.format(test_text[1]))
print (np.argmax(y_pred_LSTM))

## Run inference for custom user input

In [0]:
# Classify a hand-written review.
input_string = ['This movie was horrible']
# Pass the text through clean_review first: the tokenizer was fitted on
# cleaned (letters-only, lower-cased, lemmatized) phrases, so raw text could
# contain word forms it never saw.
input_text = tokenizer.texts_to_sequences(clean_review(input_string))
input_sequence = sequence.pad_sequences(input_text, maxlen=max_words)
y_pred_LSTM = model.predict(input_sequence, verbose=1)
print ('Input string: {} ...'.format(input_string))
# Index of the most probable sentiment class.
print (np.argmax(y_pred_LSTM))

In [0]:
# Classify a hand-written review.
input_string = ['This movie was great']
# Pass the text through clean_review first: the tokenizer was fitted on
# cleaned (letters-only, lower-cased, lemmatized) phrases, so raw text could
# contain word forms it never saw.
input_text = tokenizer.texts_to_sequences(clean_review(input_string))
input_sequence = sequence.pad_sequences(input_text, maxlen=max_words)
y_pred_LSTM = model.predict(input_sequence, verbose=1)
print ('Input string: {} ...'.format(input_string))
# Index of the most probable sentiment class.
print (np.argmax(y_pred_LSTM))