[<center><h1>**Kaggle: Movie Review Prediction**</h1></center>](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews)

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

In [None]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [None]:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense,Dropout,Embedding,LSTM
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Sequential

#set random seed for the session and also for tensorflow that runs in background for keras
import random
import tensorflow as tf
# Seed every random number generator the notebook relies on so runs are
# repeatable: Python's `random`, NumPy's global generator (scikit-learn falls
# back to it when no random_state is passed) and TensorFlow, which backs Keras.
random.seed(123)
np.random.seed(123)
tf.random.set_seed(123)

In [None]:
# Locations of the zipped, tab-separated competition files.
train_path = "/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip"
test_path = "/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip"

# Load both splits into DataFrames and preview the first training rows.
train = pd.read_csv(train_path, sep='\t')
test = pd.read_csv(test_path, sep='\t')
train.head()

# Function for cleaning the reviews, tokenize and lemmatize them.

This function iterates over the phrases and, for each one, it will
    
        remove html content
        remove non-alphabetic characters
        tokenize the sentences
        lemmatize each word to its lemma
and then return the results in a list named reviews

**Tokenization**

    Tokenization is the process of breaking up the given text into 
    units called tokens.

    e.g.: Hello Friends, Welcome to the world of Natural 
    Language Processing

    The word tokens in this sentence are as follows

    ‘Hello’ ‘Friends’ ‘,’ ‘Welcome’ ‘to’ ‘the’ ‘world’ ‘of’ 
    ‘Natural’ ‘Language’ ‘Processing’

    Total Number of Tokens: - 11

# Stemming and Lemmatizing

**Stemming**

    Stemming is the process of reducing words to their root form, even 
    if the root has no dictionary meaning. For example, beautiful 
    and beautifully will both be stemmed to beauti, which has no
    meaning in the English dictionary.

**Lemmatisation**

    Lemmatisation is the process of reducing words to their lemma, 
    i.e. their dictionary form. It takes into account the meaning of the word in 
    the sentence. For example, beautiful and beautifully are lemmatised 
    to beautiful and beautifully respectively, without changing the 
    meaning of the words. But good, better and best are all lemmatised 
    to good, since these words share the same base meaning.


In [None]:
def clean_sentences(df):
    """Clean, tokenize and lemmatize every phrase in ``df['Phrase']``.

    Each phrase is stripped of HTML markup, has every non-alphabetic
    character replaced by a space, is lower-cased, split into word tokens
    and finally has each token lemmatized.

    Args:
        df: Object whose ``'Phrase'`` entry is an iterable of strings
            (in this notebook, a pandas DataFrame).

    Returns:
        A list with one entry per phrase, in input order; each entry is the
        list of lemmatized tokens for that phrase (possibly empty).
    """
    # Built once up front: neither object depends on the phrase being processed.
    lemmatizer = WordNetLemmatizer()
    non_alpha = re.compile(r"[^a-zA-Z]")

    reviews = []

    for sent in tqdm(df['Phrase']):
        # Remove HTML content. The parser is named explicitly so the result
        # does not depend on which optional parsers happen to be installed.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace every non-alphabetic character with a space.
        review_text = non_alpha.sub(" ", review_text)

        # Tokenize the lower-cased text.
        words = word_tokenize(review_text.lower())

        # Lemmatize each token and store the result for this phrase.
        reviews.append([lemmatizer.lemmatize(word) for word in words])

    return reviews

In [None]:
%%time
train_sentences = clean_sentences(train)
test_sentences = clean_sentences(test)
print(len(train_sentences))
print(len(test_sentences))

In [None]:
# Sentiment labels of the training phrases as a plain array.
target = train['Sentiment'].values
# One-hot encode the labels.
y_target = to_categorical(target)
# The number of classes is the number of columns in the one-hot matrix.
num_classes = y_target.shape[1]

In [None]:
# Hold out 20% of the training phrases for validation, stratified on the
# labels. random_state is pinned so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_sentences,
    y_target,
    test_size=0.2,
    stratify=y_target,
    random_state=123,
)

tqdm is used to show a progress bar, since these loops produce no verbose output of their own. tqdm can also be used during model fitting, in place of `verbose`, through a callback.

In [None]:
# Collect the training vocabulary and the length of the longest tokenized
# phrase in a single pass over the training split.
unique_words = set()
len_max = 0

for sent in tqdm(X_train):
    unique_words |= set(sent)
    len_max = max(len_max, len(sent))

# Vocabulary size, followed by the longest phrase length.
print(len(unique_words))
print("Max length of text is : ", len_max)

In [None]:
# Fit the word-to-index mapping on the training phrases only, then encode
# all three splits with that same mapping.
vocab_size = len(unique_words)
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

X_train, X_val, X_test = [
    tokenizer.texts_to_sequences(split)
    for split in (X_train, X_val, test_sentences)
]

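* Padding is done to equalize the lengths of all input reviews, because the LSTM network needs all inputs to have the same length.
* Therefore reviews shorter than the max length are extended with extra zeros. Note that `pad_sequences` adds these zeros at the beginning of each sequence by default (`padding='pre'`); pass `padding='post'` to add them at the end instead.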

In [None]:
# Bring every encoded phrase in each split to the common length len_max.
X_train, X_val, X_test = [
    sequence.pad_sequences(encoded, maxlen=len_max)
    for encoded in (X_train, X_val, X_test)
]

In [None]:
# Report the shape of each padded matrix.
for label, matrix in (
    ("X_training shape   : ", X_train),
    ("X_validation shape : ", X_val),
    ("X_testing shape    : ", X_test),
):
    print(label, matrix.shape)

A word embedding is a class of approaches for representing words and documents using a dense vector representation.

**Embedding Layer** requires that the input data be integer encoded, so that each word is represented by a unique integer. This data preparation step can be performed using the Tokenizer API also provided with Keras.

The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.

We can use 300 dimensions to represent over 1 million words, which is why most renowned organizations use 300 dimensions as the output dimension of their embeddings.

In [None]:
# Sentiment classifier: a trainable 300-dimensional embedding, two stacked
# LSTM layers, then a dense head ending in a softmax over the classes.
model = Sequential()
model.add(Embedding(len(unique_words), 300, input_length=len_max))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))
model.add(LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(num_classes, activation='softmax'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Stop training once validation accuracy has not improved by at least 0.001
# for 2 consecutive epochs.
# The monitored key must match the name Keras logs: the model is compiled with
# metrics=['accuracy'], so the validation metric is 'val_accuracy' (the same
# key the plotting cell reads from history.history). Under the previous
# 'val_acc' key the callback could not find its metric.
early_stopping = EarlyStopping(
    min_delta=0.001,
    mode='max',
    monitor='val_accuracy',
    patience=2,
)
callback = [early_stopping]

In [None]:
%%time
history = model.fit( X_train,
                    y_train, 
                    validation_data = (X_val, y_val),
                    epochs = 20, 
                    verbose = 1,
                    batch_size = 256, 
                    callbacks = callback)

In [None]:
import matplotlib.pyplot as plt
# The figure handle is taken before plt.show() and used for savefig afterwards.
fig1 = plt.gcf()

# Per-epoch accuracy curves recorded during training.
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
epoch_count = range(1, len(train_accuracy) + 1)

plt.plot(epoch_count, train_accuracy, 'r--')
plt.plot(epoch_count, val_accuracy, 'b-')
plt.legend(['Training Accuracy', 'Validation Accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.show()
fig1.savefig('moviereviewprediction_accuracy.png')

In [None]:
import matplotlib.pyplot as plt
# The figure handle is taken before plt.show() and used for savefig afterwards.
fig1 = plt.gcf()

# Per-epoch loss curves recorded during training.
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epoch_count = range(1, len(train_loss) + 1)

plt.plot(epoch_count, train_loss, 'r--')
plt.plot(epoch_count, val_loss, 'b-')
plt.legend(['Training Loss', 'Validation Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()
fig1.savefig('moviereviewprediction_loss.png', bbox_inches='tight')

In [None]:
# Keep the phrase identifiers of the test set for the submission file.
test_id = test.loc[:, 'PhraseId']

In [None]:
# Predict class probabilities and take the most likely class for each phrase.
# `Sequential.predict_classes` was removed from recent TensorFlow/Keras
# releases; argmax over the softmax output gives the same class labels.
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [None]:
# Pair each test phrase id with its predicted sentiment, write the
# submission CSV without the index column, and preview the first rows.
submission_columns = {'PhraseId': test_id, 'Sentiment': y_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('movie_review_prediction_submission.csv', index=False)
submission.head()