# Import Packages

In [None]:
import nltk

# 'omw-1.4' was the only resource fetched originally (presumably for the
# imported WordNetLemmatizer -- confirm). word_tokenize, used further down,
# additionally needs the Punkt sentence-tokenizer models: 'punkt' for older
# NLTK releases and 'punkt_tab' for newer ones. Without them a fresh
# environment fails with LookupError at the tokenization cell.
for resource in ('omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow import keras 
from tensorflow.keras.preprocessing.sequence import pad_sequences  
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, LSTM, Embedding, Bidirectional, BatchNormalization
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.utils import shuffle, resample

# Load File

In [None]:
# The competition files are tab-separated. The separator is spelled "\t"
# rather than embedding a raw tab character in the string, which is invisible
# and is silently turned into spaces by many editors/formatters.
train = pd.read_csv("../input/sentiment-analysis-on-movie-reviews/train.tsv.zip", sep="\t")
test = pd.read_csv("../input/sentiment-analysis-on-movie-reviews/test.tsv.zip", sep="\t")
train.head()

In [None]:
# Dimensions of the train and test frames, shown together as a pair
(train.shape, test.shape)

In [None]:
# Column dtypes of the training frame (displayed as the cell's output)
train.dtypes

# Exploratory Data Analysis

In [None]:
# Target column: the sentiment class of every training phrase
labels = train["Sentiment"]

In [None]:
# Class distribution of the training labels.
# Bar positions and bar heights are taken from the SAME Series:
# value_counts() orders by frequency while unique() orders by first
# appearance, so pairing those two (as before) attached counts to the
# wrong sentiment class.
sentiment_counts = train.Sentiment.value_counts().sort_index()
plt.figure(figsize=(13,5))
plt.bar(x=sentiment_counts.index, height=sentiment_counts.values)
plt.show()

# Data Preprocessing

In [None]:
# Pull the Phrase column out as a plain array of strings so the cleanup
# steps below can work on it element by element
raw_phrases = train["Phrase"].values
raw_phrases[0]

In [None]:
# Strip IP-address-like text: four groups of a 0-255 number, each followed
# by a dot or the end of the string. (No URL handling happens here.)
ip_like = re.compile(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}')
phrases_ip_remove = [ip_like.sub('', phrase) for phrase in raw_phrases]
phrases_ip_remove[0]

In [None]:
# Keep only ASCII letters, spaces and apostrophes; every other run of
# characters (digits, punctuation, symbols) is deleted
disallowed_run = re.compile('[^A-Za-z \']+')
phrases_spl_remove = [disallowed_run.sub('', phrase) for phrase in phrases_ip_remove]
phrases_spl_remove[0]

In [None]:
# Lower-case every phrase so that casing variants collapse to one form
phrases_lower = list(map(str.lower, phrases_spl_remove))
phrases_lower[0]

In [None]:
# Split each phrase into word tokens with NLTK's word_tokenize
phrases_word_tok0 = list(map(word_tokenize, phrases_lower))
phrases_word_tok0[0]

In [None]:
# Drop tokens that are punctuation (in practice the lone apostrophe left
# over from the earlier cleanup). `punctuation` is a string, so `in` is a
# substring test against it.
phrases_word_tok = [
    [token for token in tokens if token not in punctuation]
    for tokens in phrases_word_tok0
]
phrases_word_tok[0]

In [None]:
# Re-assemble each token list into a single space-separated string
cleaned_phrases_train = [" ".join(tokens) for tokens in phrases_word_tok]
cleaned_phrases_train[0]

In [None]:
# Hold out 10% of the cleaned phrases (with their labels) as a validation
# set; the fixed random_state makes the split repeatable
x_train, x_val, y_train, y_val = train_test_split(
    cleaned_phrases_train,
    labels,
    random_state=10,
    test_size=0.10,
)


In [None]:
# Turn the text splits into fixed-length integer sequences.
# The vocabulary (capped via num_words=20000) is fitted on the training
# split only and then reused, unchanged, for the validation split.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(x_train)

# Every sequence is padded/truncated to 200 entries
sequences = tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(sequences, maxlen=200)

sequences = tokenizer.texts_to_sequences(x_val)
x_val = pad_sequences(sequences, maxlen=200)

# Full label column as a plain array (y_train / y_val were already split off above)
labels = train.Sentiment.values

(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

In [None]:
#Function for Preprocessing for test data
def Preprocess(test):
    """Clean the ``Phrase`` column of *test* and encode it as padded sequences.

    Mirrors, step by step, the cleanup applied to the training phrases in the
    cells above: strip IP-address-like number groups, delete every character
    that is not an ASCII letter, space or apostrophe, lower-case, tokenize
    with ``word_tokenize``, drop punctuation tokens, and re-join the tokens
    with single spaces. The cleaned phrases are then encoded with the
    module-level ``tokenizer`` (expected to be fitted before this is called)
    and padded to length 200.

    Args:
        test: object with a ``Phrase`` attribute whose ``.values`` yields
            strings (here: the test DataFrame).

    Returns:
        The output of ``pad_sequences(..., maxlen=200)`` for the cleaned
        phrases, one row per input phrase.
    """
    raw_phrases = test.Phrase.values

    # Remove IP-address-like text (four 0-255 groups, each followed by '.'
    # or the end of the string)
    phrases_ip_remove = [
        re.sub(r'((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.|$)){4}', '', text)
        for text in raw_phrases
    ]

    # Keep only letters, spaces and apostrophes
    phrases_spl_remove = [re.sub('[^A-Za-z \']+', '', text) for text in phrases_ip_remove]

    # Normalize the casing
    phrases_lower = [text.lower() for text in phrases_spl_remove]

    # Tokenize using word_tokenize from NLTK
    phrases_word_tok0 = [word_tokenize(text) for text in phrases_lower]

    # Drop punctuation tokens; `punctuation` is a string, so this is a
    # substring test -- kept as-is to stay identical to the training cleanup
    phrases_word_tok = [
        [word for word in words if word not in punctuation]
        for words in phrases_word_tok0
    ]

    # Re-join tokens into one space-separated string per phrase
    cleaned_phrases = [" ".join(words) for words in phrases_word_tok]

    # Encode with the already-fitted tokenizer and pad to the training length.
    # The bare `[0]` peeks that used to sit between the steps were removed:
    # they had no effect except raising IndexError on an empty input.
    sequences = tokenizer.texts_to_sequences(cleaned_phrases)
    return pad_sequences(sequences, maxlen=200)

# Model Building and Evaluation

In [None]:
# Network: embedding -> bidirectional LSTM -> two small dense blocks
# (each with batch-norm and dropout) -> 5-way softmax.
# The stray trailing commas that followed the first two add() calls were
# removed; they only wrapped the statement in a discarded one-element tuple.
model = Sequential()

# 20000 matches the Tokenizer's num_words; x_train.shape[1] is the padded
# sequence length
model.add(Embedding(20000, 500, input_length=x_train.shape[1]))
model.add(Bidirectional(LSTM(64, activation='relu')))

# NOTE(review): the recurrent layer above does not set return_sequences, so
# its output is presumably already 2-D and this Flatten is likely a no-op --
# confirm before removing
model.add(Flatten())

model.add(Dense(16, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(16, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# One output unit per sentiment class
model.add(Dense(5, activation='softmax'))
model.summary()

In [None]:
# Multi-class setup: one-hot targets with categorical cross-entropy,
# Adam optimizer, accuracy reported during training
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, validating on the held-out split after each epoch.
# num_classes is pinned to 5 so the one-hot targets always match the 5-unit
# softmax output; without it to_categorical infers the width from the largest
# label present, which breaks if a split happens to lack the highest class.
history = model.fit(
    x_train,
    to_categorical(y_train, num_classes=5),
    epochs=5,
    batch_size=289,
    validation_data=(x_val, to_categorical(y_val, num_classes=5)),
)

In [None]:
# Plot training vs. validation accuracy per epoch.
# history.history['val_accuracy'] comes from the validation_data passed to
# fit(), not from the test set, so the curve is labelled accordingly.
plt.figure(figsize=(13,5))
plt.title("Accuracy vs Epochs")
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend(loc='best')
plt.show()

In [None]:
# Score the trained model on the validation split.
# num_classes=5 keeps the one-hot width equal to the 5-unit output layer even
# if the highest label is absent from y_val.
model.evaluate(x_val, to_categorical(y_val, num_classes=5))

In [None]:
# Encode the test phrases with the same pipeline as the training data, then
# pick the highest-scoring class for every row
test_data = Preprocess(test)
class_scores = model.predict(test_data)
test['Sentiment'] = class_scores.argmax(axis=1)

# Submit Predictions to Kaggle

In [None]:
# Write only the id and the predicted class, without the frame's index
submission_columns = ['PhraseId', 'Sentiment']
test[submission_columns].to_csv("./Submission.csv", index=False)