In [None]:
#Importing the packages used throughout this notebook

# Data manipulation/analysis
import numpy as np
import keras 
import pandas as pd
import tensorflow as tf

# Text preprocessing
import re
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Modelling
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation,Flatten,Bidirectional,GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Seaborn defaults for the notebook's figures: white-grid style with the 'talk' context.
sns.set(style="whitegrid", context='talk')

# **Importing and understanding data**

In [None]:
# Location of the IMDB reviews CSV, named once so the notebook can easily be
# pointed at another copy of the file.
DATA_PATH = '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

# pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
data = pd.read_csv(DATA_PATH)

# Leaving head() as the cell's last expression lets Jupyter render the frame as
# a table instead of the plain-text dump that print() produces.
data.head()

In [None]:
# Summary statistics for the loaded columns.
data.describe()

#  **Preprocessing of text**

In [None]:
#Converting sentiment values to either 0 or 1
# Positive = 1 and negative = 0

def convert_sentiment(word):
    """Encode a sentiment label as an integer.

    Returns 1 when ``word`` is exactly the string 'positive' and 0 for any
    other value.
    """
    is_positive = (word == 'positive')
    return 1 if is_positive else 0

# Encode every label as 0/1 in a new column; the original 'sentiment' text is left untouched.
data['new_sentiment'] = data['sentiment'].map(convert_sentiment)

In [None]:
#Removing punctuation. We use Python's string.punctuation, which consists of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted.

    Characters are removed, not replaced, so words separated only by
    punctuation end up joined together.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Markup and links must be removed *before* punctuation is stripped:
# remove_punctuation deletes '<', '>', '/', ':' and '.', after which an HTML tag
# such as '<br />' has become the bare word 'br' and a URL no longer contains
# '://' or 'www.'. The dedicated URL/HTML cells further down run on text that is
# already punctuation-free, so they cannot catch either.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def _strip_markup(text):
    """Return ``text`` without HTML tags and without URLs.

    Tags are replaced by a space (rather than nothing) so that the words on
    either side of a tag are not glued together; URLs are replaced by a space
    for the same reason. Extra spaces are harmless because later steps split
    on whitespace.
    """
    visible_text = BeautifulSoup(text, "lxml").get_text(separator=" ")
    return _URL_PATTERN.sub(" ", visible_text)


data['without_punctuation'] = data['review'].apply(
    lambda text: remove_punctuation(_strip_markup(text))
)

In [None]:
#Removing stopwords
# NLTK's English stop-word list (name kept for any later use of the list).
stopword_list = nltk.corpus.stopwords.words('english')
# Set copy of the list: membership is tested once per word of every review, and
# a set lookup avoids scanning the whole list each time.
_stopword_set = frozenset(stopword_list)


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The input is converted with ``str()``, split on whitespace, filtered, and
    re-joined with single spaces.

    Matching is case-insensitive. This step runs before the reviews are
    lower-cased, so comparing the raw word against NLTK's lower-case list would
    keep capitalised stop words such as "The" or "And".
    """
    return " ".join(
        word for word in str(text).split() if word.lower() not in _stopword_set
    )


data["without_stop"] = data['without_punctuation'].apply(remove_stopwords)

In [None]:
#Stemming
stemmer = PorterStemmer()


def stem_words(text):
    """Return ``text`` with each whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)


data['stemmed'] = data['without_stop'].apply(stem_words)

In [None]:
#Removing URLs
def remove_url(text):
    """Delete URLs from ``text``.

    A match begins at ``http://``, ``https://`` or ``www.`` and extends up to
    the next whitespace character; it is replaced by the empty string.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# remove_url takes a single string, so it can be handed to apply directly.
data['no_url'] = data['stemmed'].apply(remove_url)

In [None]:
#Stripping HTML tags
def remove_html(text):
    """Parse ``text`` with BeautifulSoup's lxml parser and return only its text content."""
    soup = BeautifulSoup(text, "lxml")
    return soup.get_text()


data['no_html'] = data['no_url'].apply(remove_html)

In [None]:
#Converting to lower case. 'final_reviews' is the cleaned text column that the
#train/test split further down reads from.
data['final_reviews'] = data['no_html'].str.lower()

In [None]:
#Finding max and min length of reviews to decide on a suitable length to implement padding
# The padding length chosen later is a number of tokens per review, so the
# reviews are measured in whitespace-separated words here; character counts
# would overstate the length needed several times over.
# The counts are computed once and reused for all three statistics.
review_word_counts = data['final_reviews'].str.split().str.len()

max_len = review_word_counts.max()
min_len = review_word_counts.min()
mean_len = review_word_counts.mean()

print(max_len)
print(min_len)
print(mean_len)


# **Splitting data into the train and test datasets**

In [None]:
#Counting how many reviews carry each label (1 = positive, 0 = negative) to check the class balance
data['new_sentiment'].value_counts()

In [None]:
#Since we have a balanced dataset, we can proceed to split the dataset with 80% of data in the train dataset and 20% of data in the test dataset.
# Derive the split point from the data instead of hard-coding it, so the 80/20
# ratio holds for any number of rows (for 50,000 rows this is 40000, the value
# that used to be written inline). Integer arithmetic avoids float rounding.
train_size = len(data) * 8 // 10

# .iloc makes it explicit that the split is by row position, not index label.
Review_train = data['final_reviews'].iloc[:train_size]
S_train = data['new_sentiment'].iloc[:train_size]

Review_test = data['final_reviews'].iloc[train_size:]
S_test = data['new_sentiment'].iloc[train_size:]

# **Processing text to be inputted into a model**

In [None]:
#Tokenization 
#We also specify the max number of words in the dictionary and a token to represent words that are out of the vocabulary/dictionary (OOV)

vocab_size = 4000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

# Build the vocabulary from the training reviews only. Fitting on the test
# reviews as well would leak test-set word statistics into the model's input
# encoding; words that appear only in the test set are meant to fall back to
# the "<OOV>" token, which is what an unseen review would get in practice.
tokenizer.fit_on_texts(Review_train)

In [None]:
#Representing each review in terms of the numbers that represent each word in it
# Encode both splits with the same fitted tokenizer, train first and then test.
R_train_input, R_test_input = (
    tokenizer.texts_to_sequences(reviews)
    for reviews in (Review_train, Review_test)
)

In [None]:
#Inserting padding for sequences
# Every review is brought to the same length so the splits become rectangular
# arrays; shorter sequences are padded at the end ('post').
maxlen = 700
pad_options = dict(maxlen=maxlen, padding='post')

R_train_input = pad_sequences(R_train_input, **pad_options)
R_test_input = pad_sequences(R_test_input, **pad_options)

In [None]:
#Converting the data column into an array to make further implementations easier
# Convert inputs and labels of both splits to NumPy arrays in one pass.
# The right-hand tuple is built before any name is rebound, so S_train and
# S_test are read in their pre-conversion form.
R_train, S_train, R_test, S_test = (
    np.array(values)
    for values in (R_train_input, S_train, R_test_input, S_test)
)

# **Creating the model**

* Simple model 
* LSTM
* Bidirectional LSTM


In [None]:
#Simple model
# Baseline classifier: embed each token, flatten the padded review into one
# vector, and classify it through a single hidden dense layer.
embedding_dim = 32

simple_layers = [
    Embedding(vocab_size, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # one sigmoid unit for the 0/1 sentiment label
]
model = keras.Sequential(simple_layers)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the simple model, holding out 10% of the training arrays for validation.
num_epochs = 20
model.fit(
    R_train,
    S_train,
    batch_size=64,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Score the simple model on the test arrays, which were not passed to fit().
model.evaluate(R_test, S_test, batch_size=64)

In [None]:
#Stacked bidirectional LSTM model
# Two stacked bidirectional LSTM layers on top of a token embedding.
embedding_dim = 32

bidi_lstm_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    # return_sequences=True passes the full output sequence on to the second
    # recurrent layer.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(embedding_dim, return_sequences=True)
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model_multiple_bidi_lstm = tf.keras.Sequential(bidi_lstm_layers)

model_multiple_bidi_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_multiple_bidi_lstm.summary()

In [None]:
# Train the bidirectional LSTM model, holding out 10% of the training arrays for validation.
num_epochs = 5
model_multiple_bidi_lstm.fit(
    R_train,
    S_train,
    batch_size=64,
    epochs=num_epochs,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Score the bidirectional LSTM model on the same test arrays as the simple model.
model_multiple_bidi_lstm.evaluate(R_test, S_test, batch_size=64)