In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the IMDB movie-review CSV into a DataFrame.
dataset = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [None]:
dataset.shape
# (number of rows, number of columns) of the loaded dataset

In [None]:
dataset.columns
# names of the columns in the dataset

In [None]:
dataset.head(n=10)
# preview of the first 10 rows of the dataset

In [None]:
import seaborn as sns

In [None]:
# bar chart of how many reviews carry each value of the 'sentiment' column
sns.countplot(x=dataset['sentiment'])

In [None]:
dataset['sentiment'].value_counts()
# number of reviews for each sentiment label

### (2) Stop word Removal and stemming

In [None]:
from nltk.corpus import stopwords
# for dealing with stop words

from nltk.stem import PorterStemmer
# PorterStemmer --> for stemming the text

import re
# re --> regular expression

In [None]:
# English stop words, stored as a set: clean_text tests membership once per
# token, and a set lookup is O(1) whereas scanning the list NLTK returns is not.
sw = set(stopwords.words('english'))
ps = PorterStemmer() # single stemmer instance reused for every token

In [None]:
def clean_text(sample):
    """Normalise one raw review into a string of stemmed, stop-word-free tokens.

    Steps: lower-case the text, replace HTML line-break tags with a space,
    replace every run of non-letters with a space, drop tokens found in the
    module-level ``sw`` collection, stem the remaining tokens with the
    module-level ``ps`` stemmer, and re-join them with single spaces.

    Parameters
    ----------
    sample : str
        Raw review text (may contain ``<br />`` tags).

    Returns
    -------
    str
        Space-separated stemmed tokens.
    """
    sample = sample.lower()

    # Replace any <br>, <br/> or <br /> tag with a space. Deleting only the
    # doubled "<br /><br />" glued the words on either side of the break
    # together, and every single "<br />" survived as a junk "br" token.
    sample = re.sub(r"<br\s*/?>", " ", sample)

    # Collapse every run of non-alphabetic characters into one space.
    sample = re.sub("[^a-zA-Z]+", " ", sample)

    # Tokenise on whitespace, drop stop words, stem what is left.
    tokens = [ps.stem(s) for s in sample.split() if s not in sw]

    return " ".join(tokens)

In [None]:
dataset['review'][0]
# first review as stored in the dataset: stop words present, no stemming applied

In [None]:
clean_text(dataset['review'][0])
# the same review after clean_text: stop words removed and remaining words stemmed

#### Applying the stopword removal and stemming over entire dataset

In [None]:
# Overwrites the 'review' column with its cleaned form. Re-running this cell
# passes already-cleaned text through clean_text again, so run it once per kernel.
dataset['review'] = dataset['review'].apply(clean_text)

In [None]:
dataset.head(n=10)
# first 10 rows after stop-word removal and stemming of the 'review' column

### (3) Creating the Vocab

In [None]:
max_features = 10000
# vocabulary size cap, passed to the Tokenizer as num_words
# words are ranked by how often they occur, so only the most frequent ones
# (not the first ones encountered) are used when reviews are encoded

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
tokenizer = Tokenizer(num_words=max_features, split=' ')
# num_words limits encoding to the most frequent words (Keras keeps the top num_words-1)

tokenizer.fit_on_texts(dataset['review'].values)
# builds the word index and word counts from every review; the num_words cap is
# applied later, when texts are converted to sequences, not during fitting.
# NOTE(review): this fits on all reviews, including the rows later used as the test set.

In [None]:
list(tokenizer.index_word.items())[:10]
# first 10 (index, word) entries only -- displaying the whole dict floods the output
# it is saying that the first word in vocab is 'movi'
# second word is 'film'

In [None]:
list(tokenizer.word_counts.items())[:10]
# first 10 (word, count) entries only -- displaying the whole dict floods the output
# e.g. 'one' has occurred 55435 times in the entire dataset

In [None]:
import pickle

In [None]:
# serialise the fitted tokenizer to 'tokenizer.pickle'
with open('tokenizer.pickle', 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

### (4) Creating X
### X --> rows = 50000
### X --> columns = not fixed; each row has as many entries as there are (in-vocabulary) words in that review

In [None]:
X = tokenizer.texts_to_sequences(dataset['review'].values)
# X --> one list per review, holding the tokenizer's integer index for each word of that review

In [None]:
len(X)
# number of encoded reviews (one per dataset row, i.e. 50000)

In [None]:
len(X[0])
# number of word indices in the first review (153 in the original run)

In [None]:
len(X[1])
# number of word indices in the second review (79 in the original run)

In [None]:
print(X[0])
# the word indices that encode the first review

### (5) XT --> taking first 25000 reviews into the training set
###       Xt --> taking last 25000 reviews into the testing set

In [None]:
XT = X[:25000] # XT --> first 25000 encoded reviews, used as the training set
Xt = X[25000:] # Xt --> remaining reviews, used as the testing set
# NOTE --> the split is purely positional (no shuffling): X is cut into i) XT and ii) Xt

In [None]:
# sizes of the training and testing parts, one per line
print(len(XT), len(Xt), sep="\n")

### (6) Vectorization of reviews, i.e., encoding each review as a vector of size 10000
#### (6.1) X_train and X_test

In [None]:
def vectorize_sentences(sentences, dim=10000, dtype=float):
    """Multi-hot encode tokenised reviews.

    Parameters
    ----------
    sentences : sequence of sequences of int
        One list of word indices per review. Every index must be smaller
        than ``dim``; a larger one raises ``IndexError``.
    dim : int, default 10000
        Number of columns of the output (the vocabulary size).
    dtype : data-type, default float
        Element type of the returned matrix. The default (float64) keeps the
        previous behaviour; a 25000 x 10000 float64 matrix takes about 2 GB,
        so pass e.g. ``np.float32`` or ``np.uint8`` to reduce memory.

    Returns
    -------
    numpy.ndarray of shape (len(sentences), dim)
        Entry ``[i, j]`` is 1 when word index ``j`` occurs in review ``i``
        and 0 otherwise.
    """
    outputs = np.zeros((len(sentences), dim), dtype=dtype)

    for i, idx in enumerate(sentences):
        # Fancy-index assignment sets every column listed in idx at once;
        # a repeated index just writes 1 again (presence, not a count).
        outputs[i, idx] = 1
    return outputs

In [None]:
# multi-hot encode the training and testing parts
X_train, X_test = vectorize_sentences(XT), vectorize_sentences(Xt)

In [None]:
# one row per review and 10000 columns per row;
# column j holds 1 if the word with index j occurs in that review, otherwise 0
print(X_train.shape, X_test.shape, sep="\n")

#### (6.2) Y_train and Y_test
#### Encoding positive as 1 and negative as 0

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# encoder used to turn the two sentiment strings into integer labels

In [None]:
# sentiment labels of all reviews, still as strings
y = dataset['sentiment'].values

In [None]:
# replace the string labels with integer codes (negative -> 0, positive -> 1)
y = le.fit_transform(y)

In [None]:
y[:100]
# encoded labels of the first 100 reviews
# 1 --> positive
# 0 --> negative

In [None]:
Y_train = y[:25000] # Y_train --> labels of the first 25000 reviews (training set)
Y_test = y[25000:] # Y_test --> labels of the remaining reviews (test set)

In [None]:
# number of training and test labels, one per line
print(len(Y_train), len(Y_test), sep="\n")

### (7) Splitting the training set (25000) into 
### i) train_new (20000)
### ii) validation (5000)

In [None]:
# first 20000 training rows are used for fitting, the rest for validation
x_train_new, x_val = X_train[:20000], X_train[20000:]
y_train_new, y_val = Y_train[:20000], Y_train[20000:]

In [None]:
# shapes of the validation / new-training features, then of the matching labels
print(x_val.shape, x_train_new.shape, y_val.shape, y_train_new.shape, sep="\n")

### (8) Defining the model architecture
#### Using Fully Connected / Dense layers with ReLU activation
#### 2 Hidden layers with 16 units each
#### 1 output layer with 1 unit (Sigmoid activation)

In [None]:
from keras import models
from keras.layers import Dense

In [None]:
# fully connected binary classifier: 10000 inputs -> 16 -> 16 -> 1
model = models.Sequential([
    Dense(16, activation='relu', input_shape=(10000,)),  # first hidden layer, 16 neurons
    Dense(16, activation='relu'),  # second hidden layer, 16 neurons
    Dense(1, activation='sigmoid'),  # single output neuron for binary classification
])

In [None]:
model.summary()
# the parameters of each layer are its weights plus its biases
# 160016 = (10000 * 16) + 16 --> 10000 input features x 16 neurons in H1, plus 16 biases
# 272 = (16 * 16) + 16 --> 16 outputs of H1 x 16 neurons in H2, plus 16 biases
# 17 = (16 * 1) + 1 --> 16 outputs of H2 x 1 output neuron, plus 1 bias

In [None]:
# Compile the model
# optimizer --> rmsprop, used to minimise the loss
# loss      --> binary cross entropy
# metrics   --> 'accuracy', reported after each epoch to judge how training is going
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 20 epochs in batches of 512, scoring the validation split after every epoch
hist = model.fit(
    x=x_train_new,
    y=y_train_new,
    batch_size=512,
    epochs=20,
    validation_data=(x_val, y_val),
)

### (9) Visualizing the validation accuracy and loss

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Matplotlib 3.6 renamed its bundled 'seaborn' style to 'seaborn-v0_8' and later
# dropped the old name; use the new name when present, else the legacy one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
h = hist.history
# dictionary of per-epoch values; the keys plotted below are
# 'loss', 'val_loss', 'accuracy' and 'val_accuracy'

#### (9.1) Loss vs Epoch

In [None]:
# validation vs. training loss per epoch
plt.plot(h['val_loss'], label="Validation Loss")
plt.plot(h['loss'], label="Training Loss")
plt.gca().set(xlabel="Epochs", ylabel="Loss")
plt.legend()
plt.show()
# the validation loss decreases for the first few epochs and then starts to
# increase again, which indicates overfitting

#### (9.2) Accuracy vs Epoch

In [None]:
# validation vs. training accuracy per epoch
plt.plot(h['val_accuracy'], label="Validation Acc")
plt.plot(h['accuracy'], label="Training Acc")
plt.gca().set(xlabel="Epochs", ylabel="Accuracy")
plt.legend()
plt.show()
# after some epochs the validation accuracy starts to decrease ---> overfitting
# so we will stop training after about 3-4 epochs (early stopping)

### (10) So now we will run our model for 4 epochs

In [None]:
# when epochs = 4
# Build and compile a fresh network first: calling fit() again on the existing
# `model` would continue from the weights left by the 20-epoch run, giving
# 24 epochs of training in total instead of an early stop after 4.
model = models.Sequential()
model.add(Dense(16, activation='relu', input_shape=(10000,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

hist = model.fit(
    x_train_new,
    y_train_new,
    epochs=4,
    batch_size=512,
    validation_data=(x_val, y_val)
)

In [None]:
# per-epoch loss/accuracy values recorded by the 4-epoch fit
h = hist.history

#### Loss vs Epoch

In [None]:
# validation vs. training loss per epoch
plt.plot(h['val_loss'], label="Validation Loss")
plt.plot(h['loss'], label="Training Loss")
plt.gca().set(xlabel="Epochs", ylabel="Loss")
plt.legend()
plt.show()

#### Accuracy vs Epoch

In [None]:
# validation vs. training accuracy per epoch
plt.plot(h['val_accuracy'], label="Validation Acc")
plt.plot(h['accuracy'], label="Training Acc")
plt.gca().set(xlabel="Epochs", ylabel="Accuracy")
plt.legend()
plt.show()

### (11) Analyzing the developed model

In [None]:
# element [1] of evaluate()'s result is the accuracy metric; report it as a percentage
x = round(model.evaluate(X_test,Y_test)[1]*100, 2)
print("Accuracy on test set = ",x,"%")

In [None]:
# accuracy over all 25000 training rows (this includes the 5000 rows held out for validation)
x = round(model.evaluate(X_train,Y_train)[1]*100, 2)
print("Accuracy on training set = ",x,"%")

In [None]:
result = model.predict(X_test)
# run the model on the test set
# each entry of result is the sigmoid output for one review, i.e. the predicted probability that it is positive (1)

In [None]:
print(result)
# e.g. in the original run the first review had a 0.04 predicted probability of being positive, and so on

In [None]:
result_train = model.predict(X_train)
# run the model on the training set; one predicted probability per review

In [None]:
print(result_train)
# e.g. in the original run the first review had a 0.97 predicted probability of being positive, and so on

# ------------------------------END----------------------------------