# Question No.2. NLP Dataset
A Python notebook to build, train and evaluate a deep neural network on the IMDB-50K dataset.

# **1. Import Libraries/Dataset**

1a.Importing required libraries

In [None]:
#importing required libraries
import tensorflow as tf
print(tf.__version__)

import numpy as np
import tensorflow_hub as hub
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

from keras.optimizers import SGD
from tensorflow.keras import regularizers
from keras.callbacks import ModelCheckpoint 
from sklearn import metrics  
import time

import keras
import tensorflow_datasets as tfds


from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Conv1D, MaxPooling1D, LSTM
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, classification_report
import itertools
import helper
%matplotlib inline

from sklearn.metrics import confusion_matrix

1b. Import the dataset

In [None]:
# Load the IMDB reviews train and test splits as supervised (text, label) pairs.
train_data, test_data = tfds.load(
    name="imdb_reviews",
    split=['train', 'test'],
    batch_size=-1,
    as_supervised=True,
)

# Convert both splits to NumPy and unpack each into examples and labels.
(train_examples, train_labels), (test_examples, test_labels) = (
    tfds.as_numpy(train_data),
    tfds.as_numpy(test_data),
)

1c. Check the GPU available

In [None]:
# Report whether TensorFlow lists at least one physical GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

# **2. Data Visualization**

In [None]:
# List the distinct label values present in the training labels.
print("Categories:", np.unique(train_labels))

The output above shows that the dataset is labeled into two categories, 0 and 1.

In [None]:
# len() of each raw training review, then the mean and (rounded) standard deviation.
length = list(map(len, train_examples))
mean_length = np.mean(length)
std_length = np.std(length)
print("Average Review length:", mean_length)
print("Standard Deviation:", round(std_length))

The average review length in the training set is about 1325 characters (the length is measured with `len()` on the raw review, not in words), with a standard deviation of about 1003 characters, so the review lengths vary widely.

**2a** Print at least two movie reviews from each class of the dataset, for a sanity check that labels match
the text.

In [None]:
# Display the first 10 raw training reviews
train_examples[:10]




Checking if the review matches with the label

In [None]:
# Labels of the first 10 training examples, to compare against the reviews displayed above
train_labels[:10]


0 indicates negative sentiment, whereas 1 indicates positive sentiment. The reviews match the sentiment indicated by their labels.

In [None]:
# Number of entries in the training and test splits
n_train, n_test = len(train_examples), len(test_examples)
print(f"Training entries: {n_train}, test entries: {n_test}")

The data is split 50/50 between the training and test sets.


2b. Plot a bar graph of class distribution in dataset. Each bar depicts the number of reviews belonging to a particular sentiment. 

In [None]:
# Side-by-side count plots of the label distribution in the training and test sets.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
for ax, split_labels, split_name in zip(axs, (train_labels, test_labels), ('Training', 'Testing')):
    # Pass the values as x=...: recent seaborn releases treat the first positional
    # argument as `data`, so a positional array no longer produces the count plot.
    sns.countplot(x=split_labels.ravel(), ax=ax)
    ax.set_title(f'Distribution of {split_name} data Labels')
    ax.set_xlabel('Classes')
plt.show()

In [None]:
# Shapes of the features (X) and labels (y) of each split; every caption is paired
# with the array it names.
print("X_train shape is : ", train_examples.shape)
print("y_train shape is : ", train_labels.shape)
print("X_test shape  is : ", test_examples.shape)
print("y_test shape is : ", test_labels.shape)

# **3. Data Pre-processing**

The reviews (arrays of strings) must be converted into embedding vectors before being fed into the neural network. We can use a pre-trained text embedding as the first layer for text pre-processing.

3.b. We will use a model from TensorFlow Hub called google/tf2-preview/nnlm-en-dim128/1 for the same[ ~1M vocabulary size and 128 dimensions]

In [None]:
# URL of the TensorFlow Hub text-embedding model used to embed the sentences
model = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"

# Wrap the Hub model as a Keras layer: string input, 128-dimensional output,
# created with trainable=True.
hub_layer = hub.KerasLayer(
    model,
    input_shape=[],
    output_shape=[128],
    dtype=tf.string,
    trainable=True,
)

# **4. Model Building**

a. Add dense layers, specifying the number of units in each layer and the activation function used in the layer.

b. Add L2 regularization to all the layers.

c. Add one layer of dropout at the appropriate position and give reasons.

d. Choose the appropriate activation function for all the layers.

In [None]:
# Model configuration

# Create a model object
SeqModel = tf.keras.Sequential()

# Layer 1 = input layer - TensorFlow Hub layer. This layer uses a pre-trained Saved Model
# to map a sentence into its embedding vector.
SeqModel.add(hub_layer)

# Layer 2 = hidden layer with L2 kernel regularization.
# Dense is taken from tensorflow.keras.layers (imported as `layers`) so every layer comes
# from the same Keras implementation as tf.keras.Sequential.
SeqModel.add(layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Dropout of 30% applied to the output of layer 2 (i.e. the input of layer 3)
SeqModel.add(layers.Dropout(0.3))

# Layer 3 = hidden layer with L2 kernel regularization
SeqModel.add(layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.005)))

# Layer 4 = output layer: a single unit with no activation, so the model outputs a raw
# score (the loss used at compile time is created with from_logits=True)
SeqModel.add(layers.Dense(1))



e. Print the model summary.

In [None]:
# Print the model summary
SeqModel.summary()

# **5.Model Compilation**

5a. Compile the model with the appropriate loss function.

5b. Use an appropriate optimizer. Give reasons for the choice of learning rate and its value.

5c. Use accuracy as metric.

In [None]:
def ModelCompile(optimi, loss, Model=None):
  """Compile a Keras model with the given optimizer and loss, tracking accuracy.

  Args:
    optimi: Optimizer (name or instance) forwarded to ``compile``.
    loss: Loss forwarded to ``compile``.
    Model: Model to compile. Defaults to the notebook-level ``SeqModel`` so the
      existing two-argument calls keep their behavior.
  """
  target = SeqModel if Model is None else Model
  # threshold=0.0: outputs above zero are counted as class 1 by the accuracy metric
  accuracy = tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')
  target.compile(optimizer=optimi, loss=loss, metrics=[accuracy])

In [None]:
# Optimizer and loss used to compile SeqModel
optimizer = 'adam'
# from_logits=True: the output layer of SeqModel is Dense(1) with no activation
bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
ModelCompile(optimizer,bce)

# **6. MODEL TRAINING**

6a. Train the model for an appropriate number of epochs (print the train and validation accuracy/loss for each epoch). Use the appropriate batch size

In [None]:
# Hold out the first 10000 training examples (and their labels) for validation;
# the remainder is used for fitting.
x_val, partial_x_train = train_examples[:10000], train_examples[10000:]
y_val, partial_y_train = train_labels[:10000], train_labels[10000:]

In [None]:
# Train the model and print the total time taken for training
def trainmodel(Model):
  """Fit ``Model`` on the partial training set and report the elapsed time.

  Trains for up to 20 epochs with batch size 512, validating on
  ``(x_val, y_val)``. An EarlyStopping callback monitoring ``val_loss`` is
  passed to ``fit`` so training stops once the validation loss stops improving.

  Args:
    Model: A compiled Keras model.

  Returns:
    The value returned by ``Model.fit``.
  """
  es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
  mlp_start = time.time()
  history = Model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    callbacks=[es],  # previously created but never handed to fit
                    verbose=2)
  mlp_took = time.time() - mlp_start
  print("Total time taken: %s seconds"%(mlp_took))
  return history

In [None]:
# Train SeqModel; h keeps the value returned by trainmodel for the plots further down
h = trainmodel(SeqModel)

# **7. Model Evaluation**

7a. Print the final test/validation loss and accuracy

In [None]:
# Evaluate a model and print the resulting loss and accuracy
def TestPrintResults(data, labels, Model):
  """Run ``Model.evaluate`` on ``(data, labels)`` and print loss and accuracy.

  The first element of the evaluation result is printed as the loss and the
  second, multiplied by 100, as the accuracy percentage.
  """
  results = Model.evaluate(data, labels, verbose=False)
  loss_value = results[0]
  accuracy_pct = 100*results[1]
  print('Results - Loss: {} - Accuracy: {}%'.format(loss_value, accuracy_pct))

In [None]:
#Test the model after training on the test dataset
print("Test ")
# Evaluate on test_examples (the NumPy review array). test_data is the raw
# (examples, labels) pair returned by tfds.load and is not a valid feature input.
TestPrintResults(test_examples,test_labels,SeqModel)


In [None]:
# Evaluate the trained model on the held-out validation split (x_val, y_val)
print("Validation ")
TestPrintResults(x_val,y_val,SeqModel)

In [None]:
# Plot the training history: one figure for accuracy, one for loss
def plotLosses(h):
  """Plot train vs. validation curves stored in ``h.history``.

  Draws two separate figures, accuracy first and then loss, each with the
  training curve and the matching ``val_``-prefixed validation curve.
  """
  curves = (
      ('accuracy', 'Accuracy', 'Plot of Accuracy'),
      ('loss', 'Loss', 'Plot of Loss'),
  )
  for metric, y_label, title in curves:
    plt.plot(h.history[metric], label='train')
    plt.plot(h.history['val_' + metric], label='validation')
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()


In [None]:
# Plot the accuracy and loss curves recorded while training SeqModel
plotLosses(h)

7b. Print confusion matrix and classification report for the validation dataset. 

In [None]:
# Print Confusion Matrix
def printconfusionmatrix(Model):
  """Predict classes for the test reviews and print the confusion matrix.

  Args:
    Model: A trained Keras model whose single output is a raw score (logit).

  Returns:
    1-D int32 array of predicted class labels (0 or 1) for ``test_examples``.
  """
  # Sequential.predict_classes is not available in current TensorFlow releases, so the
  # classes are derived from predict(). The models here are trained with
  # from_logits=True and scored with BinaryAccuracy(threshold=0.0), so the decision
  # boundary is a score of 0 (not 0.5, which applies to probabilities).
  logits = Model.predict(test_examples, verbose=0)
  predictions = (np.asarray(logits) > 0).astype("int32").ravel()
  cm = confusion_matrix(test_labels, predictions)
  print('Confusion matrix: \n \n', cm)
  return predictions

In [None]:
# Classification report for the test-set predictions
def classificationreport(predictions):
  """Print the scikit-learn classification report of ``predictions`` vs. ``test_labels``.

  Args:
    predictions: Predicted class labels (0 or 1) for the test reviews.
  """
  # target_names must follow the sorted label order: 0 -> Negative, 1 -> Positive
  creport=classification_report(test_labels,predictions,target_names=['Negative','Positive'])
  print("\n Classification Report \n",creport)

In [None]:
# Confusion matrix and classification report for SeqModel
predictions = printconfusionmatrix(SeqModel)
classificationreport(predictions)


Summary for the
best and worst performing class and the overall trend
The first row of the confusion matrix corresponds to reviews whose actual sentiment value in the test set is 0 (scikit-learn orders the rows by sorted label value). Of the 25,000 test reviews, 12,500 have sentiment value 0, and the classifier correctly predicted 10976 of them as 0.
That is, for 10976 reviews the actual sentiment value was 0 and the classifier also predicted 0. However, for 1524 reviews whose actual label was 0, the classifier predicted 1.

The second row of the confusion matrix corresponds to reviews whose actual sentiment value in the test set is 1. Of the 25,000 test reviews, 12,500 have sentiment value 1, and the classifier correctly predicted 10333 of them as 1.
That is, for 10333 reviews the actual sentiment value was 1 and the classifier also predicted 1. However, for 2167 reviews whose actual label was 1, the classifier predicted 0. Comparatively, the sentiment of positive reviews is more difficult to predict.


# **Hyperparameter Tuning**

1. Network Depth: Change the number of hidden layers and hidden units for each layer

In [None]:

# Create a model object
SeqModel2 = tf.keras.Sequential()

# Layer 1 = input layer - a fresh TensorFlow Hub embedding layer built from the same URL.
# Reusing `hub_layer` would share its trainable weights with SeqModel (already
# fine-tuned there), so the models would not be compared from the same starting point.
SeqModel2.add(hub.KerasLayer(model, output_shape=[128], input_shape=[],
                             dtype=tf.string, trainable=True))
SeqModel2.add(layers.Dropout(0.3))

# Layer 2 = output layer: a single unit with no activation and L2 kernel regularization
# (this variant has no hidden Dense layers)
SeqModel2.add(layers.Dense(1, kernel_regularizer=regularizers.l2(0.005)))


In [None]:
# Compile SeqModel2 with the same optimizer, loss and metric settings as SeqModel,
# then train it and plot its history
SeqModel2.compile(optimizer="adam",loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=[tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])
h1 = trainmodel(SeqModel2)
plotLosses(h1)

In [None]:
#Test the model after training on the Test dataset
print("Test ")
# Evaluate on test_examples (the NumPy review array). test_data is the raw
# (examples, labels) pair returned by tfds.load and is not a valid feature input.
TestPrintResults(test_examples,test_labels,SeqModel2)
#Test the model after training on the validation dataset
print("\nValidation ")
TestPrintResults(x_val,y_val,SeqModel2)


In [None]:
# Confusion matrix and classification report for SeqModel2
predictions = printconfusionmatrix(SeqModel2)
classificationreport(predictions)

8.2. Regularization: Train a model without regularization

In [None]:
# Create a model object
SeqModel3 = tf.keras.Sequential()

# Layer 1 = input layer - a fresh TensorFlow Hub embedding layer built from the same URL.
# Reusing `hub_layer` would share its trainable weights with the models trained earlier.
SeqModel3.add(hub.KerasLayer(model, output_shape=[128], input_shape=[],
                             dtype=tf.string, trainable=True))

# Layer 2 = output layer: a single unit with no regularizer and no dropout.
# No activation is applied because the loss is created with from_logits=True and the
# accuracy metric thresholds at 0.0; a ReLU here would clamp every negative score to 0.
SeqModel3.add(layers.Dense(1))

In [None]:
# Compile SeqModel3 with the same optimizer, loss and metric settings as SeqModel,
# then train it and plot its history
SeqModel3.compile(optimizer="adam",loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=[tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])
h3 = trainmodel(SeqModel3)
plotLosses(h3)

In [None]:
#Test the model after training on the Test dataset
print("Test ")
# Evaluate on test_examples (the NumPy review array). test_data is the raw
# (examples, labels) pair returned by tfds.load and is not a valid feature input.
TestPrintResults(test_examples,test_labels,SeqModel3)
#Test the model after training on the validation dataset
print("\nValidation ")
TestPrintResults(x_val,y_val,SeqModel3)

In [None]:
# Confusion matrix and classification report for SeqModel3
predictions = printconfusionmatrix(SeqModel3)
classificationreport(predictions)

We can see that there is not much difference in accuracy between SeqModel and SeqModel2, which has fewer layers. The model is able to predict with the same accuracy with fewer layers as well. Also, the time spent training the model decreases as the number of layers decreases.


Regularization prevents overfitting, as seen from the plots above. In addition, a model trained on a lower-dimensional dataset is computationally efficient.