Name: Rajat Rathi

Roll No.: 19IE10041

## Assignment 2: Sentiment Classification


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import pickle
import regex as re
import numpy as np
import pandas as pd
!pip install contractions
import contractions
from nltk.corpus import stopwords

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Dense, LSTM, Conv1D, Embedding
import keras.backend as K
from sklearn.metrics import classification_report
import tensorflow
from tensorflow import keras

### Declaring constants

In [17]:
input_length = 200
vocab_length = 103040
Embedding_dimensions = 100

### Files Path

In [18]:
ImdbDatasetPath = "/content/drive/MyDrive/MyModel/IMDB Dataset.csv"
ModelPath = '/content/drive/MyDrive/MyModel'
TokenizerPath = "/content/drive/MyDrive/MyModel/MyTokenizer2.obj"

In [20]:
DATASET_COLUMNS = ["review", "sentiment"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv(ImdbDatasetPath, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
dataset = dataset.iloc[1:,:]
dataset.head()

Unnamed: 0,review,sentiment
1,One of the other reviewers has mentioned that ...,positive
2,A wonderful little production. <br /><br />The...,positive
3,I thought this was a wonderful way to spend ti...,positive
4,Basically there's a family where a little boy ...,negative
5,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Replace sentiments with numbers 

In [21]:
dataset['sentiment'] = dataset['sentiment'].replace('positive', 1)
dataset['sentiment'] = dataset['sentiment'].replace('negative', 0)

### Preprocess Input

In [22]:
def preprocess_apply(review):
    review = review.lower()

    # replace <br /> (break) character with a white space
    review = re.sub(r'<br />', ' ', review)

    # replace punctuations with a white space and other symbols
    review = re.sub(r'[^\w\s]', ' ', review)

    # expand contractions
    review = contractions.fix(review)

    # replace multiple white spaces with a single white space
    review = re.sub(r'\s+', ' ', review)

    word_list = []
    stop_words = set(stopwords.words('english')) 
    for word in review.lower().split():
        if word not in stop_words and word != '':
            word_list.append(word)
    review = ' '.join(word_list)

    return review

In [23]:
dataset['processed_review'] = dataset.review.apply(preprocess_apply)

### Train-Valid-Test dataset split

In [24]:
X_data = np.array(dataset['processed_review'])
y_data = np.array(dataset['sentiment'])

X_train, X_rem, y_train, y_rem = train_test_split(X_data, y_data, train_size = 0.8)
X_dev, X_test, y_dev, y_test = train_test_split(X_rem, y_rem, test_size = 0.5)

### Preparing data for word2vec

In [25]:
Word2vec_train_data = list(map(lambda x: x.split(), X_train))

### Word2Vec Model

In [26]:
word2vec_model = Word2Vec(Word2vec_train_data, size=Embedding_dimensions, workers=8, min_count=10)

### Tokenize

In [27]:
tokenizer = Tokenizer(oov_token="<oov>")
tokenizer.fit_on_texts(X_train)
tokenizer.num_words = vocab_length

### Padding Sequences

In [28]:
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=input_length)
X_dev = pad_sequences(tokenizer.texts_to_sequences(X_dev), maxlen=input_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=input_length)

### Embedding Matrix Weights

In [29]:
embedding_matrix = np.zeros((vocab_length, Embedding_dimensions))

for word, token in tokenizer.word_index.items():
    # print(word, token)
    if word2vec_model.wv.__contains__(word):
        embedding_matrix[token] = word2vec_model.wv.__getitem__(word)

In [30]:
# print(X_train[0], X_train[0].shape)

### Bi-Lstm Model

In [31]:
def getModel():
    embedding_layer = Embedding(input_dim = vocab_length,
                                output_dim = Embedding_dimensions,
                                weights=[embedding_matrix],
                                input_length=input_length,
                                trainable=False)

    model = Sequential([
        embedding_layer,
        Bidirectional(LSTM(512, dropout=0.3, return_sequences=True)),
        Conv1D(200, 1, activation='relu'),
        GlobalMaxPool1D(),
        Dense(25, activation='relu'),
        Dense(1, activation='sigmoid'),
    ],
    name="Sentiment_Model")
    return model

In [32]:
training_model = getModel()
training_model.summary()

Model: "Sentiment_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          10304000  
                                                                 
 bidirectional (Bidirectiona  (None, 200, 1024)        2510848   
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 200, 200)          205000    
                                                                 
 global_max_pooling1d (Globa  (None, 200)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 25)                5025      
                                                                 
 dense_1 (Dense)             (None, 1)             

In [33]:
# from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
# callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
#              EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

### F1 Score Calculator method

In [34]:
def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [35]:
training_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_metric])

In [36]:
# print(X_train.shape)

### Train the model

In [37]:
history = training_model.fit(
    X_train, y_train,
    batch_size=256,
    epochs=12,
    verbose=1,
)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [38]:
# import seaborn as sns
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# acc = history.history['accuracy']
# loss = history.history['loss']
# epochs = range(len(acc))

# plt.plot(epochs, acc, 'b', label='Training acc')
# # plt.plot(epochs, val_acc, 'r', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.legend()

# plt.figure()

# plt.plot(epochs, loss, 'b', label='Training loss')
# # plt.plot(epochs, val_loss, 'r', label='Validation loss')
# plt.title('Training and validation loss')
# plt.legend()

# plt.show()

In [39]:
def get_accuracy(y_test, y_pred):
    cnt = 0
    sz = len(y_test)
    for i in range(sz):
        if y_pred[i] == y_test[i]:
            cnt = cnt + 1
    return cnt/sz

### Accuracy and F1-Score on 'dev' dataset

In [40]:
y_pred = training_model.predict(X_dev)
y_pred = np.where(y_pred>=0.5, 1, 0)

print("The accuracy of validation dataset is:", get_accuracy(y_dev, y_pred)*100, "%")
print("The f1_score of validation dataset is:",classification_report(y_dev, y_pred, output_dict=True)['macro avg']['f1-score'])

The accuracy of validation dataset is: 89.56 %
The f1_score of validation dataset is: 0.895495646132402


### Saving Model on Google Drive

In [41]:
# Saving the model in MyModel.obj file
# filehandler = open("/content/drive/MyDrive/MyModel/MyModel2.obj","wb")
# pickle.dump(training_model, filehandler)
# filehandler.close()
training_model.save(ModelPath)
print("The model has been saved")



The model has been saved


### Saving Tokenizer on Google Drive

In [42]:
# Saving the tokenizer in MyTokenizer.obj file
filehandler = open(TokenizerPath, "wb")
pickle.dump(tokenizer, filehandler)
filehandler.close()

print("The tokenizer has been saved in MyTokenizer2.obj file")

The tokenizer has been saved in MyTokenizer2.obj file


### Loading Model and Tokenizer from Google Drive
> To test on 'test' dataset



In [43]:
# file = open("/content/drive/MyDrive/MyModel/MyModel2.obj",'rb')
# loaded_model = pickle.load(file)
# file.close()
loaded_model = keras.models.load_model(ModelPath, custom_objects = {'f1_metric':f1_metric})
print(loaded_model.summary())

file = open(TokenizerPath,'rb')
loadedtokenizer = pickle.load(file)
file.close()

Model: "Sentiment_Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          10304000  
                                                                 
 bidirectional (Bidirectiona  (None, 200, 1024)        2510848   
 l)                                                              
                                                                 
 conv1d (Conv1D)             (None, 200, 200)          205000    
                                                                 
 global_max_pooling1d (Globa  (None, 200)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 25)                5025      
                                                                 
 dense_1 (Dense)             (None, 1)             

In [44]:
y_pred = loaded_model.predict(X_test)
y_pred = np.where(y_pred>=0.5, 1, 0)

print("The accuracy of testing dataset is:", get_accuracy(y_test, y_pred)*100, "%")
print("The f1_score of testing dataset is:",classification_report(y_test, y_pred, output_dict=True)['macro avg']['f1-score'])

The accuracy of testing dataset is: 89.56 %
The f1_score of testing dataset is: 0.8955818061739078


# The model gave an accuracy of 89% and F1-Score of 0.89 on test dataset