## Import Modules

In [1]:
import numpy as np
import pandas as pd
import time
import re

import tensorflow as tf
import tensorflow_hub as hub
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Load Train and Test Data

In [2]:
df_train = pd.read_csv(r"Datasets/imdb_master_train.csv")
df_train.head()

Unnamed: 0.1,Unnamed: 0,type,review,sentiment,Processed_Reviews
0,25000,train,Story of a man who has unnatural feelings for ...,0,story of man who ha unnatural feeling for pig ...
1,25001,train,Airport '77 starts as a brand new luxury 747 p...,0,airport 77 start a brand new luxury 747 plane ...
2,25002,train,This film lacked something I couldn't put my f...,0,this film lacked something couldn put my finge...
3,25003,train,"Sorry everyone,,, I know this is supposed to b...",0,sorry everyone know this is supposed to be an ...
4,25004,train,When I was little my parents took me along to ...,0,when wa little my parent took me along to the ...


In [3]:
df_test = pd.read_csv(r"Datasets/imdb_master_test.csv")
df_test.head()

Unnamed: 0.1,Unnamed: 0,type,review,sentiment,Processed_Reviews
0,0,test,Once again Mr. Costner has dragged out a movie...,0,once again mr costner ha dragged out movie for...
1,1,test,This is an example of why the majority of acti...,0,this is an example of why the majority of acti...
2,2,test,"First of all I hate those moronic rappers, who...",0,first of all hate those moronic rapper who cou...
3,3,test,Not even the Beatles could write songs everyon...,0,not even the beatles could write song everyone...
4,4,test,Brass pictures (movies is not a fitting word f...,0,brass picture movie is not fitting word for th...


## Format Dataframe

In [4]:
train_df = pd.DataFrame()
train_df["text"] = df_train["Processed_Reviews"]
train_df["label"] = df_train['sentiment']
train_df['text'] = train_df['text'].astype(str)

test_df = pd.DataFrame()
test_df["text"] = df_test["Processed_Reviews"]
test_df["label"] = df_test['sentiment']
test_df['text'] = test_df['text'].astype(str)

In [5]:
# Only take up to 100 words for memory

train_df['text'] = [' '.join(t.split()[0:100]) for t in train_df['text']]
test_df['text'] = [' '.join(t.split()[0:100]) for t in train_df['text']]

In [7]:
%%time

'''
Tokenizing the text
- num_words: the maximum number of words to keep
- oov_token: if given, it will be added to word_index and used to replace out-of-vocabulary words during text_to_sequence calls
'''
# We will keep only the top max_words number of words (high-frequency tokens) from the dataset.
# This will be used to define the fixed length of the feature vectors.
max_words = 20000 

tokenizer = Tokenizer(num_words = max_words, oov_token = '<OOV>')

# Fit the Tokenizer object on the training data.
# This updates internal vocabulary based on a list of tokenized texts.
tokenizer.fit_on_texts(df_train['Processed_Reviews'])

CPU times: user 4.7 s, sys: 29 ms, total: 4.73 s
Wall time: 4.73 s


In [8]:
'''
Full list of words are available through the "word_index" property of tokenizer.
It returns a dictionary of key-value pairs, in which each word is a key,
and its index is a value.

'''
word_index = tokenizer.word_index
print("Number of unique words (tokens): %d" % len(word_index))

# Print the index of the word "the"
print("\nIndex of the word 'the':", word_index.get("the"))

vocab_size = len(word_index) + 1
print("\nSize of vocabulary: ", vocab_size)

Number of unique words (tokens): 72262

Index of the word 'the': 2

Size of vocabulary:  72263


## Convert Sentence to Elmo Vectors

In [12]:
embed = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [13]:
y = list(train_df['label'])
x = list(train_df['text'])

le = preprocessing.LabelEncoder()
le.fit(y)

def encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def decode(le, one_hot):
    dec = np.argmax(one_hot, axis=1)
    return le.inverse_transform(dec)


x_enc = x
y_enc = encode(le, y)

In [14]:
y_test = list(test_df['label'])
x_test = list(test_df['text'])

le = preprocessing.LabelEncoder()
le.fit(y_test)

x_test_enc = x_test
y_test_enc = encode(le, y_test)

## Create Train and Test sets

In [15]:
x_train, x_val, y_train, y_val = train_test_split(np.asarray(x_enc), np.asarray(y_enc), test_size=0.2, random_state=42)
  
x_test = np.asarray(x_test_enc)
y_test = np.asarray(y_test_enc)

## Train Keras neural model with ELMO Embeddings

In [17]:
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K

def ELMoEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

input_text = Input(shape=(1,), dtype=tf.string)

embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)

dense = Dense(256, activation='relu')(embedding)

pred = Dense(2, activation='sigmoid')(dense)

model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

t0 = time.time()

history = model.fit(x_train, y_train, epochs=5, batch_size=20, verbose= True, validation_data= (x_val, y_val))

    
t1 = time.time()

W0107 15:32:44.874989 47023890714752 deprecation_wrapper.py:119] From /home/netthinker/rdeuja/.conda/envs/dev_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.

W0107 15:32:44.876102 47023890714752 deprecation_wrapper.py:119] From /home/netthinker/rdeuja/.conda/envs/dev_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:431: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.

W0107 15:32:45.149288 47023890714752 deprecation_wrapper.py:119] From /home/netthinker/rdeuja/.conda/envs/dev_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:438: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.



Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
model.save_weights('elmo-model.h5')

## Compute Training Time

In [21]:
def convertTime(seconds):
    # Function to Convert Seconds into Hours, Minutes and Seconds
    seconds = seconds % (24 * 3600) 
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    
    if(hour == 0):
        return "{0:2.0f} min {1:2.0f} s".format(minutes, seconds) 
    
    elif(hour == 0 and minutes == 0):
        return "{1:2.0f} s".format(seconds) 
    
    else:
        return "{0:2.0f} h {1:2.0f} min {2:2.0f} s".format(hour, minutes, seconds)

duration_Pretraining_sec = t1-t0
duration_Pretraining = convertTime(t1 - t0)

print("\nTraining Time: ", duration_Pretraining)


Training Time:  26 min 37 s


In [22]:
model.load_weights('./elmo-model.h5')  
    
predicts = model.predict(x_test)

y_test = decode(le, y_test)
y_preds = decode(le, predicts)

print("Accuracy of ELMO is:",accuracy_score(y_test,y_preds))

Accuracy of ELMO is: 0.81992


# Results

In [25]:
from sklearn.metrics import classification_report

print(metrics.confusion_matrix(y_test, y_preds))

print(classification_report(y_test, y_preds))

print(metrics.classification_report(y_test, y_preds))

[[ 9907  2593]
 [ 1909 10591]]
              precision    recall  f1-score   support

           0       0.84      0.79      0.81     12500
           1       0.80      0.85      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

              precision    recall  f1-score   support

           0       0.84      0.79      0.81     12500
           1       0.80      0.85      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

