## Import Modules

In [202]:
import numpy as np
import pandas as pd
import time
import re

import tensorflow_hub as hub
import tensorflow as tf
import keras

from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Load Train and Test Data

In [203]:
# Read the SST5 training split; the path is relative to the notebook's working directory.
train_csv_path = r"../input/sst5-dataset/SST5_master_train.csv"
df_train = pd.read_csv(train_csv_path)
# No dtype coercion here: Processed_Reviews is cast to str when the modelling frame is built.
df_train.head()

Unnamed: 0.1,Unnamed: 0,label,review,type,Processed_Reviews
0,0,4,"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .",train,the rock is destined to be the 21st century new conan and that he going to make splash even greater than arnold schwarzenegger jean claud van damme or steven segal
1,1,5,The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy is so huge that a column of words can not adequately describe co-writer/director Peter Jackson 's expanded vision of J....,train,the gorgeously elaborate continuation of the lord of the ring trilogy is so huge that column of word can not adequately describe co writer director peter jackson expanded vision of tolkien middle ...
2,2,4,"Singer/composer Bryan Adams contributes a slew of songs -- a few potential hits , a few more simply intrusive to the story -- but the whole package certainly captures the intended , er , spirit of...",train,singer composer bryan adam contributes slew of song few potential hit few more simply intrusive to the story but the whole package certainly capture the intended er spirit of the piece
3,3,3,You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold .,train,you think by now america would have had enough of plucky british eccentric with heart of gold wouldhave
4,4,4,Yet the act is still charming here .,train,yet the act is still charming here


In [204]:
# Read the SST5 test split; the path is relative to the notebook's working directory.
test_csv_path = r"../input/sst5-dataset/SST5_master_test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0.1,Unnamed: 0,label,review,type,Processed_Reviews
0,9645,4,It 's a lovely film with lovely performances by Buy and Accorsi .,test,it lovely film with lovely performance by buy and accorsi
1,9646,3,"No one goes unindicted here , which is probably for the best .",test,no one go unindicted here which is probably for the best
2,9647,4,"And if you 're not nearly moved to tears by a couple of scenes , you 've got ice water in your veins .",test,and if you re not nearly moved to tear by couple of scene you ve got ice water in your vein ifyou youve
3,9648,5,"A warm , funny , engaging film .",test,warm funny engaging film
4,9649,5,"Uses sharp humor and insight into human nature to examine class conflict , adolescent yearning , the roots of friendship and sexual identity .",test,us sharp humor and insight into human nature to examine class conflict adolescent yearning the root of friendship and sexual identity insightinto


## Format Dataframe

In [205]:
# Keep only what the model needs: the pre-processed review text (forced to str,
# so missing values become the literal string "nan") and the label.
train_df = pd.DataFrame(
    {
        "text": df_train["Processed_Reviews"].astype(str),
        "label": df_train["label"],
    },
    columns=["text", "label"],
)

test_df = pd.DataFrame(
    {
        "text": df_test["Processed_Reviews"].astype(str),
        "label": df_test["label"],
    },
    columns=["text", "label"],
)

## Remove Stopwords

In [206]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# A set gives constant-time membership tests; the original list forced a linear
# scan of the stopword list for every token of every review.
stop = set(stopwords.words('english'))

# Drop every whitespace-separated token that is an English stopword.
train_df['text'] = train_df['text'].apply(
    lambda sentence: " ".join(word for word in sentence.split() if word not in stop)
)
train_df['text'].head()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0                                                rock destined 21st century new conan going make splash even greater arnold schwarzenegger jean claud van damme steven segal
1    gorgeously elaborate continuation lord ring trilogy huge column word adequately describe co writer director peter jackson expanded vision tolkien middle earth cowriter
2                             singer composer bryan adam contributes slew song potential hit simply intrusive story whole package certainly capture intended er spirit piece
3                                                                                                   think america would enough plucky british eccentric heart gold wouldhave
4                                                                                                                                                     yet act still charming
Name: text, dtype: object

In [207]:
# Same stopword filter as for the training text, applied row by row to the test text.
test_df['text'] = [
    " ".join(token for token in sentence.split() if token not in stop)
    for sentence in test_df['text']
]
test_df['text'].head()

0                                                                                    lovely film lovely performance buy accorsi
1                                                                                               one go unindicted probably best
2                                                                 nearly moved tear couple scene got ice water vein ifyou youve
3                                                                                                      warm funny engaging film
4    us sharp humor insight human nature examine class conflict adolescent yearning root friendship sexual identity insightinto
Name: text, dtype: object

In [208]:
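# Disabled experiment (kept for reference): drop the 10 least frequent tokens from the training text.
# freq = pd.Series(' '.join(train_df['text']).split()).value_counts()[-10:]
# train_df['text'] = train_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
# train_df['text'].head()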

In [209]:
# Disabled experiment (kept for reference): drop the 10 least frequent tokens from the test text.
# freq = pd.Series(' '.join(test_df['text']).split()).value_counts()[-10:]
# test_df['text'] = test_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
# test_df['text'].head()

## Convert Sentences to ELMo Vectors

In [210]:
# TF-Hub handle of the ELMo module (version 2 of google/elmo).
elmo_module_url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(elmo_module_url, trainable=True)

In [211]:
# Training texts and labels as plain Python lists.
x = list(train_df['text'])
y = list(train_df['label'])

# Fit the label encoder on the training labels; fit() returns the encoder itself.
le = preprocessing.LabelEncoder().fit(y)

def encode(le, labels):
    """One-hot encode `labels` using a fitted label encoder.

    Parameters
    ----------
    le : fitted encoder exposing `transform` and `classes_`
        (here a sklearn `LabelEncoder`).
    labels : sequence of raw labels known to `le`.

    Returns
    -------
    Array of shape (len(labels), len(le.classes_)) with one 1 per row.
    """
    enc = le.transform(labels)
    # Pin the width to the number of classes the encoder knows. Without
    # num_classes, to_categorical infers it from the largest index present,
    # so a batch missing the highest class would get a narrower matrix.
    return keras.utils.to_categorical(enc, num_classes=len(le.classes_))

def decode(le, one_hot):
    """Turn one-hot (or per-class score) rows back into the original labels.

    The winning class of each row is its argmax along axis 1; the encoder's
    `inverse_transform` then maps those class indices to raw labels.
    """
    class_indices = np.asarray(one_hot).argmax(axis=1)
    return le.inverse_transform(class_indices)


# Only the labels are encoded; the texts are kept as raw strings under a new name.
y_enc = encode(le, y)
x_enc = x

In [212]:
y_test = list(test_df['label'])
x_test = list(test_df['text'])

# Reuse the encoder fitted on the training labels instead of fitting a fresh
# one on the test labels. A second fit overwrote `le` and could give a
# different label-to-index mapping if the test set lacked a class. With the
# shared encoder, an unseen test label makes transform() raise instead.
x_test_enc = x_test
y_test_enc = encode(le, y_test)

## Create Train, Validation and Test Sets

In [213]:
# Hold out 20% of the training data for validation (fixed seed for a repeatable split).
# To train on everything instead, assign x_all / y_all directly to x_train / y_train.
x_all = np.asarray(x_enc)
y_all = np.asarray(y_enc)
x_train, x_val, y_train, y_val = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

# Test texts and one-hot labels as arrays.
x_test = np.asarray(x_test_enc)
y_test = np.asarray(y_test_enc)

## Train Keras Neural Model with ELMo Embeddings

In [214]:
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K

def ELMoEmbedding(x):
    """Embed a batch of sentences with the ELMo hub module held in `embed`.

    `x` is the string input tensor of the Keras model, declared with
    shape=(1,) per sample. Returns the module's "default" output.
    """
    # Squeeze only axis 1. A bare tf.squeeze would also drop the batch axis
    # when a batch holds a single sentence, passing a scalar to the module.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return embed(sentences, signature="default", as_dict=True)["default"]

# One raw string per sample goes in.
input_text = Input(shape=(1,), dtype=tf.string)

# ELMo sentence embedding wrapped as a Lambda layer (declared output width 1024).
elmo_layer = Lambda(ELMoEmbedding, output_shape=(1024,))
embedding = elmo_layer(input_text)

# Hidden layer followed by a 5-way softmax classifier.
hidden_layer = Dense(256, activation='relu')
dense = hidden_layer(embedding)

output_layer = Dense(5, activation='softmax')
pred = output_layer(dense)

model = Model(inputs=[input_text], outputs=pred)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Wall-clock timing covers session setup, training and saving the weights.
t0 = time.time()

with tf.Session() as session:
    K.set_session(session)
    # Initialise variables and lookup tables in this session before training.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())

    history = model.fit(
        x_train,
        y_train,
        epochs=5,
        batch_size=128,
        verbose=True,
        validation_data=(x_val, y_val),
    )

    # Save to the path the evaluation cell loads from ('./elmo-model.h5').
    # The previous name '.elmo-model.h5' was a different, hidden file, so the
    # later load_weights call could not find the trained weights.
    model.save_weights('./elmo-model.h5')

t1 = time.time()

Train on 7716 samples, validate on 1929 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Compute Training Time

In [215]:
def convertTime(seconds):
    """Format a duration in seconds as a short human-readable string.

    The value is rounded to whole seconds and wrapped at 24 hours. Returns
    "SS s" under a minute, "MM min SS s" under an hour, and
    "HH h MM min SS s" otherwise; each number is right-aligned to width 2.
    """
    # Round first so a fractional value such as 59.7 carries into the next
    # unit instead of being printed as "60 s".
    total = int(round(seconds)) % (24 * 3600)
    hour, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)

    # Test the most specific case first. Previously the seconds-only branch
    # sat behind `hour == 0` and was unreachable, and its format string used
    # index {1} with a single argument.
    if hour == 0 and minutes == 0:
        return "{0:2.0f} s".format(secs)

    if hour == 0:
        return "{0:2.0f} min {1:2.0f} s".format(minutes, secs)

    return "{0:2.0f} h {1:2.0f} min {2:2.0f} s".format(hour, minutes, secs)

# Elapsed training time, both as raw seconds and as a formatted string.
duration_Pretraining_sec = t1 - t0
duration_Pretraining = convertTime(duration_Pretraining_sec)

print("\nTraining Time: ", duration_Pretraining)


Training Time:   2 min  7 s


In [216]:
with tf.Session() as session:
    K.set_session(session)
    # Run the initialisers first so the weights loaded next are not overwritten.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    model.load_weights('./elmo-model.h5')

    predicts = model.predict(x_test)

# Decode from y_test_enc, which is never overwritten, rather than from y_test.
# Previously this cell replaced the one-hot y_test with 1-D labels, so a
# second run failed inside decode (argmax over axis 1 of a 1-D array).
y_test = decode(le, y_test_enc)
y_preds = decode(le, predicts)

print("Accuracy of ELMO is:", accuracy_score(y_test, y_preds))

Accuracy of ELMO is: 0.45776566757493187


## Results

In [217]:
# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_test, y_preds))

# The former extra call `classification_report(y_test, y_test_predicted)` was
# removed: neither name is defined in this notebook, so it raised NameError.
# The report below is the same one, computed on the real predictions.
print(metrics.classification_report(y_test, y_preds))

[[ 14  99   2  19   5]
 [  4 207  11  59   8]
 [  2 103  29  85  10]
 [  2  54   3 179  41]
 [  0  14   4  72  75]]
              precision    recall  f1-score   support

           1       0.64      0.10      0.17       139
           2       0.43      0.72      0.54       289
           3       0.59      0.13      0.21       229
           4       0.43      0.64      0.52       279
           5       0.54      0.45      0.49       165

   micro avg       0.46      0.46      0.46      1101
   macro avg       0.53      0.41      0.39      1101
weighted avg       0.51      0.46      0.41      1101

