In [1]:
%tensorflow_version 1.x
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, Input, Lambda, Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model, Sequential, load_model
import keras.backend as K
import tensorflow.keras as keras

TensorFlow 1.x selected.


Using TensorFlow backend.


In [0]:
import pandas as pd 
import numpy as np
import re
import os
import random
import math
from bs4 import BeautifulSoup

In [3]:
# Sanity check: display the GPU device name TensorFlow reports for this runtime.
tf.test.gpu_device_name() 

'/device:GPU:0'

In [0]:
# Load all files from a directory into dictionaries
def load_directory_data(directory, label):
    """Read every file in ``directory`` into a list of labelled records.

    Args:
        directory: Path of a folder whose entries are all readable text files.
        label: Value stored under the ``"label"`` key of every record.

    Returns:
        A list of ``{"text": ..., "label": label}`` dicts, one per file, ordered
        by file name. Every ``"<br />"`` in a file's content is replaced by a
        single space.
    """
    data = []
    # Sort the listing: os.listdir() returns entries in arbitrary order, which
    # would make the record order differ between machines and runs.
    for file_name in sorted(os.listdir(directory)):
        # Decode explicitly as UTF-8 rather than with the platform's default
        # encoding, so the same bytes decode the same way everywhere.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            data.append({"text": f.read().replace("<br />", " "), "label": label})
    return data

# Load the positive and negative examples from the dataset
def load_dataset(directory):
    """Collect the labelled examples stored under ``directory``.

    The ``pos`` sub-folder is read with label 1 and the ``neg`` sub-folder with
    label 0, both through ``load_directory_data``.

    Returns:
        One list holding the positive records followed by the negative ones.
    """
    examples = []
    for subfolder, label in (("pos", 1), ("neg", 0)):
        examples.extend(load_directory_data(os.path.join(directory, subfolder), label))
    return examples

In [0]:
def download_and_load_datasets(force_download=False):
    """Fetch the aclImdb archive through Keras and load its train/test splits.

    Args:
        force_download: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        ``(train_data, test_data)``: the results of ``load_dataset`` on the
        ``aclImdb/train`` and ``aclImdb/test`` folders located in the same
        directory as the path returned by ``keras.utils.get_file``.
    """
    archive_path = keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )

    # The code expects the extracted "aclImdb" folder next to the archive file.
    corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_data = load_dataset(os.path.join(corpus_root, "train"))
    test_data = load_dataset(os.path.join(corpus_root, "test"))

    return train_data, test_data


In [6]:
train_data, test_data = download_and_load_datasets()

# Seed the generator before shuffling: the training cell later holds rows out
# via validation_split, so an unseeded shuffle changes the train/validation
# membership (and every reported number) on each run.
random.seed(42)
random.shuffle(train_data)
random.shuffle(test_data)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [7]:
# Character counts of every review, computed once per split and reused below.
train_text_lengths = [len(x['text']) for x in train_data]
test_text_lengths = [len(x['text']) for x in test_data]

print("len of shotest text of train_data: ", min(train_text_lengths))
print("len of longest text of train_data: ", max(train_text_lengths))

print("\nlen of shotest text of test_data: ", min(test_text_lengths))
print("len of longest text of test_data: ", max(test_text_lengths))

len of shotest text of train_data:  52
len of longest text of train_data:  13604

len of shotest text of test_data:  32
len of longest text of test_data:  12730


In [0]:
def denoise_text(text):
    """Reduce a raw review to space-separated alphabetic words.

    Steps, in order: strip HTML markup with BeautifulSoup, replace every
    non-letter character with a space, drop one-letter words, and collapse
    whitespace runs into single spaces.

    Args:
        text: Raw review string, possibly containing HTML markup.

    Returns:
        The cleaned string. It may still begin or end with a single space.
    """
    # Remove the html strips
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # Remove digits and punctuations
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Remove single characters. Word boundaries are used instead of requiring
    # whitespace on both sides: a pattern like "\s+x\s+" consumes the spaces
    # around each match, so the second of two adjacent one-letter words
    # ("a b") and one-letter words at the very start or end of the text
    # were left in place.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    return text

In [0]:
# Clean every review and separate texts from labels, keeping the order of
# train_data / test_data so texts and labels stay aligned by position.
train_X = [denoise_text(example['text']) for example in train_data]
train_Y = [example['label'] for example in train_data]

test_X = [denoise_text(example['text']) for example in test_data]
test_Y = [example['label'] for example in test_data]

In [0]:
# Instantiate the TF-Hub ELMo module once (trainable=False is passed to hub.Module);
# ELMoEmbedding() below calls this module-level object.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=False)

In [0]:

#embed = hub.Module("https://tfhub.dev/google/elmo/3")
def ELMoEmbedding(x):
    """Embed a batch of sentences with the module-level ``elmo`` TF-Hub module.

    Args:
        x: Tensor of sentences. It is cast to ``tf.string`` and its axis 1 is
            squeezed away; build_model() feeds it from ``Input(shape=(1,))``,
            i.e. with shape ``(batch, 1)``.

    Returns:
        The ``"default"`` entry of the module's output dict.
    """
    # Squeeze only axis 1. A bare tf.squeeze() removes *every* size-1
    # dimension, so a batch holding a single sentence (e.g. a final partial
    # batch, or predicting on one text) would collapse to a scalar and lose
    # its batch axis.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return elmo(sentences, signature="default", as_dict=True)["default"]

In [0]:
def build_model(hidden_units=256, l2_weight=0.001, learning_rate=0.0002):
    """Build and compile the ELMo-based binary classifier.

    Architecture: a string ``Input`` of shape ``(1,)`` -> ``Lambda`` wrapping
    ``ELMoEmbedding`` (declared output shape ``(1024,)``) -> ``Dense`` with ReLU
    and an L2 kernel regulariser -> ``Dense(1)`` with a sigmoid.

    Args:
        hidden_units: Width of the hidden Dense layer.
        l2_weight: L2 regularisation factor applied to the hidden layer's kernel.
        learning_rate: Learning rate handed to the Adam optimizer.

    The defaults reproduce the values that were previously hard-coded, so
    ``build_model()`` builds the same model as before.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, Adam, and the
        accuracy metric.
    """
    input_text = Input(shape=(1,), dtype="string")
    embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
    dense = Dense(
        hidden_units,
        activation='relu',
        kernel_regularizer=keras.regularizers.l2(l2_weight),
    )(embedding)
    pred = Dense(1, activation='sigmoid')(dense)
    model = Model(inputs=[input_text], outputs=pred)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=learning_rate),
        metrics=['accuracy'],
    )
    return model

In [13]:
# Build and compile the classifier; the training and prediction cells reuse this object.
model_elmo = build_model()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [14]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model_elmo.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
lambda (Lambda)              (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               262400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 262,657
Trainable params: 262,657
Non-trainable params: 0
_________________________________________________________________


In [15]:
with tf.Session() as session:
    K.set_session(session)
    # Initialise the graph variables and lookup tables in this session before training.
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    # Write the weights at the end of every epoch. Previously they were only
    # saved after all 10 epochs, so interrupting a long run lost everything.
    checkpoint = ModelCheckpoint('./model_imdb_elmo.h5', save_weights_only=True)
    history = model_elmo.fit(
        np.array(train_X),
        np.array(train_Y),
        epochs=10,
        batch_size=16,
        validation_split=0.2,
        callbacks=[checkpoint],
    )
    # Final explicit save so the file reflects the weights at the end of training.
    model_elmo.save_weights('./model_imdb_elmo.h5')

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
 4000/20000 [=====>........................] - ETA: 33:40 - loss: 0.4111 - acc: 0.8568

KeyboardInterrupt: ignored

In [0]:
# Display the training-loss values stored in the History object returned by fit().
history.history['loss']

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

#acc = history.history['acc']
#val_acc = history.history['val_acc']
# Pull both loss curves out of the History object returned by fit().
history_log = history.history
loss, val_loss = history_log['loss'], history_log['val_loss']

# Number the x axis from 1, one tick per recorded loss value.
epochs = range(1, len(loss) + 1)

# Draw both curves on the same axes: training in blue, validation in green.
for series, line_format, legend_label in (
    (loss, 'b', 'Training Loss'),
    (val_loss, 'g', 'Validation Loss'),
):
    plt.plot(epochs, series, line_format, label=legend_label)

plt.title('Training and validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [0]:
with tf.Session() as session:
    K.set_session(session)
    # Run the initialisers first and load the weights afterwards, so the
    # restored values are not overwritten by fresh initial values.
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    # Load the file that the training cell writes ('./model_imdb_elmo.h5').
    # The previous name './model_spam_elmo.h5' is not produced anywhere in the
    # visible notebook, so this cell could not restore the trained weights.
    model_elmo.load_weights('./model_imdb_elmo.h5')
    predicts = model_elmo.predict(np.array(test_X))


In [0]:
# Number of predictions returned by model_elmo.predict() for the test texts.
len(predicts)

In [0]:
# Turn each predicted score into a hard label: 1 when it is at least 0.5, else 0.
pred_Y = [1 if score >= 0.5 else 0 for score in predicts]

In [0]:
# Pair each true label with the predicted label found at the same position.
res = [[truth, pred_Y[position]] for position, truth in enumerate(test_Y)]


In [0]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# Pass the raw sigmoid scores to roc_curve, not the thresholded 0/1 labels:
# with hard labels the curve has a single operating point, so the resulting
# area is not the model's ROC AUC. ravel() flattens the (n, 1) prediction
# array into the 1-D score vector.
fpr, tpr, thresholds = roc_curve(test_Y, predicts.ravel())
auc(fpr, tpr)

In [0]:
# Micro-averaged precision / recall / F1 on the hard predictions, then plain accuracy.
for metric_label, metric_fn in (
    ("precision : ", precision_score),
    ("recall : ", recall_score),
    ("f1_score : ", f1_score),
):
    print(metric_label, metric_fn(test_Y, pred_Y, average="micro"))
print("accuracy_score : ", accuracy_score(test_Y, pred_Y))
#print(confusion_matrix(test_Y, pred_Y))

In [0]:
# Macro-averaged precision / recall / F1 on the hard predictions, then plain accuracy.
for metric_label, metric_fn in (
    ("precision : ", precision_score),
    ("recall : ", recall_score),
    ("f1_score : ", f1_score),
):
    print(metric_label, metric_fn(test_Y, pred_Y, average="macro"))
print("accuracy_score : ", accuracy_score(test_Y, pred_Y))