In [70]:
# import libraries
import pandas as pd
import numpy as np
import os

TRAIN_CSV = 'Train.csv'
VALID_CSV = 'Valid.csv'
data_path = '/content/drive/MyDrive/Colab Notebooks/wordEmbeddings/data/movie_reviews'

MODEL_NAME1 = 'best_model_scratch.h5'
MODEL_NAME2 = 'best_model_pretrained.h5'
model_path = '/content/drive/MyDrive/Colab Notebooks/wordEmbeddings/model'

training = os.path.join(data_path, TRAIN_CSV)
validation = os.path.join(data_path, VALID_CSV)
model_path_scratch = os.path.join(model_path, MODEL_NAME1)
model_path_pretrained = os.path.join(model_path, MODEL_NAME2)


In [71]:

# reading csv files
train = pd.read_csv(training)
valid = pd.read_csv(validation)

#train_test split
x_tr, y_tr = train['text'].values, train['label'].values
x_val, y_val = valid['text'].values, valid['label'].values

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#Tokenize the sentences
tokenizer = Tokenizer()
#preparing vocabulary
tokenizer.fit_on_texts(list(x_tr))

# converting text into integer sequences
x_tr_seq = tokenizer.texts_to_sequences(x_tr)
x_val_seq = tokenizer.texts_to_sequences(x_val)

# padding to prepare sequences of same length
x_tr_seq = pad_sequences(x_tr_seq, maxlen=100)
x_val_seq = pad_sequences(x_val_seq, maxlen=100)

size_of_vocabulary = len(tokenizer.word_index) + 1 #+1 for padding
print(f'Size of vocab: {size_of_vocabulary}')

# build two different NLP models of the same architecture.  The first learns
# embeddings from scratch the second uses pretrained word embeddings
from keras.models import *
from keras.layers import *
from keras.callbacks import *

model = Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,input_length=100,trainable=True))

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

#Add loss function, metrics, optimizer
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])

#addingcallbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
mc = ModelCheckpoint(model_path_scratch, monitor='val_acc', mode='max', 
                     save_best_only=True, verbose=1)

#Print summary of model
print(model.summary())

history = model.fit(np.array(x_tr_seq),np.array(y_tr),batch_size=128,epochs=10,
                    validation_data=(np.array(x_val_seq),np.array(y_val)),
                    verbose=1,callbacks=[es,mc])


Size of vocab: 112204
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 300)          33661200  
_________________________________________________________________
lstm_8 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 65        
Total params: 33,889,169
Trainable params: 33,889,169
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.86760, saving model to /

In [78]:

#loading best model
from keras.models import load_model
model = load_model(model_path_scratch)

# evaluation
_,val_acc = model.evaluate(x_val_seq,y_val,batch_size=128)
print(val_acc)

0.8737999796867371


Build version II using GloVe pretrained word embeddings. Let's load the GloVe embeddings into our environment. 

# Download file.

In [24]:
import os
import tqdm
import requests
import zipfile

URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

def fetch_data(url=URL, target_file='/content/drive/MyDrive/Colab Notebooks/wordEmbeddings/embeddings/glove.zip', delete_zip=False):
    # if dataset exists exit
    if os.path.isfile(target_file):
        print('datasets already downloaded')
        return

        #download (large) zip file
    #for large https request on stream mode to avoid out of memory issues
    #see : http://masnun.com/2016/09/18/python-using-the-requests-module-to-download-large-files-efficiently.html
    print("**************************")
    print("  Downloading zip file")
    print("  >_<  Please wait >_< ")
    print("**************************")
    response = requests.get(url, stream=True)
    #read chunk by chunk
    handle = open(target_file, "wb")
    for chunk in tqdm.tqdm(response.iter_content(chunk_size=512)):
        if chunk:  
            handle.write(chunk)
    handle.close()  
    print("  Download completed ;) :") 
    #extract zip_file
    zf = zipfile.ZipFile(target_file)
    print("1. Extracting {} file".format(target_file))
    zf.extractall(path='/content/drive/MyDrive/Colab Notebooks/wordEmbeddings/embeddings')
    if delete_zip:
        print("2. Deleting {} file".format(dataset_name+".zip"))
        os.remove(path=zip_file)

fetch_data()

datasets already downloaded


# Construct an embedding matrix

The embedding matrix maps each word index to its corresponding embedding vector.
<img src="https://drive.google.com/uc?id=19lmu8VSTlAdWl_-YcHtZwivRIx2rd57n&authuser=scottminer1205%40gmail.com&usp=drive_fs" width=600/>


# Load embedding into memory

Takes some time to run.

In [74]:
# load the whole embedding into memory
path_to_glove_file = '/content/drive/MyDrive/Colab Notebooks/wordEmbeddings/embeddings/glove.840B.300d.txt'

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


  


Found 2195884 word vectors.


Let's create an embedding matrix by assigning the vocabulary with the pretrained word embeddings.

In [75]:

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((size_of_vocabulary, 300))
hits = 0
misses = 0

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and embedding_vector.shape[0] != 0:       
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print(f'Converted {hits} words ({misses} misses)')       


Converted 73041 words (39162 misses)


# Defining the model architecture - pretrained embeddings:

In [80]:
model = Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,
                    weights=[embedding_matrix],
                    input_length=100,trainable=False))

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

model.add(Dense(64,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

# add loss, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['acc'])

# adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)
mc = ModelCheckpoint(model_path_pretrained, monitor='val_acc', mode='max', 
                     save_best_only=True,verbose=1)

#print summary of model
print(model.summary())



Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 300)          33661200  
_________________________________________________________________
lstm_10 (LSTM)               (None, 100, 128)          219648    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 65        
Total params: 33,889,169
Trainable params: 227,969
Non-trainable params: 33,661,200
_________________________________________________________________
None


# Train the model

In [81]:
history = model.fit(np.array(x_tr_seq),np.array(y_tr),batch_size=128,
                    epochs=10,validation_data=(np.array(x_val_seq),
                                               np.array(y_val)),verbose=1,
                                               callbacks=[es,mc])

Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.85520, saving model to /content/drive/MyDrive/Colab Notebooks/wordEmbeddings/model/best_model_pretrained.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.85520 to 0.86660, saving model to /content/drive/MyDrive/Colab Notebooks/wordEmbeddings/model/best_model_pretrained.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.86660 to 0.87400, saving model to /content/drive/MyDrive/Colab Notebooks/wordEmbeddings/model/best_model_pretrained.h5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.87400
Epoch 5/10

Epoch 00005: val_acc improved from 0.87400 to 0.87520, saving model to /content/drive/MyDrive/Colab Notebooks/wordEmbeddings/model/best_model_pretrained.h5
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.87520
Epoch 7/10

Epoch 00007: val_acc improved from 0.87520 to 0.88400, saving model to /content/drive/MyDrive/Colab Notebooks/wordEmbeddings/model/best_model_pretrained.h5
Epoch 8/10

Epoch 00008: val_acc did no

# Evaluating model performance

In [82]:
# loading best model
from keras.models import load_model
model = load_model(model_path_pretrained)

_, val_acc = model.evaluate(x_val_seq,y_val, batch_size=128)
print(val_acc)

0.8840000033378601
