In [1]:
#required imports 
import numpy as np
import re
import itertools
from collections import Counter
import mxnet as mx
import numpy as np 
import cv2
import matplotlib.pyplot as plt
import logging
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib notebook

from bhtsne import tsne


# Configure the root logger (getLogger() with no name) at DEBUG level; the
# "INFO:root:..." lines in the training outputs further down are emitted through it.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [2]:
#loading the data from files. 
def read_files(foldername):
    """
    Read every file in a folder and return the contents as a list of strings.

    Files are visited in sorted filename order: os.listdir gives no ordering
    guarantee, so sorting keeps the dataset order (and every split derived
    from it) reproducible. Line breaks inside a file are replaced by a single
    space so that the last word of one line is not glued to the first word of
    the next one.

    foldername: folder to read; a relative path is resolved against the
                current directory.
    Returns a list with one string per file.
    """
    import os
    folder = os.path.join(os.curdir, foldername)
    sentiments = []
    for file in sorted(os.listdir(folder)):
        with open(os.path.join(folder, file), "r", encoding="utf8") as pos_file:
            data = pos_file.read().replace('\n', ' ')
            sentiments.append(data)
    return sentiments
    
    
# Read the reviews of both classes; `foldername` ends up holding the last folder read.
foldername = "easy/pos"
postive_sentiment = read_files(foldername)
foldername = "easy/neg"
negative_sentiment = read_files(foldername)

# One label per review: 1 for reviews from the "pos" folder, 0 for the "neg" folder.
positive_labels = [1] * len(postive_sentiment)
negative_labels = [0] * len(negative_sentiment)

In [3]:
#some string preprocessing
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character outside A-Za-z0-9 ( ) , ! ? ' ` becomes a space
         (this already removes square brackets and all other punctuation);
      2. common English contractions are split off ("isn't" -> "is n't");
      3. commas and parentheses become spaces, "!" and "?" become
         stand-alone tokens;
      4. remaining apostrophes become spaces;
      5. runs of whitespace are collapsed, the result is stripped and
         lower-cased.

    Returns the cleaned string with tokens separated by single spaces.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Order matters: each contraction suffix is detached before apostrophes are dropped.
    for pattern, replacement in (
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
    ):
        string = re.sub(pattern, replacement, string)
    string = re.sub(r"[,()]", " ", string)
    # A plain "?" in the replacement: writing "\?" there would put a literal
    # backslash into the output, because re.sub leaves unknown non-letter
    # escapes in the replacement untouched.
    string = re.sub(r"([!?])", r" \1 ", string)
    string = string.replace("'", " ")
    # Collapse whitespace last so that no earlier substitution can re-introduce double spaces.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [4]:
#create a dict of words and their counts in the entire dataset {word: count}
word_counter = Counter()
def create_count(sentiments):
    """
    Add the word frequencies of `sentiments` to the module-level `word_counter`.

    Every text is cleaned with clean_str and split on whitespace before
    counting. The counter is updated in place, so calling this twice with the
    same data doubles the counts.

    sentiments: iterable of raw text strings.
    Returns None.
    """
    for line in sentiments:
        word_counter.update(clean_str(line).split())

# Gives every counted word a unique integer id, ranked by descending frequency
# (id 0 is the most frequent word), and returns the {word: id} mapping.
def create_word_index():
    """Build the word -> frequency-rank mapping from the module-level `word_counter`."""
    ranked = word_counter.most_common()
    return {entry[0]: rank for rank, entry in enumerate(ranked)}
    


model_path = "../models/"

# Pool both classes, keeping reviews and labels index-aligned.
all_sentiments = postive_sentiment + negative_sentiment
all_labels = positive_labels + negative_labels

# Build the vocabulary: count the words, then rank them by frequency.
create_count(all_sentiments)
word_dict = create_word_index()

# Reverse lookup from a word id back to the word.
idx2word = dict((index, word) for word, index in word_dict.items())


In [5]:
# Encodes sentences as sequences of word ids.
# Every word is replaced by its unique id from word_dict; words missing from word_dict are dropped.
def encoded_sentences(input_file,word_dict):
    """
    Convert raw texts into lists of word ids.

    input_file: iterable of raw text strings (each one is cleaned with clean_str).
    word_dict:  mapping word -> id.
    Returns a list with one list of ids per input text.
    """
    return [
        [word_dict[token] for token in clean_str(text).split() if token in word_dict]
        for text in input_file
    ]

def decode_sentences(input_file,word_dict):
    """
    Turn encoded sentences (sequences of word ids) back into text.

    The id -> word lookup is derived from the `word_dict` argument, so the
    function works with whatever vocabulary it is given and does not depend
    on any module-level reverse index.

    input_file: iterable of id sequences.
    word_dict:  mapping word -> id (the same mapping used for encoding).
    Returns a list of strings; every word is followed by a single space
    (so a non-empty sentence ends with a trailing space).
    Raises KeyError if an id has no entry in word_dict.
    """
    id_to_word = {index: word for word, index in word_dict.items()}
    return ["".join(id_to_word[idx] + ' ' for idx in line) for line in input_file]

#Pad the sequences to maxlen.
#if a sentence is longer than maxlen, it is truncated
#if a sentence is shorter than maxlen, it is padded with `value` (0 = id of the most commonly occurring word)
def pad_sequences(sentences,maxlen=500,value=0):
    """
    Pads all sentences to the same length. The length is defined by maxlen.
    Returns padded sentences.

    Every returned row is a 1-D numpy array of exactly `maxlen` elements,
    whatever the input length (longer, equal, shorter or empty). The row
    dtype is the common dtype of the sentence and `value`, so integer ids
    padded with an integer stay integers.

    sentences: iterable of 1-D id sequences (lists or arrays).
    maxlen:    target length of every row.
    value:     filler appended to short sentences.
    Returns a list of numpy arrays.
    """
    fill_dtype = np.asarray(value).dtype
    padded_sentences = []
    for sen in sentences:
        tokens = np.asarray(sen[:maxlen])
        # An empty sentence carries no dtype information (numpy defaults it to
        # float64), so fall back to the dtype of the padding value.
        dtype = np.result_type(tokens.dtype, fill_dtype) if tokens.size else fill_dtype
        new_sentence = np.full(maxlen, value, dtype=dtype)
        new_sentence[:tokens.size] = tokens
        padded_sentences.append(new_sentence)
    return padded_sentences

In [6]:
#Encode the reviews of both classes as sequences of word ids.
positive_encoded, negative_encoded = (
    encoded_sentences(reviews, word_dict)
    for reviews in (postive_sentiment, negative_sentiment)
)

# Same order as all_labels: positive reviews first, then negative ones.
all_encoded = positive_encoded + negative_encoded

In [7]:
# Model hyper-parameters.
vocab_size = 5000      # number of distinct word ids fed to the embedding layer
seq_len = 500          # length every review is padded/truncated to
embedding_dim = 50     # size of each word vector

# Clip word ids to the vocabulary: every id >= vocab_size-1 is mapped to
# vocab_size-1, so all rarer words share that last id.
t_data = [np.array([min(i, vocab_size - 1) for i in s]) for s in all_encoded]

In [8]:
#train+validation / test split: 30% of all reviews are held out as the test set.
#random_state is fixed so the split is the same on every run (for the same input order).
X_train_val, X_test, y_train_val, y_test_set = train_test_split(t_data, all_labels, test_size=0.3, random_state=42)

In [9]:
#train / validation split: 30% of the train+validation portion becomes the validation set.
X_train, X_val, y_train, y_validation = train_test_split(X_train_val, y_train_val, test_size=0.3, random_state=42)

In [10]:
#Length statistics of the encoded reviews before padding
min_len = len(min(t_data, key=len))
max_len = len(max(t_data, key=len))
avg_len = sum(len(s) for s in t_data) / len(t_data)
print("the minimum length is:", min_len)
print("the maximum length is:", max_len)
print("the average length is:", avg_len)

the minimum length is: 10
the maximum length is: 2564
the average length is: 245.28632


In [11]:
#Pad/truncate every split to seq_len and stack it into a 2-D array.
trn, val, test = (
    np.array(pad_sequences(split, maxlen=seq_len, value=0))
    for split in (X_train, X_val, X_test)
)

#Labels as arrays, in the same order as the rows above.
y_trn, y_val, y_test = (
    np.array(labels) for labels in (y_train, y_validation, y_test_set)
)

In [12]:
#A simple dense model: embedding -> flatten -> fully connected (500, relu) -> fully connected (2) -> softmax
# Input placeholder; 'data' is the name the data iterators and Module bind to.
input_x_1 = mx.sym.Variable('data')
# Looks up an embedding_dim-sized vector for each of the vocab_size word ids.
embed_layer_1 = mx.sym.Embedding(data=input_x_1, input_dim=vocab_size, output_dim=embedding_dim, name='vocab_embed')
# Flattens the per-word vectors of a review into one long vector.
flatten_1 = mx.sym.Flatten(data=embed_layer_1)
fc1_1 = mx.sym.FullyConnected(data=flatten_1, num_hidden=500,name="fc1")
relu3_1 = mx.sym.Activation(data=fc1_1, act_type="relu" , name="relu3")
# Two output units: one per class (label 0 = negative, 1 = positive).
fc2_1 = mx.sym.FullyConnected(data=relu3_1, num_hidden=2,name="final_fc")
dense_1 = mx.sym.SoftmaxOutput(data=fc2_1, name='softmax')


In [13]:
#very useful for debugging: inferring the shapes of the network for a single (1, 500) input
# infer_shape returns (argument shapes, output shapes, auxiliary-state shapes).
arg_shape, output_shape, aux_shape = embed_layer_1.infer_shape(data=(1,500))
arg_shape2, output_shape2, aux_shape2 = flatten_1.infer_shape(data=(1,500))

# Only the shapes up to the flatten layer are printed.
print(arg_shape2,output_shape2)

[(1, 500), (5000, 50)] [(1, 25000)]


In [14]:
#creating train_iter,val_iter,test_iter
batch_size = 64
# Only the training data is shuffled. The validation and test iterators keep
# the original row order so that evaluation is repeatable and any per-sample
# predictions stay aligned with y_val / y_test.
train_iter = mx.io.NDArrayIter(trn, y_trn, batch_size, shuffle=True)
val_iter = mx.io.NDArrayIter(val, y_val, batch_size, shuffle=False)
test_iter = mx.io.NDArrayIter(test, y_test, batch_size, shuffle=False)


In [15]:
#Create the Adam optimizer (default hyper-parameters)
adam = mx.optimizer.create('adam')

#Checkpointing (saving the model). Make sure a folder named model1 exists.
#The callback writes one checkpoint per epoch under this prefix (model1/chkpt-000N.params).
model_prefix = 'model1/chkpt'
checkpoint = mx.callback.do_checkpoint(model_prefix)

#Creating the model with the Module API. Previously mxnet used FeedForward (deprecated)
model =  mx.mod.Module(
    context = mx.gpu(0),     # use GPU 0 for training; if you don't have a gpu use mx.cpu()
    symbol = dense_1,
    data_names=['data']
   )

#actually fits the model for 3 epochs.
#Speedometer logs throughput and the running metric every 64 batches.
model.fit(
    train_iter,
    eval_data=val_iter, 
    batch_end_callback = mx.callback.Speedometer(batch_size, 64),
    num_epoch = 3, 
    eval_metric='acc',
    optimizer = adam,
    epoch_end_callback=checkpoint
)

  optimizer_params=optimizer_params)
INFO:root:Epoch[0] Batch [64]	Speed: 4253.96 samples/sec	accuracy=0.611538
INFO:root:Epoch[0] Batch [128]	Speed: 4884.08 samples/sec	accuracy=0.825928
INFO:root:Epoch[0] Train-accuracy=0.843006
INFO:root:Epoch[0] Time cost=2.764
INFO:root:Saved checkpoint to "model1/chkpt-0001.params"
INFO:root:Epoch[0] Validation-accuracy=0.857869
INFO:root:Epoch[1] Batch [64]	Speed: 4874.95 samples/sec	accuracy=0.907452
INFO:root:Epoch[1] Batch [128]	Speed: 4879.23 samples/sec	accuracy=0.945801
INFO:root:Epoch[1] Train-accuracy=0.948909
INFO:root:Epoch[1] Time cost=2.511
INFO:root:Saved checkpoint to "model1/chkpt-0002.params"
INFO:root:Epoch[1] Validation-accuracy=0.845633
INFO:root:Epoch[2] Batch [64]	Speed: 4872.69 samples/sec	accuracy=0.983894
INFO:root:Epoch[2] Batch [128]	Speed: 4879.86 samples/sec	accuracy=0.984863
INFO:root:Epoch[2] Train-accuracy=0.977927
INFO:root:Epoch[2] Time cost=2.514
INFO:root:Saved checkpoint to "model1/chkpt-0003.params"
INFO:root

In [16]:
# Fetch the trained parameters; get_params() returns the pair (arg_params, aux_params).
params = model.get_params()
print(params)

# Embedding matrix of the dense model as a float64 numpy array, ready for visualization.
weights_dense_embed = params[0]['vocab_embed_weight'].asnumpy().astype('float64')


({'final_fc_weight': <NDArray 2x500 @cpu(0)>, 'fc1_bias': <NDArray 500 @cpu(0)>, 'fc1_weight': <NDArray 500x25000 @cpu(0)>, 'vocab_embed_weight': <NDArray 5000x50 @cpu(0)>, 'final_fc_bias': <NDArray 2 @cpu(0)>}, {})


In [17]:
#t-SNE visualization of the learned embeddings of the first 500 word ids
#(ids are frequency ranks, so these are the 500 most frequent words)
size = 500
Y= tsne(weights_dense_embed[:size])
plt.figure(0)
plt.scatter(Y[:, 0], Y[:, 1])
# Label each point with its word: row idx of the embedding matrix belongs to word id idx.
for idx, (x, y) in enumerate(zip(Y[:, 0], Y[:, 1])):
    plt.annotate(idx2word[idx], xy=(x, y), xytext=(0, 0), textcoords='offset points')


<IPython.core.display.Javascript object>

In [18]:
#measuring the accuracy of our model on the validation iterator
acc = mx.metric.Accuracy()
# score() updates `acc` in place while running the model over val_iter.
model.score(val_iter,acc)
print(acc)

EvalMetric: {'accuracy': 0.83301957831325302}


In [19]:
# Load the model from the checkpoint; we are loading the one saved after
# epoch 3 (model1/chkpt-0003.params), the last epoch trained above.
sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3)

# Assign the loaded parameters to a CPU module bound for inference only,
# taking one review (batch of 1, 500 word ids) at a time.
mod = mx.mod.Module(symbol=sym, context=mx.cpu())
mod.bind(for_training=False, data_shapes=[('data', (1,500))])
mod.set_params(arg_params, aux_params)



In [20]:
from collections import namedtuple
# Minimal batch container with a single `data` field, used as the argument of mod.forward() in predict().
Batch = namedtuple('Batch', ['data'])

#a simple predict function
def predict(sen):
    """
    Run the inference module `mod` on `sen` and return the class probabilities.

    sen: encoded and padded input, converted to an MXNet NDArray as-is.
    Returns the first network output as a numpy array with all
    size-1 dimensions squeezed away.
    """
    batch = Batch([mx.nd.array(sen)])
    mod.forward(batch)
    return np.squeeze(mod.get_outputs()[0].asnumpy())

#our custom sentences for testing
my_sen =["the movie was awesome. Loved it"]
# Same preprocessing as the training data: encode with the vocabulary, then
# pad with pad_sequences' defaults (maxlen=500, value=0).
my_sen_encoded = encoded_sentences(my_sen,word_dict)
my_sen_encoded_padded = pad_sequences(my_sen_encoded)

In [21]:
output = predict(my_sen_encoded_padded)
#output[0] is the probability of label 0 (negative), output[1] of label 1 (positive);
#in the recorded run below the review is predicted positive with 0.90 probability
print(output)


[ 0.09290998  0.90709001]


In [22]:
#a model with convolution kernel of 5 (5-grams)
input_x_2 = mx.sym.Variable('data')
embed_layer_2 = mx.sym.Embedding(data=input_x_2, input_dim=vocab_size, output_dim=embedding_dim, name='vocab_embed')
# Add a single-channel axis so the embedded review can be fed to a 2-D convolution.
# `shape` with -1 lets MXNet infer the batch dimension, so the same graph can
# be bound with any batch size (e.g. 1 for prediction); the deprecated
# `target_shape` hard-coded batch_size into the graph.
conv_input_2 = mx.sym.Reshape(data=embed_layer_2, shape=(-1, 1, seq_len, embedding_dim))
# Each filter spans 5 consecutive words over the full embedding width.
conv1_2 = mx.sym.Convolution(data=conv_input_2, kernel=(5,embedding_dim), num_filter=100, name="conv1")
flatten_2 = mx.sym.Flatten(data=conv1_2)
# Two output units: one per class.
fc2_2 = mx.sym.FullyConnected(data=flatten_2, num_hidden=2,name="final_fc")
convnet = mx.sym.SoftmaxOutput(data=fc2_2, name='softmax')

In [23]:
# A fresh Adam optimizer instance for this model.
adam = mx.optimizer.create('adam')

#Checkpointing (saving the model). Make sure a folder named model2 exists.
model_prefix_2 = 'model2/chkpt'
checkpoint = mx.callback.do_checkpoint(model_prefix_2)

#Creating the model with the Module API. Previously mxnet used FeedForward (deprecated)
model_2 =  mx.mod.Module(
    context = mx.gpu(0),     # use GPU 0 for training; if you don't have a gpu use mx.cpu()
    symbol = convnet,
    data_names=['data']
   )

# fits the model for 3 epochs, reusing the train/validation iterators created earlier.
model_2.fit(
    train_iter,
    eval_data=val_iter, 
    batch_end_callback = mx.callback.Speedometer(batch_size, 64),
    num_epoch = 3, 
    eval_metric='acc',
    optimizer = adam,
    epoch_end_callback=checkpoint
)

  optimizer_params=optimizer_params)
INFO:root:Epoch[0] Batch [64]	Speed: 6126.61 samples/sec	accuracy=0.660096
INFO:root:Epoch[0] Batch [128]	Speed: 6092.09 samples/sec	accuracy=0.843994
INFO:root:Epoch[0] Train-accuracy=0.846726
INFO:root:Epoch[0] Time cost=2.015
INFO:root:Saved checkpoint to "model2/chkpt-0001.params"
INFO:root:Epoch[0] Validation-accuracy=0.847703
INFO:root:Epoch[1] Batch [64]	Speed: 5896.95 samples/sec	accuracy=0.894712
INFO:root:Epoch[1] Batch [128]	Speed: 6083.12 samples/sec	accuracy=0.909180
INFO:root:Epoch[1] Train-accuracy=0.895089
INFO:root:Epoch[1] Time cost=2.038
INFO:root:Saved checkpoint to "model2/chkpt-0002.params"
INFO:root:Epoch[1] Validation-accuracy=0.852410
INFO:root:Epoch[2] Batch [64]	Speed: 5930.89 samples/sec	accuracy=0.927404
INFO:root:Epoch[2] Batch [128]	Speed: 6090.27 samples/sec	accuracy=0.916260
INFO:root:Epoch[2] Train-accuracy=0.920635
INFO:root:Epoch[2] Time cost=2.027
INFO:root:Saved checkpoint to "model2/chkpt-0003.params"
INFO:root

In [24]:
# a model with convolution filters of 3,4,5 (3-grams, 4-grams, 5-grams)
input_x_3= mx.sym.Variable('data')
embed_layer_3 = mx.sym.Embedding(data=input_x_3, input_dim=vocab_size, output_dim=embedding_dim, name='vocab_embed')
# Add a single-channel axis for the 2-D convolutions. -1 lets MXNet infer the
# batch dimension instead of hard-coding batch_size into the graph (the
# deprecated `target_shape` did), so the model can be bound with any batch size.
conv_input_3 = mx.sym.Reshape(data=embed_layer_3, shape=(-1, 1, seq_len, embedding_dim))


# create convolution + (max) pooling layer for each filter operation
filter_list=[3, 4, 5] # the size of filters to use

num_filter=100
pooled_outputs = []
for filter_size in filter_list:
    # Each filter spans `filter_size` consecutive words over the full embedding width.
    convi = mx.sym.Convolution(data=conv_input_3, kernel=(filter_size, embedding_dim), num_filter=num_filter)
    relui = mx.sym.Activation(data=convi, act_type='relu')
    # The pooling window covers the whole convolution output
    # (seq_len - filter_size + 1 positions), i.e. max-over-time: one value per filter.
    pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(seq_len - filter_size + 1, 1), stride=(1,1))
    pooled_outputs.append(pooli)

# combine all pooled outputs along the channel axis
total_filters = num_filter * len(filter_list)
concat = mx.sym.Concat(*pooled_outputs, dim=1)



# reshape for next layer: one row of total_filters features per review
h_pool_3 = mx.sym.Reshape(data=concat, shape=(-1, total_filters))
fc2_3 = mx.sym.FullyConnected(data=h_pool_3, num_hidden=2,name="final_fc")
convnet_combined = mx.sym.SoftmaxOutput(data=fc2_3, name='softmax')

In [25]:
# A fresh Adam optimizer instance for this model.
adam = mx.optimizer.create('adam')

#Checkpointing (saving the model). Make sure a folder named model3 exists.
model_prefix_3 = 'model3/chkpt'
checkpoint = mx.callback.do_checkpoint(model_prefix_3)

#Creating the model with the Module API. Previously mxnet used FeedForward (deprecated)
model_3 =  mx.mod.Module(
    context = mx.gpu(0),     # use GPU 0 for training; if you don't have a gpu use mx.cpu()
    symbol = convnet_combined,
    data_names=['data']
   )

#fits the model for 3 epochs, reusing the train/validation iterators created earlier.
model_3.fit(
    train_iter,
    eval_data=val_iter, 
    batch_end_callback = mx.callback.Speedometer(batch_size, 64),
    num_epoch = 3, 
    eval_metric='acc',
    optimizer = adam,
    epoch_end_callback=checkpoint
)

  optimizer_params=optimizer_params)
INFO:root:Epoch[0] Batch [64]	Speed: 2070.65 samples/sec	accuracy=0.571875
INFO:root:Epoch[0] Batch [128]	Speed: 2058.23 samples/sec	accuracy=0.715576
INFO:root:Epoch[0] Train-accuracy=0.791667
INFO:root:Epoch[0] Time cost=5.960
INFO:root:Saved checkpoint to "model3/chkpt-0001.params"
INFO:root:Epoch[0] Validation-accuracy=0.821160
INFO:root:Epoch[1] Batch [64]	Speed: 2038.12 samples/sec	accuracy=0.842788
INFO:root:Epoch[1] Batch [128]	Speed: 2040.82 samples/sec	accuracy=0.863281
INFO:root:Epoch[1] Train-accuracy=0.892609
INFO:root:Epoch[1] Time cost=6.007
INFO:root:Saved checkpoint to "model3/chkpt-0002.params"
INFO:root:Epoch[1] Validation-accuracy=0.854857
INFO:root:Epoch[2] Batch [64]	Speed: 2036.18 samples/sec	accuracy=0.910817
INFO:root:Epoch[2] Batch [128]	Speed: 2040.70 samples/sec	accuracy=0.917725
INFO:root:Epoch[2] Train-accuracy=0.935268
INFO:root:Epoch[2] Time cost=6.009
INFO:root:Saved checkpoint to "model3/chkpt-0003.params"
INFO:root

In [26]:
#loads the GloVe word embeddings
def load_glove_index(loc):
    """
    Parse a GloVe text file into a {word: vector} dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated coefficients. Blank lines are skipped. The file is
    opened in a `with` block so it is closed even if parsing fails.

    loc: path of the GloVe text file (read as UTF-8).
    Returns a dict mapping every word to a float32 numpy array.
    Raises ValueError if a coefficient cannot be parsed as a number.
    """
    embeddings_index = {}
    with open(loc, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # e.g. a trailing empty line: nothing to index
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

#builds the word embedding matrix
def create_emb():
    """
    Build the (vocab_size, embedding_dim) matrix of pretrained vectors.

    Reads the module-level `word_dict`, `embeddings_index`, `vocab_size` and
    `embedding_dim`. Row i holds the pretrained vector of the word with id i;
    words without a pretrained vector keep an all-zero row, and ids at or
    beyond vocab_size are ignored.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_dict.items():
        if row < vocab_size:
            vector = embeddings_index.get(token)
            if vector is not None:
                matrix[row] = vector
    return matrix
# Load the 50-dimensional GloVe vectors, then assemble the matrix for our vocabulary.
embeddings_index = load_glove_index("glove/glove.6B.50d.txt")
embedding_matrix = create_emb()

In [27]:
#t-SNE visualization of the pretrained embedding matrix for the first 500 word ids.
size = 500
Y= tsne(embedding_matrix[:size])
plt.figure(1)
plt.scatter(Y[:, 0], Y[:, 1])
# Label each point with its word: row idx of the embedding matrix belongs to word id idx.
for idx, (x, y) in enumerate(zip(Y[:, 0], Y[:, 1])):
    plt.annotate(idx2word[idx], xy=(x, y), xytext=(0, 0), textcoords='offset points')

<IPython.core.display.Javascript object>

In [28]:
#creates a model with convolution and pretrained (GloVe) word embeddings.
weight_matrix = mx.nd.array(embedding_matrix)
input_x_3= mx.sym.Variable('data')
the_emb_3 = mx.sym.Variable('weights') #the weight variable which will hold the pretrained embedding matrix
embed_layer_3 = mx.sym.Embedding(data=input_x_3,weight=the_emb_3, input_dim=vocab_size, output_dim=embedding_dim, name='vocab_embed')
# Add a single-channel axis for the 2-D convolution. -1 lets MXNet infer the
# batch dimension instead of hard-coding batch_size into the graph (the
# deprecated `target_shape` did), so the model can be bound with any batch size.
conv_input_3 = mx.sym.Reshape(data=embed_layer_3, shape=(-1, 1, seq_len, embedding_dim))
# Each filter spans 5 consecutive words over the full embedding width.
conv1_3 = mx.sym.Convolution(data=conv_input_3, kernel=(5,embedding_dim), num_filter=100, name="conv1")
flatten_3 = mx.sym.Flatten(data=conv1_3)
fc2_3 = mx.sym.FullyConnected(data=flatten_3, num_hidden=2,name="final_fc")
convnet_word2vec = mx.sym.SoftmaxOutput(data=fc2_3, name='softmax')



In [29]:
# A fresh Adam optimizer instance for this model.
adam = mx.optimizer.create('adam')

#Checkpointing (saving the model). Make sure a folder named model4 exists.
#NOTE: model_prefix_3 and model_3 are re-assigned here, replacing the values
#set for the multi-filter model trained earlier.
model_prefix_3 = 'model4/chkpt'
checkpoint = mx.callback.do_checkpoint(model_prefix_3)

#Creating the model with the Module API. Previously mxnet used FeedForward (deprecated)
model_3 =  mx.mod.Module(
    context = mx.gpu(0),     # use GPU 0 for training; if you don't have a gpu use mx.cpu()
    symbol = convnet_word2vec,
     fixed_param_names =['weights'] # makes the weights variable non-trainable. Backpropagation will not update this variable
   )

#fits the model for 5 epochs.
model_3.fit(
    train_iter,
    eval_data=val_iter, 
    batch_end_callback = mx.callback.Speedometer(batch_size, 64),
    num_epoch = 5, 
    eval_metric='acc',
    optimizer = adam,
    epoch_end_callback=checkpoint,
    arg_params={'weights': weight_matrix}, #loads the pretrained glove embedding into the weights variable
    allow_missing=True # the remaining parameters are not in arg_params and get initialized as usual
)

  optimizer_params=optimizer_params)
INFO:root:Epoch[0] Batch [64]	Speed: 9649.91 samples/sec	accuracy=0.630529
INFO:root:Epoch[0] Batch [128]	Speed: 11451.57 samples/sec	accuracy=0.686768
INFO:root:Epoch[0] Train-accuracy=0.674603
INFO:root:Epoch[0] Time cost=1.141
INFO:root:Saved checkpoint to "model4/chkpt-0001.params"
INFO:root:Epoch[0] Validation-accuracy=0.657191
INFO:root:Epoch[1] Batch [64]	Speed: 11729.91 samples/sec	accuracy=0.749279
INFO:root:Epoch[1] Batch [128]	Speed: 11533.78 samples/sec	accuracy=0.750244
INFO:root:Epoch[1] Train-accuracy=0.747768
INFO:root:Epoch[1] Time cost=1.057
INFO:root:Saved checkpoint to "model4/chkpt-0002.params"
INFO:root:Epoch[1] Validation-accuracy=0.649849
INFO:root:Epoch[2] Batch [64]	Speed: 8815.89 samples/sec	accuracy=0.793750
INFO:root:Epoch[2] Batch [128]	Speed: 11990.06 samples/sec	accuracy=0.790283
INFO:root:Epoch[2] Train-accuracy=0.781002
INFO:root:Epoch[2] Time cost=1.161
INFO:root:Saved checkpoint to "model4/chkpt-0003.params"
INFO: