In [None]:
# Authorship metadata for this notebook. Note that `__version__` holds a
# place/term label rather than a numeric version string.
__author__ = "Pujun Bhatnagar"
__version__ = "Stanford, Spring 2016"

In [None]:
from __future__ import division, print_function, absolute_import

import csv
import json
import math
import os
import pickle
from pprint import pprint

import nltk
import numpy as np
import tflearn
from nltk.tokenize import word_tokenize, sent_tokenize
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

In [None]:
# setting the paths
# The dataset location can be overridden with the SNLI_SICK_PATH environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original path is used unchanged.
filepath = os.environ.get(
    'SNLI_SICK_PATH',
    '/home/pujun/Desktop/StanfordClasses/lstm for natural language understanding/snli_sick.jsonl',
)
# Fraction of the examples (taken from the front of the data) used for training.
percentage_split = .7
# Number of training epochs passed to model.fit.
num_epoch = 10
# Name under which the trained model is saved and later reloaded.
saved_model_name = "SNLI_SICK_custom_embeddings_wo_extended_vocab_FC2"

In [None]:
# Word -> integer id table, shared by every call to parse_data.
vocab = {}
# Next id to hand out. Ids start at 1, so this is always len(vocab) + 1;
# id 0 is never assigned to a word (the notebook later pads sequences with 0).
word_count = 1

def parse_data(json_data, tokenizer=None):
    """Convert raw examples into integer-id sequences and a label list.

    Each record's 'example' text is lower-cased and tokenized; every token is
    replaced by its id from the module-level `vocab`. Unseen tokens are added
    to `vocab` with the next free id, and the module-level `word_count` is
    advanced accordingly.

    Args:
        json_data: iterable of dicts, each with an 'example' string and a
            'label' entry.
        tokenizer: optional callable mapping a string to a sequence of
            tokens. Defaults to nltk's `word_tokenize`.

    Returns:
        A tuple (X, Y): X is a list of id lists (one per record, in input
        order) and Y is the list of the records' 'label' values.

    Raises:
        KeyError: if a record lacks the 'example' or 'label' key.
    """
    global word_count

    # Resolve the default lazily so a custom tokenizer works without nltk.
    tokenize = word_tokenize if tokenizer is None else tokenizer

    X = []
    Y = []
    for d in json_data:
        current_attribute_list = []
        for w in tokenize(d['example'].lower()):
            if w not in vocab:
                vocab[w] = word_count
                word_count += 1
            current_attribute_list.append(vocab[w])
        X.append(current_attribute_list)
        Y.append(d['label'])

    return (X, Y)

In [None]:
# Read the JSON-lines file (one JSON document per line) into `data`, then
# turn the records into id sequences and labels.
data = []
with open(filepath) as f:
    data.extend(json.loads(line) for line in f)
X, Y = parse_data(data)

In [None]:
print("Number of examples:", len(X))
# word_count is the next unused id and starts at 1, so it is one larger than
# the vocabulary; len(vocab) is the actual number of distinct words.
print("Number of distinct words:", len(vocab))

In [None]:

# Cache the parsed records on disk. pickle writes bytes, so the file must be
# opened in binary mode; text mode ('w') raises a TypeError on Python 3.
with open('SNLI_SICK_data', 'wb') as f:
    pickle.dump(data, f)

In [None]:
# Token count of every example, in the same order as X, and the largest of them.
data_length_list = list(map(len, X))
num_words_in_longest_sentence = max(data_length_list)

In [None]:
# Shows the longest example length as measured from X; a later cell
# overwrites this variable with a fixed value of 500.
print("Length of the biggest sentence:", num_words_in_longest_sentence)

In [None]:
# Fixed sequence length used downstream for padding and for the network input.
# This replaces the maximum measured from the data with a hard-coded 500.
num_words_in_longest_sentence = 500

# The first `percentage_split` share of the examples (rounded up) becomes the
# training set and the rest is held out. The split is purely positional:
# nothing is shuffled here.
num_training_examples = int(math.ceil(len(X) * percentage_split))
print(num_training_examples)

trainX, testX = X[:num_training_examples], X[num_training_examples:]
trainY, testY = Y[:num_training_examples], Y[num_training_examples:]

In [None]:
# Data preprocessing
# Pad both splits to the same fixed length, using 0 as the fill value.
trainX, testX = (
    pad_sequences(examples, maxlen=num_words_in_longest_sentence, value=0.)
    for examples in (trainX, testX)
)

# Convert the labels of both splits into two-class binary vectors.
trainY, testY = (
    to_categorical(labels, nb_classes=2)
    for labels in (trainY, testY)
)


In [None]:
# Network building: token ids -> embedding -> LSTM -> dropout -> softmax.
layer_input = tflearn.input_data(shape=[None, num_words_in_longest_sentence])
# Word ids start at 1 and word_count is one past the largest id, so an
# embedding table of word_count rows covers every word id plus the 0 used
# for padding.
embedding = tflearn.embedding(
    layer_input,
    input_dim=word_count,
    output_dim=128,
)
lstm = tflearn.lstm(embedding, n_units=128)
dropout = tflearn.dropout(lstm, keep_prob=0.5)
# Two output units, matching the two-class label vectors.
softmax = tflearn.fully_connected(dropout, n_units=2, activation='softmax')
net = tflearn.regression(
    softmax,
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Training
# NOTE(review): clip_gradients=0. presumably turns gradient clipping off —
# confirm against the tflearn.DNN documentation.
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
# The held-out split is used as the validation set during training.
model.fit(
    trainX,
    trainY,
    n_epoch=num_epoch,
    validation_set=(testX, testY),
    show_metric=True,
    batch_size=128,
)

In [None]:
# Persist the trained model under the name chosen in the settings cell.
model.save(saved_model_name)

In [None]:
# Rebuild the DNN wrapper directly on the softmax layer, i.e. without the
# regression layer used for training.
# NOTE(review): presumably meant as an inference-only model — confirm against
# the tflearn documentation.
model = tflearn.DNN(softmax)

In [None]:
# Load the model stored under the same name that model.save was given.
model.load(saved_model_name)

In [None]:
# try predicting the 10th example
# Index 9 is the 10th training row; it is reshaped into rows of
# num_words_in_longest_sentence columns, the width declared for the
# network's input layer.
test = np.asarray(trainX[9]).reshape((-1, num_words_in_longest_sentence))

In [None]:
# Run the model on the single prepared example; as the last expression in the
# cell, the result is shown as the cell output.
model.predict(test)