**CSI 5138 Homework 3**

Use a vanilla RNN and an LSTM for text classification and sentiment analysis on a standard dataset of movie reviews.

# Setup

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import gensim

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Dense, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model
#from keras import backend as K

from sklearn.metrics import accuracy_score
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Activation
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier

import os
import numpy as np

from pprint import pprint

# IMDB Movie Review Dataset, Test and Training Data

In [2]:
def processReviews(paths):
    """Read every review file under the given directories.

    Review files are expected to be named ``<id>_<rating>.<ext>``; the rating
    is taken from the part between the first underscore and the following dot.

    Args:
        paths: iterable of directory paths, processed in the given order.

    Returns:
        A two-element list ``[texts, ratings]`` where ``texts[i]`` is the list
        of tokens produced by ``gensim.utils.simple_preprocess`` for review
        ``i`` and ``ratings[i]`` is its rating as a string of digits.
    """
    texts = []
    ratings = []

    for path in paths:
        # os.listdir returns entries in arbitrary order; sort so that the
        # resulting corpus order is reproducible from run to run.
        for name in sorted(os.listdir(path)):
            # Skip anything that does not look like "<id>_<rating>.<ext>"
            # (hidden files, READMEs, ...) instead of crashing on it.
            parts = name.split('_')
            if len(parts) < 2:
                continue
            rating = parts[1].split('.')[0]
            if not rating.isdigit():
                continue

            text = []
            with open(os.path.join(path, name), "r", encoding='utf-8') as f:
                for line in f:
                    # pre-process each line and accumulate one flat token
                    # list per review
                    text += gensim.utils.simple_preprocess(line)
            texts.append(text)
            ratings.append(rating)

    return [texts, ratings]

In [3]:
# Load the training split first, then the test split; within each split the
# negative-review directory is read before the positive one.
(Xtrain, ytrain), (Xtest, ytest) = [
    processReviews(review_dirs)
    for review_dirs in (
        ["./aclImdb/train/neg/", "./aclImdb/train/pos/"],
        ["./aclImdb/test/neg/", "./aclImdb/test/pos/"],
    )
]

In [9]:
# Peek at the first five tokens and the rating of one review per
# (position, split) pair: index 0 and index 12500 of train and test.
for idx in (0, 12500):
    for tokens, scores in ((Xtrain, ytrain), (Xtest, ytest)):
        print(tokens[idx][:5])
        print(scores[idx])

# Sizes of the four loaded lists.
for caption, loaded in (
    ("# Xtrain: ", Xtrain),
    ("# ytrain: ", ytrain),
    ("# Xtest: ", Xtest),
    ("# ytest: ", ytest),
):
    print(caption, len(loaded))

['story', 'of', 'man', 'who', 'has']
3
['once', 'again', 'mr', 'costner', 'has']
2
['bromwell', 'high', 'is', 'cartoon', 'comedy']
9
['went', 'and', 'saw', 'this', 'movie']
10
# Xtrain:  25000
# ytrain:  25000
# Xtest:  25000
# ytest:  25000


In [5]:
# Merge train and test (in that order) into one corpus so the tokenizer
# sees every review.
X = [*Xtrain, *Xtest]
y = [*ytrain, *ytest]
for caption, merged in (("# X: ", X), ("# y: ", y)):
    print(caption, len(merged))

# X:  50000
# y:  50000


# Word embedding vectors

### Try Keras with GloVe

In [18]:
# Build the vocabulary over the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Map each review to integer word ids and zero-pad at the end
# (padding='post') so all rows have the same length.
sequences = tokenizer.texts_to_sequences(X)
data = pad_sequences(sequences, padding='post')

# One-hot encode the ratings.
labels = to_categorical(np.asarray(y))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print(labels[0])

# Ratings are on a 1-10 scale, so column 0 of the one-hot matrix is never
# set. Drop it: afterwards column k stands for rating k + 1, i.e. columns
# 0-3 are negative reviews (rating <= 4) and columns 6-9 are positive
# reviews (rating >= 7).
labels = labels[:, 1:]
print('Shape of label tensor:', labels.shape)
print(labels[0])

print(data[1])

Found 99476 unique tokens.
Shape of data tensor: (50000, 2380)
Shape of label tensor: (50000, 11)
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
Shape of label tensor: (50000, 10)
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[3776  523   12 ...    0    0    0]


In [21]:
# Parse the GloVe text file: each line is a word followed by its vector
# components, separated by whitespace.
glove_file = './glove.6B/glove.6B.100d.txt'
embeddings_index = {}

with open(glove_file, "r", encoding='utf-8') as glove_fh:
    for row in glove_fh:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [22]:
EMBEDDING_DIM = 100

# One row per tokenizer index plus one extra row for index 0; rows start as
# zeros and stay zero for words without a GloVe vector.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [23]:
# Vocabulary size vs. number of embedding rows (one per line).
print(len(word_index), len(embedding_matrix), sep="\n")

99476
99477


In [24]:
# Spot-check the embedding matrix. The rows for 'king' and 'queen' are looked
# up through word_index rather than hard-coded, so the check stays valid if
# the vocabulary (and therefore the indices) changes.
king_idx = word_index['king']
queen_idx = word_index['queen']
print(king_idx)
print(queen_idx)
print(embedding_matrix[king_idx][:5])
print(embedding_matrix[queen_idx][:5])

#print(list(word_index.items())[100])
# sentence word_index = 101
print(embedding_matrix[101][:5])

# Row 0 is never assigned a vector when the matrix is filled from word_index,
# so it is expected to be all zeros.
print(embedding_matrix[0][:5])

682
1577
[-0.32306999 -0.87616003  0.21977     0.25268     0.22976001]
[-0.50045002 -0.70826     0.55387998  0.67299998  0.22486   ]
[ 0.056951   -0.011958    0.45949    -0.40204999  0.11432   ]
[0. 0. 0. 0. 0.]


In [25]:
# Frozen embedding layer initialised from the GloVe-based matrix.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
)

# Train, validation and test sets

In [19]:
# Separate train, val and test. The first 25000 rows are the training reviews;
# the remaining rows (the test reviews) are split half validation, half test.
#
# The test reviews are stored as all negatives followed by all positives, so
# slicing them in order would produce an all-negative validation set and an
# all-positive test set. Shuffle their indices first (fixed seed, so the split
# is reproducible) and only then cut them in half.
N_TRAIN = 25000
SPLIT_SEED = 0

X_train, y_train = data[:N_TRAIN], labels[:N_TRAIN]

heldout_idx = N_TRAIN + np.random.RandomState(SPLIT_SEED).permutation(len(data) - N_TRAIN)
n_val = len(heldout_idx) // 2
val_idx, test_idx = heldout_idx[:n_val], heldout_idx[n_val:]

X_val, y_val = data[val_idx], labels[val_idx]
X_test, y_test = data[test_idx], labels[test_idx]

In [20]:
# Report how many samples ended up in each split.
for caption, split in (
    ("# x_train: ", X_train),
    ("# x_val: ", X_val),
    ("# x_test: ", X_test),
):
    print(caption, len(split))

# x_train:  25000
# x_val:  12500
# x_test:  12500


# Models

### Vanilla RNN

In [26]:



# parameters for data load
num_words = len(word_index)
print("num_words :", num_words)

def vanilla_rnn(num_words, num_outputs, state=50, lra=0.001, input_length=None):
    """Build and compile a SimpleRNN classifier on frozen pre-trained embeddings.

    Architecture: Embedding (weights taken from the module-level
    ``embedding_matrix``, not trainable) -> SimpleRNN -> Dense softmax.

    Args:
        num_words: vocabulary size; the embedding has ``num_words + 1`` rows,
            which must match ``embedding_matrix.shape[0]``.
        num_outputs: number of output classes.
        state: number of units in the SimpleRNN layer.
        lra: learning rate for the Adam optimizer.
        input_length: optional fixed sequence length for the embedding layer.
            The default ``None`` leaves it unspecified, so the model is not
            tied to one particular padded length.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, accuracy metric).
    """
    model = Sequential()
    # Take the embedding width from the matrix itself instead of repeating
    # the constant, so the two can never disagree.
    model.add(Embedding(num_words + 1,
                        embedding_matrix.shape[1],
                        input_length=input_length,
                        weights=[embedding_matrix],
                        trainable=False))
    # No input_shape here: this is not the first layer, its input shape is
    # determined by the embedding output.
    model.add(SimpleRNN(units=state, return_sequences=False))
    model.add(Dense(num_outputs, activation='softmax'))

    adam = optimizers.Adam(lr=lra)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model



num_words : 99476
Train on 25000 samples, validate on 12500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

KeyboardInterrupt: 

### LSTM

# Setup Models

In [None]:
def runModel(epoch=100, batch=50, state=50, lr=0.001, test=False):
    """Train and evaluate one vanilla RNN configuration and save the results.

    Uses the module-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test`` arrays. The training history and the test score are
    written as pretty-printed text files under ``./experiments/``.

    Args:
        epoch: number of training epochs.
        batch: mini-batch size.
        state: number of units in the RNN layer.
        lr: learning rate passed to the model builder.
        test: if True, run a short smoke test of 10 epochs regardless of
            ``epoch``.

    Returns:
        A tuple ``(history, testscore)``: the Keras ``History`` object from
        ``fit`` and the ``[loss, accuracy]`` list from ``evaluate``.
    """
    if test:
        epoch = 10

    model = vanilla_rnn(num_words=len(word_index), num_outputs=y_train.shape[1],
                        state=state, lra=lr)
    model.summary()
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=epoch, batch_size=batch, verbose=1)

    testscore = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', testscore[0])
    print('Test accuracy:', testscore[1])

    # Build the file name with format(): the hyper-parameters are numbers and
    # cannot be concatenated to strings with '+'.
    filename = "epoch_{}_batch_{}_state_{}_lr_{}".format(epoch, batch, state, lr)
    out_dir = './experiments'
    os.makedirs(out_dir, exist_ok=True)

    # save history to file
    with open(os.path.join(out_dir, 'history_' + filename), 'w', encoding="utf-8") as fout:
        pprint(history.history, fout)

    # save score to file
    with open(os.path.join(out_dir, 'testscore_' + filename), 'w', encoding="utf-8") as fout:
        pprint(testscore, fout)

    return history, testscore

In [None]:
# Hyper-parameter values for the experiments.
# Only a single configuration is active for now; the full grids that were
# considered are states [20, 50, 100, 200, 500] and lrs [0.1, 0.01, 0.001].
batches = [50, 100, 200]
epochs = [100, 200]
repeats = 10
states = [20]
lrs = [0.001]

# Run every active (state, learning rate) combination.
for state, lr in ((s, r) for s in states for r in lrs):
    runModel(lr=lr, state=state)

# Batch-size and epoch sweeps are not enabled yet:
#     for batch in batches: runModel(batch=batch, state=state)
#     for epoch in epochs:  runModel(epoch=epoch, state=state)