In [2]:
from lib import data_split, features_word2vec, model_lstm, model_randomforest
import pandas as pd
import os

# Read data
# Use the kaggle Bag of words vs Bag of popcorn data:
# The data is downloaded from: 
# https://www.kaggle.com/c/word2vec-nlp-tutorial/data
data = pd.read_csv("./data/labeledTrainData.tsv", header=0,
                   delimiter="\t", quoting=3, encoding="utf-8")

print("The labeled training set dimension is:\n")
print(data.shape)

data2 = pd.read_csv("./data/unlabeledTrainData.tsv", header=0,
                          delimiter="\t", quoting=3, encoding="utf-8")

print("The unlabeled training set dimension is:\n")
print(data.shape)

# Labeled data(data) and Unlabeled data(data2) 
# are combined to train the word2vec model
data2.append(data)
print(data2.shape)

Couldn't import dot_parser, loading of dot files will not be possible.
The labeled training set dimension is:

(25000, 3)
The unlabeled training set dimension is:

(25000, 3)
(50000, 2)


Using Theano backend.


In [3]:
# Defined path of word2vec model 
model_path = "./model/300features_40minwords_10context"

# If we have a pre-trained model we'd like to use, it can be loaded here directly. 
# Otherwise we will use the existing data to train it from scratch 
if not os.path.isfile(model_path):
    model = features_word2vec.get_word2vec_model(data2, "review", num_features=300, downsampling=1e-3, model_name=model_path)
else:
    # After model is created, we can load it as an existing file
    model = features_word2vec.load_word2vec_model(model_name=model_path)

In [4]:
# Create word embeddings, which is essentially a dictionary
# that maps word indices to word2vec features
embedding_weights = features_word2vec.create_embedding_weights(model)
print(embedding_weights.shape)

(34121, 300)


In [5]:
# We also need to prepare the word2vec features, so that they are 
# each word is now mapped to an index, consistents with the training embedding 
# Currently, we are limiting each review article to 500 words. 
# By default, we pad the LHS of each vector with zeros.  
# e.g [ 0, 0, 0 .... 0.27, 0.89, 0.35]
features = features_word2vec.get_indices_word2vec(data, "review", model, maxLength=500,
                         writeIndexFileName="./model/imdb_indices.pickle", padLeft=True )

print(embedding_weights.shape)

(34121, 300)


In [6]:
# Now we separate data for training and validation 
y = data["sentiment"]
X_train, y_train, X_test, y_test = data_split.train_test_split_shuffle(y, features, test_size = 0.1)

In [7]:
# Here we are going to run a few experiments: 
# 1. classify IMDB data with LSTM + word2vec embedding only 
# 2. classify IMDB data with LSTM + one layer of CNN + word2vec embedding
# 3. classify IMDB data with LSTM, no embedding 
# 4. Look back at the baseline, IMDB data + random forest

In [8]:
# 1. classify IMDB data with LSTM + word2vec embedding only 
# Accuracy is 0.8692 after three iterations 
model_lstm.classif_imdb( X_train, y_train, X_test, y_test, embedding_weights = embedding_weights, 
                        dense_dim = 256, nb_epoch = 3 )

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


  if embedding_weights == None and word2vec_model != None:
  elif embedding_weights == None and word2vec_model == None:


In [9]:
# 2. classify IMDB data with LSTM + one layer of CNN + word2vec embedding
# Accuracy is 0.8904 after two iterations. 
# Compare this with the keras LSTM + CNN code, without pre-embedding: 
# https://github.com/fchollet/keras/blob/master/examples/imdb_cnn_lstm.py 
# The accuracy there was 0.8498. 
model_lstm.classif_imdb(X_train, y_train, X_test, y_test, embedding_weights=embedding_weights, dense_dim=256,
                        nb_epoch=3, include_cnn = True)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
# 3. classify IMDB data with LSTM, no embedding 
model_lstm.classif_imdb( X_train, y_train, X_test, y_test, embedding_weights = None, dense_dim = 256, nb_epoch = 3 )

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
# 4. Look back at the baseline, IMDB data + random forest
# Compare with RF
# Accuracy here is 0.815 
features_avg_word2vec = features_word2vec.get_avgfeatures_word2vec(data, "review", model)
X_train, y_train, X_test, y_test = data_split.train_test_split_shuffle(y, features_avg_word2vec, test_size=0.1)
model_randomforest.classif(X_train, y_train, X_test, y_test)

0.8148
[[ 992  258]
 [ 205 1045]]
