In [1]:
%matplotlib inline

import sys
import pandas as pd
import numpy as np

from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, TimeDistributed, Dense, CuDNNGRU, CuDNNLSTM, Bidirectional

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#set hyper parameters
MAX_NUM_WORDS = 30000
EMBEDDING_DIM = 200
VALIDATION_SPLIT = 0.1
TEST_SPLIT=0.1
NUM_FILTERS = 50
MAX_LEN = 512
Batch_size = 100
EPOCHS = 2

DATASET = '/home/ruan/Envs/data/yelp_2013.csv'

glove_path = '/home/ruan/Envs/data/glove.twitter.27B.200d.txt'
embeddings_file = 'embedding_dict.pkl'

SAMPLE_SIZE = 0.8

In [3]:
%%time

df = pd.read_csv(DATASET, encoding='latin-1')
sz = int(df.shape[0] * SAMPLE_SIZE)
df = df.sample(sz)

Y = df.stars.values-1
Y = to_categorical(Y,num_classes=5)
X = df.text.values


CPU times: user 2.76 s, sys: 116 ms, total: 2.88 s
Wall time: 2.88 s


In [4]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,stars,date,year
244921,2384806,2384806,i'm torn about what rating to give. the burger...,3,2013-06-10,2013
418605,4214926,4214926,"first, let me say that i desperately wanted ...",2,2013-11-03,2013
67213,641291,641291,love this place! safe well lit!\nimportant w...,5,2013-05-27,2013
252352,2461008,2461008,"this is a wonderful little place to visit, a m...",5,2013-07-21,2013
425716,4288792,4288792,went there for a drink with friends. we wanted...,2,2013-08-13,2013


In [5]:
#shuffle the data
indices = np.arange(X.shape[0])
np.random.seed(2018)
np.random.shuffle(indices)
X=X[indices]
Y=Y[indices]


In [6]:
#training set, validation set and testing set
nb_validation_samples_val = int((VALIDATION_SPLIT + TEST_SPLIT) * X.shape[0])
nb_validation_samples_test = int(TEST_SPLIT * X.shape[0])

x_train = X[:-nb_validation_samples_val]
y_train = Y[:-nb_validation_samples_val]
x_val =  X[-nb_validation_samples_val:-nb_validation_samples_test]
y_val =  Y[-nb_validation_samples_val:-nb_validation_samples_test]
x_test = X[-nb_validation_samples_test:]
y_test = Y[-nb_validation_samples_test:]

In [7]:
#use tokenizer to build vocab
tokenizer1 = Tokenizer(num_words=MAX_NUM_WORDS)

#tokenizer1.fit_on_texts(df.text)

tokenizer1.fit_on_texts(df.text)
vocab = tokenizer1.word_index

x_train_word_ids = tokenizer1.texts_to_sequences(x_train)
x_test_word_ids = tokenizer1.texts_to_sequences(x_test)
x_val_word_ids = tokenizer1.texts_to_sequences(x_val)

#pad sequences into the same length
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=MAX_LEN)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=MAX_LEN)
x_val_padded_seqs = pad_sequences(x_val_word_ids, maxlen=MAX_LEN)


In [8]:
%%time

#slice sequences into many subsequences
x_test_padded_seqs_split=[]
for i in range(x_test_padded_seqs.shape[0]):
    split1=np.split(x_test_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_test_padded_seqs_split.append(a)
    
x_val_padded_seqs_split=[]
for i in range(x_val_padded_seqs.shape[0]):
    split1=np.split(x_val_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_val_padded_seqs_split.append(a)
   
    
x_train_padded_seqs_split=[]
for i in range(x_train_padded_seqs.shape[0]):
    split1=np.split(x_train_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_train_padded_seqs_split.append(a)

CPU times: user 1min 8s, sys: 968 ms, total: 1min 9s
Wall time: 1min 9s


In [9]:
%%time

import pickle

glove_path = '/home/ruan/Envs/data/glove.twitter.27B.200d.txt'
embeddings_file = 'embedding_dict.pkl'

def make(f):
    embeddings_index = {}
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

try:
    open(embeddings_file,'r').close()
    with open(embeddings_file,'rb') as f:
        embeddings_index = pickle.load(f)
except:
    open(embeddings_file,'w+').close()
    f = open(glove_path)
    embeddings_index = make(f)
    with open(embeddings_file,'wb') as f:
        pickle.dump(embeddings_index,f)

CPU times: user 3.12 s, sys: 1.02 s, total: 4.15 s
Wall time: 4.24 s


In [10]:
#use pre-trained GloVe word embeddings to initialize the embedding layer
embedding_matrix = np.random.random((MAX_NUM_WORDS + 1, EMBEDDING_DIM))
for word, i in vocab.items():
    if i<MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
        # words not found in embedding index will be random initialized.
            embedding_matrix[i] = embedding_vector
            
embedding_layer = Embedding(MAX_NUM_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_LEN//64,
trainable=True)

In [11]:
input1 = Input(shape=(MAX_LEN//64,), dtype='int32')
embed = embedding_layer(input1)
gru1 = Bidirectional(CuDNNGRU(NUM_FILTERS))(embed)
gru1 = CuDNNGRU(NUM_FILTERS)(embed)
Encoder1 = Model(input1, gru1)

input2 = Input(shape=(8,MAX_LEN//64,), dtype='int32')
embed2 = TimeDistributed(Encoder1)(input2)
gru2 = Bidirectional(CuDNNGRU(NUM_FILTERS))(embed2)
gru1 = CuDNNGRU(NUM_FILTERS)(embed2)
Encoder2 = Model(input2,gru2)

input3 = Input(shape=(8,8,MAX_LEN//64), dtype='int32')
embed3 = TimeDistributed(Encoder2)(input3)
gru3 = Bidirectional(CuDNNGRU(NUM_FILTERS))(embed3)
gru3 = CuDNNGRU(NUM_FILTERS)(embed3)
preds = Dense(5, activation='softmax')(gru3)
model = Model(input3, preds)


print(Encoder1.summary())
print(Encoder2.summary())
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 8)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 8, 200)            6000200   
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (None, 50)                37800     
Total params: 6,038,000
Trainable params: 6,038,000
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 8, 8)              0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 50)             6038000   
_____________________________________________________________

In [12]:
sample_mode = False


# avoid overfitting: epsilon=0.1 
# converge quickly: epsilon=0.0001

if sample_mode == False:
    #use adam optimizer
    from keras.optimizers import Adam
    opt = Adam(
        lr=0.001, 
        beta_1=0.9, 
        beta_2=0.825, 
        epsilon=1e-08)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    
    #save the best model on validation set
    from keras.callbacks import ModelCheckpoint             
    savebestmodel = 'biSRNN(8,2)_stanford.h5'
    checkpoint = ModelCheckpoint(savebestmodel, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks=[checkpoint] 
                 
    model.fit(np.array(x_train_padded_seqs_split), y_train, 
              validation_data = (np.array(x_val_padded_seqs_split), y_val),
              nb_epoch = EPOCHS, 
              batch_size = Batch_size,
              callbacks = callbacks,
              verbose = 1)
    
    #use the best model to evaluate on test set
    from keras.models import load_model
    best_model= load_model(savebestmodel)
    loss,acc = tuple(best_model.evaluate(np.array(x_test_padded_seqs_split),y_test,batch_size=Batch_size))
    print('TEST ACCURACY: ', acc)



Train on 299909 samples, validate on 37489 samples
Epoch 1/2
Epoch 00001: val_acc improved from -inf to 0.65801, saving model to biSRNN(8,2)_stanford.h5
Epoch 2/2
Epoch 00002: val_acc improved from 0.65801 to 0.66235, saving model to biSRNN(8,2)_stanford.h5
TEST ACCURACY:  0.6665332923828919


In [13]:
new_model = Model(inputs=model.input,
                  outputs=model.get_layer("cu_dnngru_6").output)

In [14]:
%%time

x_train_svm = []

tsvm_data = 'thundersvm_train_dataset.txt'
tsvm_list = []

tsvm_test = 'thundersvm_test_dataset.txt'
tsvm_test_list = []

open(tsvm_data,'w+').close()
open(tsvm_test,'w+').close()

x_train_svm = [ np.array(x) for x in x_train_padded_seqs_split ] 
x_val_svm = [ np.array(x) for x in x_val_padded_seqs_split ]
x_test_svm = [ np.array(x) for x in x_test_padded_seqs_split ]



CPU times: user 8.31 s, sys: 219 ms, total: 8.53 s
Wall time: 8.39 s


In [15]:
%%time

y_train_svm = [np.argmax(np.array(x)) for x in y_train]
y_val_svm = [np.argmax(np.array(x)) for x in y_val]
y_test_svm = [np.argmax(np.array(x)) for x in y_test]

CPU times: user 736 ms, sys: 2.68 ms, total: 739 ms
Wall time: 728 ms


In [16]:
%%time

x_train_svm = (lambda x: new_model.predict(np.array(x)))(x_train_svm)
x_val_svm = (lambda x: new_model.predict(np.array(x)))(x_val_svm)
x_test_svm = (lambda x: new_model.predict(np.array(x)))(x_test_svm)

CPU times: user 52.8 s, sys: 3.33 s, total: 56.2 s
Wall time: 45.8 s


In [17]:
%%time

tsvm_list = []
for i,j in enumerate(y_train_svm):
    s = '{0} '.format(j)
    for k,m in enumerate(x_train_svm[i]):
        s += '{0}:{1} '.format(k+1,m)
    tsvm_list.append(s)

tsvm_test_list = []
for i,j in enumerate(y_test_svm):
    s = '{0} '.format(j)
    for k,m in enumerate(x_test_svm[i]):
        s += '{0}:{1} '.format(k+1,m)
    tsvm_test_list.append(s)


CPU times: user 27.8 s, sys: 151 ms, total: 28 s
Wall time: 27.5 s


In [18]:
tsvm_string = '\n'.join(tsvm_list)

tsvm_test_string = '\n'.join(tsvm_test_list)
    
with open(tsvm_data,'w') as f:
    print(tsvm_string,file=f)

with open(tsvm_test,'w') as f:
    print(tsvm_test_string,file=f)