In [1]:
%matplotlib inline

import sys
import pandas as pd
import numpy as np

from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, TimeDistributed, Dense, CuDNNGRU, CuDNNLSTM, Bidirectional

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#set hyper parameters
MAX_NUM_WORDS = 30000
EMBEDDING_DIM = 200
VALIDATION_SPLIT = 0.1
TEST_SPLIT=0.1
NUM_FILTERS = 50
MAX_LEN = 64
Batch_size = 100
EPOCHS = 10


In [3]:
%%time

DATASET = '/home/ruan/Envs/data/semeval.csv'

X_PATH = '/home/ruan/Envs/srnn/DataTrain/train_us_text.txt'
Y_PATH = '/home/ruan/Envs/srnn/DataTrain/train_us_labels.txt'

x_values = []
y_values = []
with open(X_PATH,'r') as f:
    for line in f:
        x_values.append(line.rstrip('\n'))
with open(Y_PATH,'r') as f:
    for line in f:
        try:
            y_values.append(int(line.rstrip('\n')))
        except:
            y_values.append(line.rstrip('\n'))

if len(x_values) < len(y_values):
    y_values = y_values[0:len(x_values)]
else:
    x_values = x_values[0:len(y_values)]
        
frame = { x_values[0] : x_values[1:], y_values[0] : y_values[1:] }
    
df = pd.DataFrame.from_dict(frame)

Y = df.iloc[:, 1].values

Y = to_categorical(Y,num_classes=20)

X = df.iloc[:, 0].values

CPU times: user 623 ms, sys: 61.7 ms, total: 685 ms
Wall time: 687 ms


In [4]:
df.head()

Unnamed: 0,text,labels
0,"LoL @ West Covina, California",2
1,Things got a little festive at the office #chr...,17
2,Step out and explore. # ️ @ Ellis Island Cafe,0
3,@user @ Cathedral Preparatory School,18
4,My baby bear @ Bubby's,1


In [5]:
#shuffle the data
indices = np.arange(X.shape[0])
np.random.seed(2018)
np.random.shuffle(indices)
X=X[indices]
Y=Y[indices]


In [6]:
#training set, validation set and testing set
nb_validation_samples_val = int((VALIDATION_SPLIT + TEST_SPLIT) * X.shape[0])
nb_validation_samples_test = int(TEST_SPLIT * X.shape[0])

x_train = X[:-nb_validation_samples_val]
y_train = Y[:-nb_validation_samples_val]
x_val =  X[-nb_validation_samples_val:-nb_validation_samples_test]
y_val =  Y[-nb_validation_samples_val:-nb_validation_samples_test]
x_test = X[-nb_validation_samples_test:]
y_test = Y[-nb_validation_samples_test:]

In [7]:
#use tokenizer to build vocab
tokenizer1 = Tokenizer(num_words=MAX_NUM_WORDS)

#tokenizer1.fit_on_texts(df.text)

tokenizer1.fit_on_texts(df.iloc[:, 0])
vocab = tokenizer1.word_index

x_train_word_ids = tokenizer1.texts_to_sequences(x_train)
x_test_word_ids = tokenizer1.texts_to_sequences(x_test)
x_val_word_ids = tokenizer1.texts_to_sequences(x_val)

#pad sequences into the same length
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=MAX_LEN)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=MAX_LEN)
x_val_padded_seqs = pad_sequences(x_val_word_ids, maxlen=MAX_LEN)


In [8]:
%%time

#slice sequences into many subsequences
x_test_padded_seqs_split=[]
for i in range(x_test_padded_seqs.shape[0]):
    split1=np.split(x_test_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_test_padded_seqs_split.append(a)
    
x_val_padded_seqs_split=[]
for i in range(x_val_padded_seqs.shape[0]):
    split1=np.split(x_val_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_val_padded_seqs_split.append(a)
   
    
x_train_padded_seqs_split=[]
for i in range(x_train_padded_seqs.shape[0]):
    split1=np.split(x_train_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_train_padded_seqs_split.append(a)

CPU times: user 1min 24s, sys: 1.26 s, total: 1min 25s
Wall time: 1min 25s


In [9]:
%%time

#load pre-trained GloVe word embeddings
print("Using GloVe embeddings")
glove_path = '/home/ruan/Envs/data/glove.twitter.27B.200d.txt'
embeddings_index = {}
f = open(glove_path)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Using GloVe embeddings
Found 1193514 word vectors.
CPU times: user 1min 15s, sys: 1.1 s, total: 1min 16s
Wall time: 1min 16s


In [10]:
#use pre-trained GloVe word embeddings to initialize the embedding layer
embedding_matrix = np.random.random((MAX_NUM_WORDS + 1, EMBEDDING_DIM))
for word, i in vocab.items():
    if i<MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
        # words not found in embedding index will be random initialized.
            embedding_matrix[i] = embedding_vector
            
embedding_layer = Embedding(MAX_NUM_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_LEN//64,
trainable=True)

In [11]:
input1 = Input(shape=(MAX_LEN//64,), dtype='int32')
embed = embedding_layer(input1)
gru1 = Bidirectional(CuDNNGRU(NUM_FILTERS))(embed)
Encoder1 = Model(input1, gru1)

input2 = Input(shape=(8,MAX_LEN//64,), dtype='int32')
embed2 = TimeDistributed(Encoder1)(input2)
gru2 = Bidirectional(CuDNNGRU(NUM_FILTERS))(embed2)
Encoder2 = Model(input2,gru2)

input3 = Input(shape=(8,8,MAX_LEN//64), dtype='int32')
embed3 = TimeDistributed(Encoder2)(input3)
gru3 = Bidirectional(CuDNNGRU(NUM_FILTERS))(embed3)
preds = Dense(20, activation='softmax')(gru3)
model = Model(input3, preds)


print(Encoder1.summary())
print(Encoder2.summary())
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1, 200)            6000200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100)               75600     
Total params: 6,075,800
Trainable params: 6,075,800
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 8, 1)              0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 100)            6075800   
_____________________________________________________________

In [12]:
sample_mode = False

EPOCHS = 2

# avoid overfitting: epsilon=0.1 
# converge quickly: epsilon=0.0001

if sample_mode == False:
    #use adam optimizer
    from keras.optimizers import Adam
    opt = Adam(
        lr=0.01, 
        beta_1=0.9, 
        beta_2=0.9, 
        epsilon=1e-04)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    
    #save the best model on validation set
    from keras.callbacks import ModelCheckpoint             
    savebestmodel = 'biSRNN(8,2)_stanford.h5'
    checkpoint = ModelCheckpoint(savebestmodel, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks=[checkpoint] 
                 
    model.fit(np.array(x_train_padded_seqs_split), y_train, 
              validation_data = (np.array(x_val_padded_seqs_split), y_val),
              nb_epoch = EPOCHS, 
              batch_size = Batch_size,
              callbacks = callbacks,
              verbose = 1)
    
    #use the best model to evaluate on test set
    from keras.models import load_model
    best_model= load_model(savebestmodel)
    loss,acc = tuple(best_model.evaluate(np.array(x_test_padded_seqs_split),y_test,batch_size=Batch_size))
    print('TEST ACCURACY: ', acc)



Train on 389423 samples, validate on 48678 samples
Epoch 1/2
Epoch 00001: val_acc improved from -inf to 0.43451, saving model to biSRNN(8,2)_stanford.h5
Epoch 2/2
Epoch 00002: val_acc improved from 0.43451 to 0.44268, saving model to biSRNN(8,2)_stanford.h5
1.8748150931796275   0.4367360332627754


In [15]:
new_model = Model(inputs=model.input,
                  outputs=model.get_layer("bidirectional_3").output)

In [16]:
%%time

x_train_svm = []

tsvm_data = 'thundersvm_train_dataset.txt'
tsvm_list = []

tsvm_test = 'thundersvm_test_dataset.txt'
tsvm_test_list = []

open(tsvm_data,'w+').close()
open(tsvm_test,'w+').close()

x_train_svm = []
for seq in x_train_padded_seqs_split:
    a = new_model.predict(np.array([seq]))
    x_train_svm.append(a[0])
    
x_val_svm = []
for seq in x_val_padded_seqs_split:
    a = new_model.predict(np.array([seq]))
    x_val_svm.append(a[0])    

x_test_svm = []
for seq in x_test_padded_seqs_split:
    a = new_model.predict(np.array([seq]))
    x_test_svm.append(a[0])


CPU times: user 40min 16s, sys: 2min 8s, total: 42min 25s
Wall time: 35min 27s


In [None]:
%%time

y_train_svm = []
for label in y_train:
    idx = np.argmax(np.array(label))
    y_train_svm.append(idx)

y_val_svm = []
for label in y_val:
    idx = np.argmax(np.array(label))
    y_val_svm.append(idx)    

y_test_svm = []
for label in y_test:
    idx = np.argmax(np.array(label))
    y_test_svm.append(idx)    

In [None]:
%%time

for i,j in enumerate(y_train_svm):
    s = '{0} '.format(j)
    for k,m in enumerate(x_train_svm[i]):
        s += '{0}:{1} '.format(k+1,m)
    tsvm_list.append(s)

tsvm_string = '\n'.join(tsvm_list)
    
with open(tsvm_data,'w') as f:
    print(tsvm_string,file=f)

for i,j in enumerate(y_test_svm):
    s = '{0} '.format(j)
    for k,m in enumerate(x_test_svm[i]):
        s += '{0}:{1} '.format(k+1,m)
    tsvm_test_list.append(s)

tsvm_test_string = '\n'.join(tsvm_test_list)
    
with open(tsvm_test,'w') as f:
    print(tsvm_test_string,file=f)