In [1]:
%matplotlib inline

import sys
import pandas as pd
import numpy as np

from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, TimeDistributed, Dense, CuDNNGRU, Bidirectional

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#set hyper parameters
MAX_NUM_WORDS = 30000
EMBEDDING_DIM = 200
VALIDATION_SPLIT = 0.1
TEST_SPLIT=0.1
NUM_FILTERS = 50
MAX_LEN = 64
Batch_size = 100
EPOCHS = 10
SAMPLE_SIZE = int(144000 * 1)


In [3]:
%%time

DATASET = '/home/ruan/Envs/data/stanford.csv'

df = pd.read_csv(DATASET, encoding='latin-1')
df = df.sample(SAMPLE_SIZE)
sample_mode = True

Y = df.iloc[:, 0].values

Y = to_categorical(Y,num_classes=5)

X = df.iloc[:, 5].values


CPU times: user 3.32 s, sys: 175 ms, total: 3.49 s
Wall time: 3.5 s


In [4]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
99843,0,1793754512,Thu May 14 03:52:38 PDT 2009,NO_QUERY,ajoeyismymascot,i cant go to picnic...
1201991,4,1985771200,Sun May 31 17:44:28 PDT 2009,NO_QUERY,georgexmoney,"@lisalautner ohh ok, i get it lol"
1540306,4,2180377119,Mon Jun 15 10:23:16 PDT 2009,NO_QUERY,GreggiePhat,"@AmbieAm, @SmediumShawn y'all need to quit! Li..."
976855,4,1833677563,Mon May 18 00:04:36 PDT 2009,NO_QUERY,kalenabear,is crashing at Jane's place until Thursday! Wh...
1096614,4,1970329656,Sat May 30 03:25:06 PDT 2009,NO_QUERY,daymo88,Mission Statement or something like that | ps...


In [5]:
#shuffle the data
indices = np.arange(X.shape[0])
np.random.seed(2018)
np.random.shuffle(indices)
X=X[indices]
Y=Y[indices]


In [6]:
#training set, validation set and testing set
nb_validation_samples_val = int((VALIDATION_SPLIT + TEST_SPLIT) * X.shape[0])
nb_validation_samples_test = int(TEST_SPLIT * X.shape[0])

x_train = X[:-nb_validation_samples_val]
y_train = Y[:-nb_validation_samples_val]
x_val =  X[-nb_validation_samples_val:-nb_validation_samples_test]
y_val =  Y[-nb_validation_samples_val:-nb_validation_samples_test]
x_test = X[-nb_validation_samples_test:]
y_test = Y[-nb_validation_samples_test:]


In [7]:
#use tokenizer to build vocab
tokenizer1 = Tokenizer(num_words=MAX_NUM_WORDS)

#tokenizer1.fit_on_texts(df.text)
tokenizer1.fit_on_texts(df.iloc[:, 5])
vocab = tokenizer1.word_index

x_train_word_ids = tokenizer1.texts_to_sequences(x_train)
x_test_word_ids = tokenizer1.texts_to_sequences(x_test)
x_val_word_ids = tokenizer1.texts_to_sequences(x_val)

#pad sequences into the same length
x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=MAX_LEN)
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=MAX_LEN)
x_val_padded_seqs = pad_sequences(x_val_word_ids, maxlen=MAX_LEN)


In [8]:
%%time

#slice sequences into many subsequences
x_test_padded_seqs_split=[]
for i in range(x_test_padded_seqs.shape[0]):
    split1=np.split(x_test_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_test_padded_seqs_split.append(a)
    
x_val_padded_seqs_split=[]
for i in range(x_val_padded_seqs.shape[0]):
    split1=np.split(x_val_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_val_padded_seqs_split.append(a)
   
    
x_train_padded_seqs_split=[]
for i in range(x_train_padded_seqs.shape[0]):
    split1=np.split(x_train_padded_seqs[i],8)
    a=[]
    for j in range(8):
        s=np.split(split1[j],8)
        a.append(s)
    x_train_padded_seqs_split.append(a)


CPU times: user 25.3 s, sys: 385 ms, total: 25.6 s
Wall time: 25.6 s


In [9]:
%%time

#load pre-trained GloVe word embeddings
print("Using GloVe embeddings")
glove_path = '/home/ruan/Envs/data/glove.twitter.27B.200d.txt'
embeddings_index = {}
f = open(glove_path)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Using GloVe embeddings
Found 1193514 word vectors.
CPU times: user 1min 20s, sys: 1.23 s, total: 1min 21s
Wall time: 1min 21s


In [10]:
#use pre-trained GloVe word embeddings to initialize the embedding layer
embedding_matrix = np.random.random((MAX_NUM_WORDS + 1, EMBEDDING_DIM))
for word, i in vocab.items():
    if i<MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
        # words not found in embedding index will be random initialized.
            embedding_matrix[i] = embedding_vector
            
embedding_layer = Embedding(MAX_NUM_WORDS + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_LEN//64,
trainable=True)

In [11]:
input1 = Input(shape=(MAX_LEN//64,), dtype='int32')
embed = embedding_layer(input1)
gru1 = CuDNNGRU(NUM_FILTERS)(embed)
Encoder1 = Model(input1, gru1)

input2 = Input(shape=(8,MAX_LEN//64,), dtype='int32')
embed2 = TimeDistributed(Encoder1)(input2)
gru2 = CuDNNGRU(NUM_FILTERS)(embed2)
Encoder2 = Model(input2,gru2)

input3 = Input(shape=(8,8,MAX_LEN//64), dtype='int32')
embed3 = TimeDistributed(Encoder2)(input3)
gru3 = CuDNNGRU(NUM_FILTERS)(embed3)
preds = Dense(5, activation='softmax')(gru3)
model = Model(input3, preds)

print(Encoder1.summary())
print(Encoder2.summary())
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1, 200)            6000200   
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, 50)                37800     
Total params: 6,038,000
Trainable params: 6,038,000
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 8, 1)              0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 50)             6038000   
_____________________________________________________________

In [12]:
sample_mode = False

EPOCHS = 2

# avoid overfitting: epsilon=0.1 
# converge quickly: epsilon=0.0001

if sample_mode == False:
    #use adam optimizer
    from keras.optimizers import Adam
    opt = Adam(
        lr=0.01, 
        beta_1=0.9, 
        beta_2=0.9, 
        epsilon=0.00001)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    
    #save the best model on validation set
    from keras.callbacks import ModelCheckpoint             
    savebestmodel = 'biSRNN(8,2)_stanford.h5'
    checkpoint = ModelCheckpoint(savebestmodel, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks=[checkpoint] 
                 
    model.fit(np.array(x_train_padded_seqs_split), y_train, 
              validation_data = (np.array(x_val_padded_seqs_split), y_val),
              nb_epoch = EPOCHS, 
              batch_size = Batch_size,
              callbacks = callbacks,
              verbose = 1)
    
    #use the best model to evaluate on test set
    from keras.models import load_model
    best_model= load_model(savebestmodel)
    print(best_model.evaluate(np.array(x_test_padded_seqs_split),y_test,batch_size=Batch_size))
    sys.exit(0)



Train on 115200 samples, validate on 14400 samples
Epoch 1/2
Epoch 00001: val_acc improved from -inf to 0.80007, saving model to biSRNN(8,2)_stanford.h5
Epoch 2/2
Epoch 00002: val_acc did not improve
[0.4391375875307454, 0.793819443633159]


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from keras.callbacks import Callback
import matplotlib.pyplot as plt
from keras import backend as K

class LRFinder(Callback):
    
    def __init__(self, min_lr=1e-5, max_lr=1e-2, steps_per_epoch=None, epochs=None):
        super().__init__()
        
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_iterations = steps_per_epoch * epochs
        self.iteration = 0
        self.history = {}
        
    def clr(self):
        '''Calculate the learning rate.'''
        x = self.iteration / self.total_iterations 
        return self.min_lr + (self.max_lr-self.min_lr) * x
        
    def on_train_begin(self, logs=None):
        '''Initialize the learning rate to the minimum value at the start of training.'''
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.min_lr)
        
    def on_batch_end(self, epoch, logs=None):
        '''Record previous batch statistics and update the learning rate.'''
        logs = logs or {}
        self.iteration += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.iteration)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
            
        K.set_value(self.model.optimizer.lr, self.clr())
 
    def plot_lr(self):
        '''Helper function to quickly inspect the learning rate schedule.'''
        plt.plot(self.history['iterations'], self.history['lr'])
        plt.yscale('log')
        plt.xlabel('Iteration')
        plt.ylabel('Learning rate')
        
    def plot_loss(self):
        '''Helper function to quickly observe the learning rate experiment results.'''
        plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
        plt.plot(self.history['lr'], self.history['loss'])
        plt.xscale('log')
        plt.xlabel('Learning rate')
        plt.ylabel('Loss')
        
        
#use adam optimizer
from keras.optimizers import Adam

opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
metrics=['acc'])

lr_finder = LRFinder(min_lr=1e-5,
                                 max_lr=1e-2,
                                 steps_per_epoch=np.ceil(EPOCHS/Batch_size),
                                 epochs=3)

model.fit(np.array(x_train_padded_seqs_split), y_train, 
          validation_data = (np.array(x_val_padded_seqs_split), y_val),
          nb_epoch = EPOCHS, 
          batch_size = Batch_size,
          callbacks = [lr_finder],
         verbose = 0)

lr_finder.plot_loss()

plt.show()


In [None]:
pd.set_option('display.max_colwidth', -1)

test_results = []
for b2 in [0.825, 0.9, 0.999]:
    print('beta_2 : ',b2)
    opt = Adam(
               lr=0.01, 
               beta_1=0.9, 
               beta_2=b2, 
               epsilon=1.0
              )
    
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    
    #save the best model on validation set
    from keras.callbacks import ModelCheckpoint             
    savebestmodel = 'biSRNN(8,2)_stanford.h5'
    checkpoint = ModelCheckpoint(savebestmodel, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks=[checkpoint] 
                 
    model.fit(np.array(x_train_padded_seqs_split), y_train, 
              validation_data = (np.array(x_val_padded_seqs_split), y_val),
              nb_epoch = EPOCHS, 
              batch_size = Batch_size,
              callbacks = callbacks,
              verbose = 0)
    
    #use the best model to evaluate on test set
    from keras.models import load_model
    best_model= load_model(savebestmodel)
    loss, acc = tuple(best_model.evaluate(np.array(x_test_padded_seqs_split),y_test,batch_size=Batch_size))
    test_results.append([b2, str(loss), str(acc)])

In [None]:
headers = ['beta_2','loss','acc']
test_results_df = pd.DataFrame(test_results,columns=headers)
test_results_df.sort_values(by='loss')
test_results_df.head()