In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# !cp "gdrive/My Drive/assignments" -r saved_models/

#### Importing tensorflow

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import os
tf.__version__

'2.0.0-rc0'

#### Setting parameters

In [3]:
batch_size = 256
epochs = 150
save_dir = os.path.join(os.getcwd(), 'saved_models')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

#### Preparing dataset

In [4]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer('[a-zA-Z]{2,}')
train_df = pd.read_csv('data/train.txt', delimiter='\n', header=None, names=['X'])
test_df = pd.read_csv('data/test.txt', delimiter='\n', header=None, names=['X'])

col_y = ([1] * 12500) + ([0] * 12500)
train_df.insert (1, 'Y', col_y, True)
test_df.insert (1, 'Y', col_y, True)

print (len(train_df))
print (len(test_df))
print (train_df.head())
print (test_df.tail())


25000
25000
                                                   X  Y
0  Bromwell High is a cartoon comedy. It ran at t...  1
1  Homelessness (or Houselessness as George Carli...  1
2  Brilliant over-acting by Lesley Ann Warren. Be...  1
3  This is easily the most underrated film inn th...  1
4  This is not the typical Mel Brooks film. It wa...  1
                                                       X  Y
24995  I occasionally let my kids watch this garbage ...  0
24996  When all we have anymore is pretty much realit...  0
24997  The basic genre is a thriller intercut with an...  0
24998  Four things intrigued me as to this film - fir...  0
24999  David Bryce's comments nearby are exceptionall...  0


#### Preprocessing dataset

In [5]:
import string
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[a-zA-Z]{2,}')

def preprocess (text):
    tokens = tokenizer.tokenize(text)
    tokens = [word.lower() for word in tokens]
    return (" ".join(tokens))
    
train_df['X'] = train_df.apply(lambda row: preprocess(row['X']), axis=1)
test_df['X'] = test_df.apply(lambda row: preprocess(row['X']), axis=1)
print (train_df.head())

                                                   X  Y
0  bromwell high is cartoon comedy it ran at the ...  1
1  homelessness or houselessness as george carlin...  1
2  brilliant over acting by lesley ann warren bes...  1
3  this is easily the most underrated film inn th...  1
4  this is not the typical mel brooks film it was...  1


#### Encode the words to integers

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoded_train_df = train_df.copy(deep=True)
encoded_test_df = test_df.copy(deep=True)

vocab = {}
k = 1

def encode (text):
    global k
    words = text.split()
    code = []
    for word in words:
        if word not in vocab:
            vocab[word] = k
            k += 1
        code.append(vocab[word])

    if len(code) > 200:
        code = code[:200]
    code = code + [0] * (200 - len(code))
    return code

encoded_train_df['X'] = encoded_train_df.apply(lambda row: encode(row['X']), axis=1)
encoded_test_df['X'] = encoded_test_df.apply(lambda row: encode(row['X']), axis=1)

print (encoded_train_df.head())

sample = encoded_train_df.iloc[0]['X']
print ('Sample padded sequence:', sample)

vocab_size = len(vocab)
print ('vocab_size:', vocab_size)

from sklearn.model_selection import train_test_split
encoded_train_df, encoded_val_df = train_test_split(encoded_train_df, test_size=0.2)

                                                   X  Y
0  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...  1
1  [87, 88, 89, 12, 90, 91, 92, 93, 94, 95, 96, 9...  1
2  [287, 288, 289, 203, 214, 215, 216, 290, 291, ...  1
3  [282, 3, 356, 9, 116, 357, 283, 358, 9, 164, 3...  1
4  [282, 3, 236, 9, 405, 163, 164, 283, 6, 406, 3...  1
Sample padded sequence: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 12, 20, 21, 22, 23, 9, 24, 25, 26, 27, 28, 29, 30, 1, 2, 31, 3, 32, 33, 28, 34, 35, 3, 20, 9, 36, 28, 37, 38, 9, 39, 40, 41, 42, 43, 44, 45, 46, 47, 20, 48, 9, 49, 50, 9, 51, 52, 53, 54, 27, 50, 9, 55, 56, 57, 46, 40, 58, 59, 9, 60, 23, 61, 62, 63, 64, 28, 65, 66, 9, 17, 67, 68, 8, 2, 69, 70, 71, 72, 28, 73, 74, 50, 75, 20, 62, 76, 28, 1, 2, 77, 30, 78, 79, 50, 21, 80, 81, 30, 1, 2, 3, 82, 83, 84, 85, 30, 6, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [7]:
train_x = encoded_train_df['X'].tolist()
train_y = encoded_train_df['Y'].tolist()
val_x = encoded_val_df['X'].tolist()
val_y = encoded_val_df['Y'].tolist()
test_x = encoded_test_df['X'].tolist()
val_y = encoded_test_df['Y'].tolist()

#### Creating and compiling the models

In [8]:
models = [0] * 6
models[0] = Sequential([
    layers.Embedding(100000, 128, input_length=200),
    layers.SimpleRNN(200, activation='tanh'),
    layers.Dense(1, activation='sigmoid')
])
print (models[0].summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 128)          12800000  
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 200)               65800     
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 12,866,001
Trainable params: 12,866,001
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
models[1] = Sequential([
    layers.Embedding(100000, 128, input_length=200),
    layers.LSTM(200, activation='tanh', recurrent_activation='tanh'),
    layers.Dense(1, activation='sigmoid')
])
print (models[1].summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 128)          12800000  
_________________________________________________________________
lstm (LSTM)                  (None, 200)               263200    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 13,063,401
Trainable params: 13,063,401
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
models[2] = Sequential([
    layers.Embedding(100000, 128, input_length=200),
    layers.GRU(200, activation='relu', recurrent_activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
print (models[2].summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 128)          12800000  
_________________________________________________________________
gru (GRU)                    (None, 200)               198000    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 12,998,201
Trainable params: 12,998,201
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
opt = keras.optimizers.Adam(1e-4)
for i in range(3):
    models[i].compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

#### Initialising training parameters

In [12]:
from tensorflow.keras.callbacks import EarlyStopping, History
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, 
                           verbose=0, mode='auto', baseline=None, restore_best_weights=False)
callbacks_list = [early_stop]
hist_temp = 'keras_imdb_history{}.pkl'
hist_names = [hist_temp.format(x) for x in range(1, 7)]
template = 'keras_imdb_trained_model{}.h5'
model_names = [template.format(x) for x in range(1, 7)]

In [13]:
def save_history (hist_path, history):
    print ('Saving history at', hist_path, flush=True)
    file_object = open (hist_path, 'wb')
    pickle.dump (history, file_object)
    file_object.close()
    
def load_history (hist_path):
    print ('Loading history from', hist_path, flush=True)
    file_object = open(hist_path, 'rb')
    history = pickle.load(file_object)
    file_object.close()
    return history

def train_and_evaluate(model_id):
    model = models[model_id]
    model_path = os.path.join(save_dir, model_names[model_id])
    hist_path = os.path.join(save_dir, hist_names[model_id])
    if os.path.exists(hist_path):
        assert (os.path.exists(model_path), 'Model must be saved at model_path')
        history = load_history(hist_path)
        print ('Trained model loaded from {}', model_path)
        model = keras.models.load_model(model_path)
        loss, accuracy = model.evaluate (test_x, test_y, verbose=0)
        print ('Loss = {}, Accuracy = {}'.format(loss, accuracy))
        return history, loss, accuracy
        
    history = model.fit(train_x, train_y, epochs=epochs, 
                         validation_data=(val_x, val_y), 
                         workers=4, shuffle=True, callbacks=callbacks_list)
    
    loss, accuracy = model.evaluate(test_x, test_y, verbose=0)
    print ('Loss = {}, Accuracy = {}'.format(loss, accuracy))
    model.save(model_path)
    print('Saved trained model at %s ' % model_path)
    save_history(hist_path, history.history)
    return history.history, loss, accuracy


  assert (os.path.exists(model_path), 'Model must be saved at model_path')


In [14]:
history_list = [0] * 6
training_data = [0] * 6

for i in range (3):
    history, loss, accuracy = train_and_evaluate(i)
    history_list[i] = history
    data_dict = {'test_accuracy':accuracy, 
                 'train_accuracy':history['acc'][-1],
                 'val_accuracy':history['val_acc'][-1]}
    training_data[i] = data_dict

ValueError: Data cardinality is ambiguous:
  x sizes: 5000
  y sizes: 25000
Please provide data which shares the same first dimension.