In [0]:
import numpy as np

In [0]:
from keras.datasets import imdb
from keras import preprocessing
max_features = 10000
maxlen = 200
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) 

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [0]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
model = Sequential()
model.add(Embedding(10000, 32, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           320000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 6400)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6401      
Total params: 326,401
Trainable params: 326,401
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2020-04-27 13:32:11--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2020-04-27 13:32:12 (61.5 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [0]:
!tar --gunzip --extract --file=aclImdb_v1.tar.gz

In [0]:
import os
imdb_dir = '/content/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 300
max_words = 10000 
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(0.1 * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
#x_train = data[:training_samples]
#y_train = labels[:training_samples]
#x_val = data[training_samples: training_samples + validation_samples]
#y_val = labels[training_samples: training_samples + validation_samples]

Using TensorFlow backend.


Found 88582 unique tokens.
Shape of data tensor: (25000, 300)
Shape of label tensor: (25000,)


In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-04-27 06:09:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-04-27 06:09:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-04-27 06:09:07--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
glove_dir = '/content'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [0]:
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [0]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model_embed = Sequential()
model_embed.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model_embed.add(Flatten())
model_embed.add(Dense(32, activation='relu'))
model_embed.add(Dense(1, activation='sigmoid'))
model_embed.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 300, 100)          1000000   
_________________________________________________________________
flatten_8 (Flatten)          (None, 30000)             0         
_________________________________________________________________
dense_15 (Dense)             (None, 32)                960032    
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 33        
Total params: 1,960,065
Trainable params: 1,960,065
Non-trainable params: 0
_________________________________________________________________


In [0]:
model_embed.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history_embed = model_embed.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
model_embed.save_weights('embed_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
model_embed_pretrain = Sequential()
model_embed_pretrain.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model_embed_pretrain.add(Flatten())
model_embed_pretrain.add(Dense(32, activation='relu'))
model_embed_pretrain.add(Dense(1, activation='sigmoid'))
model_embed_pretrain.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 300, 100)          1000000   
_________________________________________________________________
flatten_9 (Flatten)          (None, 30000)             0         
_________________________________________________________________
dense_17 (Dense)             (None, 32)                960032    
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 33        
Total params: 1,960,065
Trainable params: 1,960,065
Non-trainable params: 0
_________________________________________________________________


In [0]:
model_embed_pretrain.layers[0].set_weights([embedding_matrix])
model_embed_pretrain.layers[0].trainable = False

In [0]:
model_embed_pretrain.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history_embed_pretrain = model_embed_pretrain.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
model_embed.save_weights('embed_pretrain_model.h5')

Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
'''import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()'''

In [0]:
test_dir = os.path.join(imdb_dir, 'test')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

In [0]:
#model_embed.load_weights('embed_model.h5')
model_embed.evaluate(x_test, y_test)



[1.3356188513851166, 0.8370800018310547]

In [0]:
#model_embed_pretrain.load_weights('embed_pretrain_model.h5')
model_embed_pretrain.evaluate(x_test, y_test)



[1.4055371337699891, 0.7221599817276001]

In [0]:
'''from keras.datasets import imdb
from keras.preprocessing import sequence
max_features = 10000
maxlen = 500
batch_size = 32
print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')
print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)'''

In [0]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
model_simple_rnn = Sequential()
model_simple_rnn.add(Embedding(10000, 32))
model_simple_rnn.add(SimpleRNN(32))
model_simple_rnn.add(Dense(1, activation='sigmoid'))
model_simple_rnn.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history_simple_rnn = model_simple_rnn.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
model_simple_rnn.save_weights('simple_rnn_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
#model_simple_rnn.load_weights('simple_rnn_model.h5')
model_simple_rnn.evaluate(x_test, y_test)



[0.4910227665400505, 0.8416399955749512]

In [0]:
'''import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()'''

In [0]:
from keras.layers import LSTM
model_lstm = Sequential()
model_lstm.add(Embedding(10000, 32))
model_lstm.add(LSTM(32))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history_lstm = model_lstm.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))
model_lstm.save_weights('lstm_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
#model_lstm.load_weights('lstm_model.h5')
model_lstm.evaluate(x_test, y_test)



[0.381427589725256, 0.8727999925613403]

In [0]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb


max_features = 10000
maxlen = 300

model_bilstm = Sequential()
model_bilstm.add(Embedding(max_features, 32))
model_bilstm.add(Bidirectional(LSTM(32)))
#model_bilstm.add(Dropout(0.5))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.summary()

model_bilstm.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

history_bilstm = model_bilstm.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val))
model_bilstm.save_weights('bilstm_model.h5')

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 300, 32)           320000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                16640     
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 65        
Total params: 336,705
Trainable params: 336,705
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 22500 samples, validate on 2500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [0]:
#model_bilstm.load_weights('bilstm_model.h5')
model_bilstm.evaluate(x_test, y_test)



[0.5186398750025034, 0.8530399799346924]

In [0]:
model_bilstm_pretrain = Sequential()
#model_bilstm_pretrain.add(Embedding(10000, 100, input_length=300))
model_bilstm_pretrain.add(Embedding(input_dim = embedding_matrix.shape[0], output_dim = embedding_matrix.shape[1], input_length = 300, weights = [embedding_matrix], trainable=False))
model_bilstm_pretrain.add(Bidirectional(LSTM(50)))
#model_bilstm_pretrain.add(Dropout(0.25))
#model_bilstm_pretrain.add(Dense(64))
#model_bilstm_pretrain.add(Dropout(0.5))
model_bilstm_pretrain.add(Dense(1, activation='sigmoid'))

#model_bilstm_pretrain.layers[0].set_weights([embedding_matrix])
#model_bilstm_pretrain.layers[0].trainable = False
model_bilstm_pretrain.summary()

model_bilstm_pretrain.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

history_bilstm_pretrain = model_bilstm_pretrain.fit(x_train, y_train, epochs=10, batch_size=128, validation_data=(x_val, y_val))
model_bilstm_pretrain.save_weights('bilstm_pretrain_model.h5')

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 300, 100)          1000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100)               60400     
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 101       
Total params: 1,060,501
Trainable params: 60,501
Non-trainable params: 1,000,000
_________________________________________________________________
Train on 22500 samples, validate on 2500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
#model_bilstm_pretrain.load_weights('bilstm_pretrain_model.h5')
model_bilstm_pretrain.evaluate(x_test, y_test)



[0.30916453993558884, 0.8671200275421143]

In [0]:
model_bilstm_pretrain_tuned = Sequential()
model_bilstm_pretrain_tuned.add(Embedding(input_dim = embedding_matrix.shape[0], output_dim = embedding_matrix.shape[1], input_length = 300, weights = [embedding_matrix], trainable=False))
model_bilstm_pretrain_tuned.add(Bidirectional(LSTM(50, recurrent_dropout=0.1)))
model_bilstm_pretrain_tuned.add(Dropout(0.25))
model_bilstm_pretrain_tuned.add(Dense(64))
model_bilstm_pretrain_tuned.add(Dropout(0.5))
model_bilstm_pretrain_tuned.add(Dense(1, activation='sigmoid'))

model_bilstm_pretrain_tuned.summary()

model_bilstm_pretrain_tuned.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

history_bilstm_pretrain_tuned = model_bilstm_pretrain_tuned.fit(x_train, y_train, epochs=10, batch_size=128, validation_data=(x_val, y_val))
model_bilstm_pretrain_tuned.save_weights('bilstm_pretrain_tuned_model.h5')

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 300, 100)          1000000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100)               60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 64)                6464      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 65        
Total params: 1,066,929
Trainable params: 66,929
Non-trainable params: 1,000,000
______________________________________

In [0]:
#model_bilstm_pretrain_tuned.load_weights('bilstm_pretrain_tuned_model.h5')
model_bilstm_pretrain_tuned.evaluate(x_test, y_test)



[0.31574368715286255, 0.8610000014305115]