<a href="https://colab.research.google.com/github/sattviksahai/CS583B_Recitation/blob/master/RNN_IMDB_reviews_reference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive; the
# extraction cell below reads the dataset archive from under this mount.
from google.colab import drive
drive.mount('/content/drive')

Extract data

In [0]:
# Unpack the aclImdb archive stored on the mounted Drive into the working directory.
!tar -xf 'drive/My Drive/aclImdb_v1.tar.gz'
# Remove the train/unsup folder; only train/pos and train/neg are read by the loader cell.
! rm -rf aclImdb/train/unsup/
# List the working directory so the extracted aclImdb folder can be confirmed.
!ls

Read Training Data

In [0]:
import os

imdb_dir = './aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

# Parallel lists: texts_train[k] holds the raw text of one review and
# labels_train[k] its sentiment label (1 for 'pos', 0 for 'neg').
labels_train = []
texts_train = []
for label_type in ['pos', 'neg']:
  dir_name = os.path.join(train_dir, label_type)
  # The label depends only on the folder, so compute it once per folder.
  label = 0 if label_type == 'neg' else 1
  for fname in os.listdir(dir_name):
    if not fname.endswith('.txt'):
      continue
    # `with` closes the handle even if read() raises, and the explicit
    # encoding keeps decoding independent of the platform's default locale.
    with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
      texts_train.append(f.read())
    labels_train.append(label)

In [0]:
# Sanity check: the two lists are filled in lockstep, so both counts should match.
num_texts, num_labels = len(texts_train), len(labels_train)
print('Number of training samples: ', num_texts, sep='')
print('Number of training labels: ', num_labels, sep='')

Display random review with label

In [0]:
import numpy as np

# Draw a random position and show the label stored there, then the review text.
i = np.random.randint(len(labels_train))
position = str(i)
print('label #' + position + ': ' + str(labels_train[i]))
print('text #' + position + ':')
print(texts_train[i])

Tokenize the text

In [6]:
from keras.preprocessing.text import Tokenizer

# Vocabulary size given to the tokenizer through num_words; the same value is
# reused in the model cell as the Embedding layer's input dimension.
vocabulary = 10000
tokenizer = Tokenizer(num_words=vocabulary)
# Fit on the training reviews only; the test cells reuse this fitted tokenizer.
tokenizer.fit_on_texts(texts_train)

# Word index built by fit_on_texts; not referenced again in the visible cells.
word_index = tokenizer.word_index
# Convert each training review into its sequence of integer token ids.
sequences_train = tokenizer.texts_to_sequences(texts_train)

Using TensorFlow backend.


Add Padding

In [0]:
from keras import preprocessing

# Fixed sequence length per review; also passed to the Embedding layer as input_length.
word_num = 20
# Bring every token sequence to exactly word_num entries, giving a 2-D array
# (the following cell prints its shape as (25000, 20)).
x_train_val = preprocessing.sequence.pad_sequences(sequences_train, maxlen=word_num)

In [8]:
# Expected shape: (number of reviews, word_num).
print(x_train_val.shape)

(25000, 20)


Training Validation split

In [9]:
split_ratio = 0.9  # fraction of samples used for training; the remainder is validation
split_seed = 0     # fixed seed so every run holds out the same validation set

# Samples and labels are indexed with the same positions below, so they must align.
num_samples = x_train_val.shape[0]
assert len(labels_train) == num_samples, 'labels and samples are out of sync'

# Draw the shuffled ordering from a private generator: the split becomes
# reproducible (so different model variants are scored on identical data)
# without touching NumPy's global random state.
indices = np.random.RandomState(split_seed).permutation(num_samples)

# Compute the cut point once and reuse it for both data and labels.
num_train = int(num_samples * split_ratio)
train_indices = indices[:num_train]
val_indices = indices[num_train:]

x_train = x_train_val[train_indices]
x_val = x_train_val[val_indices]

# Fancy indexing needs an array; labels_train starts out as a Python list.
labels_train = np.array(labels_train)
y_train = labels_train[train_indices]
y_val = labels_train[val_indices]
print('Shape of training data: ', x_train.shape)
print('Shape of training labels: ', y_train.shape)
print('Shape of validation data: ', x_val.shape)
print('Shape of validation labels: ', y_val.shape)

Shape of training data:  (22500, 20)
Shape of training labels:  (22500,)
Shape of validation data:  (2500, 20)
Shape of validation labels:  (2500,)


Define Model
Options:
a) Simple RNN
b) LSTM
c) Stacked LSTM
d) Bidirectional LSTM
e) Conv1D

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Flatten, LSTM, Bidirectional, Conv1D

embedding_dim = 32  # size of each learned word vector
h_dim = 32          # not used by the model below; presumably meant for the RNN/LSTM/Conv1D options

# Baseline classifier: embed the word_num token ids, flatten the embedded
# sequence into one vector, and map it to a single sigmoid output.
# NOTE(review): the stored summary output under this cell lists a Conv1D layer
# and no Flatten, so it appears to come from a different variant of the model;
# re-run the cell to refresh it.
model = Sequential([
    Embedding(vocabulary, embedding_dim, input_length=word_num),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            320000    
_________________________________________________________________
conv1d (Conv1D)              (None, 16, 32)            5152      
_________________________________________________________________
dense (Dense)                (None, 16, 1)             33        
Total params: 325,185
Trainable params: 325,185
Non-trainable params: 0
_________________________________________________________________


Define Optimizer

In [0]:
from tensorflow.keras import optimizers

# Number of training epochs; read by the training cell.
epochs = 50

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases warn about or reject.
# The metric is registered as 'acc', which fixes the history keys
# ('acc' / 'val_acc') that the plotting cell reads.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.01),
              loss='binary_crossentropy', metrics=['acc'])

Train

In [0]:
# Train for `epochs` epochs in mini-batches of 16, passing the validation split
# so validation metrics are recorded alongside the training ones. The returned
# object is kept because the plotting cell reads history.history.
history = model.fit(x_train, y_train, epochs=epochs,
                    batch_size=16, validation_data=(x_val, y_val))

Visualize training

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

epochs = range(epochs) # 50 is the number of epochs
train_acc = history.history['acc']
valid_acc = history.history['val_acc']
print(train_acc)
plt.plot(epochs, train_acc, 'bo', label='Training Accuracy')
plt.plot(epochs, valid_acc, 'r', label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [0]:
# Persist the trained model to imdb_model.h5 in the current working directory.
model.save('imdb_model.h5')

Read Test data

In [0]:
import os

imdb_dir = './aclImdb'
test_dir = os.path.join(imdb_dir, 'test')

# Parallel lists: texts_test[k] holds the raw text of one review and
# labels_test[k] its sentiment label (1 for 'pos', 0 for 'neg').
labels_test = []
texts_test = []
for label_type in ['pos', 'neg']:
  dir_name = os.path.join(test_dir, label_type)
  # The label depends only on the folder, so compute it once per folder.
  label = 0 if label_type == 'neg' else 1
  for fname in os.listdir(dir_name):
    if not fname.endswith('.txt'):
      continue
    # `with` closes the handle even if read() raises, and the explicit
    # encoding keeps decoding independent of the platform's default locale.
    with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
      texts_test.append(f.read())
    labels_test.append(label)

In [0]:
# These counts describe the test set, so the messages must say "test"
# (the original text was copied from the training cell and said "training").
print('Number of test samples: '+str(len(texts_test)))
print('Number of test labels: '+str(len(labels_test)))

Tokenize the text

In [0]:
# Reuse the tokenizer fitted on the training texts; it is not re-fitted here,
# so test reviews are encoded with the training word index.
sequences_test = tokenizer.texts_to_sequences(texts_test)

Padding

In [0]:
# Same maxlen as the training data so the test input matches the model's input length.
x_test = preprocessing.sequence.pad_sequences(sequences_test, maxlen=word_num)

Evaluate model

In [0]:
# Score the trained model on the test set. The model was compiled with one
# metric ('acc'), so evaluate() yields the pair [loss, accuracy].
y_test = np.array(labels_test)
loss_and_acc = model.evaluate(x_test, y_test)
test_loss, test_acc = loss_and_acc
print("loss = ", str(test_loss))
print("acc = ", str(test_acc))