In [2]:
#Read data files
import os
# Load the raw IMDB training reviews from disk.
# Builds two parallel lists: `texts` (one string per review file) and
# `labels` (0 for files under 'neg', 1 for files under 'pos').
imdb_dir = 'aclImdb' #directory of data
train_dir = os.path.join(imdb_dir, 'train') #aclImdb/train
labels = []
texts = []
for label_type in ['neg', 'pos']:   #read negative and positive reviews separately
    dir_name = os.path.join(train_dir, label_type)
    label = 0 if label_type == 'neg' else 1
    # os.listdir returns entries in arbitrary order; sorting makes the
    # read order (and everything derived from it) deterministic.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):  #only review files are of interest
            continue
        # `with` guarantees the handle is closed even if read() raises
        with open(os.path.join(dir_name, fname), encoding="utf8") as f:
            texts.append(f.read())
        labels.append(label)

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [4]:
# --- Text preprocessing ---
max_words = 10000           # vocabulary size passed to the Tokenizer / rows of the embedding matrix
maxlen = 100                # every review is padded/truncated to this many tokens

# --- Dataset split sizes (number of reviews) ---
training_samples = 200
validation_samples = 10000

In [5]:
# Build the vocabulary from the reviews; `num_words` caps how many word
# indices are used when the texts are converted to sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=texts)

# Encode each review as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(texts=texts)

In [14]:
# Peek at the first review after tokenization: a list of integer word indices.
sequences[0]

[62,
 4,
 3,
 129,
 34,
 44,
 7576,
 1414,
 15,
 3,
 4252,
 514,
 43,
 16,
 3,
 633,
 133,
 12,
 6,
 3,
 1301,
 459,
 4,
 1751,
 209,
 3,
 7693,
 308,
 6,
 676,
 80,
 32,
 2137,
 1110,
 3008,
 31,
 1,
 929,
 4,
 42,
 5120,
 469,
 9,
 2665,
 1751,
 1,
 223,
 55,
 16,
 54,
 828,
 1318,
 847,
 228,
 9,
 40,
 96,
 122,
 1484,
 57,
 145,
 36,
 1,
 996,
 141,
 27,
 676,
 122,
 1,
 411,
 59,
 94,
 2278,
 303,
 772,
 5,
 3,
 837,
 20,
 3,
 1755,
 646,
 42,
 125,
 71,
 22,
 235,
 101,
 16,
 46,
 49,
 624,
 31,
 702,
 84,
 702,
 378,
 3493,
 2,
 8422,
 67,
 27,
 107,
 3348]

In [28]:
# Vocabulary learned by the tokenizer: maps every word to its integer index.
word_index = tokenizer.word_index
# The dictionary itself is deliberately not printed: it holds tens of
# thousands of entries and would flood the cell output. Its size is enough.
print('Found %s unique tokens.' % len(word_index))

# Pad (or truncate) every sequence to exactly `maxlen` entries so the
# reviews can be stacked into a single 2-D integer array.
data = pad_sequences(sequences, maxlen=maxlen)

# NOTE: `labels` is rebound from a list to an array here. Run this cell
# before the shuffle cell only: re-running it afterwards rebuilds `data`
# in the original file order while `labels` stays shuffled, silently
# mis-pairing reviews and labels.
labels = np.asarray(labels)
assert data.shape[0] == labels.shape[0], 'every review needs exactly one label'

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 88582 unique tokens.


Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [26]:
# Shuffle reviews and labels with the same permutation so they stay paired.
# The reviews were read class by class, so without shuffling the first
# `training_samples` rows would all share one label.
# A fixed seed on a private RandomState makes the train/validation split
# reproducible across kernel restarts without touching NumPy's global RNG.
rng = np.random.RandomState(42)
indices = rng.permutation(data.shape[0])  #shuffled array of row indexes
data = data[indices]  #index on data to get shuffled data
labels = labels[indices]

In [27]:
# Carve the (already shuffled) arrays into a training part followed by a
# validation part; rows beyond `val_end` are left unused.
train_end = training_samples
val_end = train_end + validation_samples

x_train, y_train = data[:train_end], labels[:train_end]
x_val, y_val = data[train_end:val_end], labels[train_end:val_end]

In [17]:
# Parse the pre-trained GloVe file into `embeddings_index`, a dict that
# maps each word to its float32 coefficient vector.
glove_dir = 'glove.6B'   #the folder where glove embeddings are available
embeddings_index = {}
# `with` closes the file even if a line fails to parse
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:   #skip blank lines instead of failing on values[0]
            continue
        word = values[0]   #first field is the word itself
        coefs = np.asarray(values[1:], dtype='float32') #remaining fields are the vector
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [29]:
# Assemble the weight matrix for the Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i. Rows for words
# missing from `embeddings_index` stay all-zero.
embedding_dim = 100   #length of each word vector
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:   #outside the vocabulary kept by the model
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:   #no pre-trained vector for this word
        continue
    embedding_matrix[i] = embedding_vector

In [30]:
# Import from tensorflow.keras, matching the preprocessing imports used
# earlier in this notebook; mixing the standalone `keras` package with
# `tensorflow.keras` can pair incompatible implementations.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Small classifier: embedding lookup -> flatten -> hidden layer -> one
# sigmoid unit that outputs the probability of the positive (1) label.
model = Sequential() #Create a model
# One `embedding_dim`-sized vector per word index, for inputs of `maxlen` tokens
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
# Collapse the (maxlen, embedding_dim) output into one flat feature vector
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_3 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [31]:
# The first layer of the model is the Embedding layer: load the pre-built
# matrix into it and exclude it from training so the weights stay as loaded.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [33]:
# Configure training for a binary target, tracking accuracy as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train on the small training split and evaluate on the validation split
# after every epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
