# Deep Learning For NLP Using Pretrained Word Vectors

In [6]:
from keras.models import Sequential
from keras.datasets import imdb
from keras import preprocessing
from keras.layers import Flatten, Dense
from keras.layers import Embedding
import numpy as np

In [2]:
#download imdb dataset from keras
max_features = 10000
maxlen = 20

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features)

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen = maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen = maxlen)

In [3]:
model = Sequential()
model.add(Embedding(10000, 8, input_length = maxlen))
model.add(Flatten()) 
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()

history = model.fit(x_train, y_train, epochs = 10, batch_size = 32, validation_split = .2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Processing Labels of Raw Text Data for IMDB Dataset
- downloaded the zip file from http://mng.bz/0tIo
- unzip files and then label the text based on neg or pos review
- this will show how using pretrained word embeddings can greatly aid in nlp tasks that have only a small amount of training data (even though this isn't the case here, using more samples than 200 would help)

In [20]:
import os
imbdbdir = 'c:\\Users\\Sam Cannon\\Desktop\\Python\\Deep Learning With Python'
train_dir = os.path.join(imbdbdir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding = 'utf8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

## Tokenizing Text

In [21]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [22]:
#use only 200 reviews, using pretrained word embeddings
maxlen = 100
training_samples = 200
validation_samples = 10000
max_words = 10000

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print(f'Shape of data tensor: {data.shape}')
print(f'Shape of label tensor: {labels.shape}')

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


## Using GloVe and FastText Word Embeddings
- download wikipedia 2014 embeddings from https://nlp.stanford.edu/projectz/glove

In [15]:
# parsing fasttext word-embeddings (1,000,000 vectors) 
fasttext_dir = 'c:\\Users\\Sam Cannon\\Desktop\\Python\\fasttext'

fasttext_index = {}
f = open(os.path.join(fasttext_dir, 'wiki-news-300d-1M.vec'), encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    fasttext_index[word] = coefs
f.close()

print(f'Found {len(fasttext_index)} word vectors.')

Found 999995 word vectors.


In [11]:
#parsing GlOve word vectors
glove_dir = 'c:\\Users\\Sam Cannon\\Desktop\\Python\\Deep Learning With Python\\GLOVE'

glove_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding = 'utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(glove_index))

Found 400000 word vectors.


In [23]:
embedding_dim = 300

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = fasttext_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [24]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length = maxlen))
model.add(Flatten())
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          3000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 30000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                960032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 3,960,065
Trainable params: 3,960,065
Non-trainable params: 0
_________________________________________________________________


In [25]:
#now load the GlOve matrix we prepared above into the Embedding layer, the first layer in the model
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False #freeze the Embedding layer, the pretrained word embeddings shouldn't be updated during training

In [26]:
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics=['acc'])

history = model.fit(x_train, y_train, epochs = 10, batch_size=32, validation_data=(x_val, y_val))

model.save_weights('pre_trained_glove_model.h5')

Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
#evaluate on test set, first we must unpack and label all the test text
test_dir = os.path.join('c:\\Users\\Sam Cannon\\Desktop\\Python\\Deep Learning With Python', 'test')

labels = []
texts = []
 
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding = 'utf8')
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen = maxlen)
y_test = np.asarray(labels)

In [28]:
#now load and evaluate the model on the test set -- terrible accuracy, this is becuase we are only looking at 200 samples!
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test) 



[0.7882445617603828, 0.5636526942253113]