In [1]:
import os

from tensorflow.keras.preprocessing.text import Tokenizer
# Two toy sentences used to demonstrate how the tokenizer builds its vocabulary.
samples = [
    "the cat sat on the mat",
    "the dog ate my homework"
]

# num_words=1000 is the vocabulary cap handed to the tokenizer.
tokenizer = Tokenizer(num_words=1000)
# Build the word -> integer index mapping from the sample sentences.
tokenizer.fit_on_texts(samples)
# Last expression of the cell: displays the learned mapping (see the output below).
tokenizer.word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

In [8]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing

# Vocabulary cap and per-review sequence length used by the first model.
max_features, max_length = 10000, 20

# Load the IMDB reviews with num_words=max_features.
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words=max_features)

# Run both splits through pad_sequences with maxlen=max_length in a single step.
X_train, X_test = (
    preprocessing.sequence.pad_sequences(split, maxlen=max_length)
    for split in (X_train, X_test)
)


WORD EMBEDDINGS
One-hot vectors are wasteful of space and do not scale to very large vocabularies.
Word embeddings instead associate every word with a dense vector of floats.
This allows us to represent a very large variety of words in much less space (roughly logarithmic in the vocabulary size).
A good word embedding should reflect the structure of the words and the relationships between them.

Example: 
If (dog, wolf, book, notes) are in the dataset, their vectors in n-dimensional space should reflect their similarity.
"dog" and "wolf" should be closer in Euclidean distance than, say, "dog" and "book".
Similarly, "book" and "notes" should be close to each other.


Initially, the embeddings are assigned randomly; during training, backpropagation is used to learn better embeddings.
If we are working on a document task and can obtain good word embeddings trained on millions of sentences, we should use them.


In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding

# Embedding -> Flatten -> single sigmoid unit, declared in one constructor call.
model = Sequential([
    Embedding(max_features, 8, input_length=max_length),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train for 10 epochs in batches of 32; validation_split=0.2 reserves a fifth
# of the training data for validation.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


This can easily be increased up to around 90% accuracy just by increasing max_length to 200 words.
But since this network has only one Dense neuron,
it is essentially a weighted sum of everything before it.
sum(x(i) * w(i)) does not take into account the relationships between words.

In [17]:
from os.path import join as pjoin

# Location of the extracted raw IMDB dataset, relative to the current working directory.
raw = "aclImdb"
base_dir = pjoin(os.getcwd(), raw)
train_dir = pjoin(base_dir, 'train')
test_dir = pjoin(base_dir, 'test')

labels = []
texts = []

# Read every review under train/neg and train/pos.
# texts[k] and labels[k] stay aligned: 'neg' -> 0, 'pos' -> 1.
for label_type in ['neg', 'pos']:
    dirname = pjoin(train_dir, label_type)
    for filename in os.listdir(dirname):
        if filename.endswith('.txt'):
            # `with` closes each file as soon as it has been read, instead of
            # leaving one open handle per review for the garbage collector.
            with open(pjoin(dirname, filename), encoding='utf-8') as review_file:
                contents = review_file.read()
            texts.append(contents)
            labels.append(1 if label_type == 'pos' else 0)

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np 

maxlen = 100  # sequence length later passed to pad_sequences(maxlen=...)
training_samples = 200  # number of reviews sliced off for training
validation_samples = 10000  # number of reviews sliced off for validation
max_words = 10000  # vocabulary cap passed to Tokenizer(num_words=...)


tokenizer = Tokenizer(num_words=max_words)
# Assign an integer index to each word found in the review texts.
tokenizer.fit_on_texts(texts)
# Convert every review into a list of those integer indices.
sequences = tokenizer.texts_to_sequences(texts)
# word_index holds every distinct word seen (see the count printed below).
print(f"Found {len(tokenizer.word_index)} words")

Found 88582 words


In [24]:
# Bring every integer sequence to a common length of `maxlen`.
data = pad_sequences(sequences, maxlen=maxlen)
# Rebind `labels` from a Python list to a NumPy array so it can be indexed with an index array.
labels = np.asarray(labels)
# Last expression of the cell: display both shapes as a sanity check.
data.shape, labels.shape

((25000, 100), (25000,))

In [39]:
# We will train on only 200 samples, and use a pretrained word embedding to get good performance

# Fail loudly if there are not enough rows for both splits; slicing past the
# end would otherwise silently return fewer samples than requested.
assert data.shape[0] >= training_samples + validation_samples, \
    "not enough samples for the requested train/validation split"

# The reviews were appended label by label (all 'neg', then all 'pos'), so they
# must be shuffled before slicing. A fixed seed makes the split reproducible
# across Restart & Run All.
# NOTE: `data` and `labels` are rebound to their shuffled versions, so re-running
# this cell on its own permutes the already-shuffled arrays again.
indices = np.arange(data.shape[0])
np.random.RandomState(42).shuffle(indices)

data = data[indices]
labels = labels[indices]

# First `training_samples` rows are used for training ...
X_train = data[:training_samples]
Y_train = labels[:training_samples]

# ... and the next `validation_samples` rows for validation.
X_val = data[training_samples: training_samples + validation_samples]
Y_val = labels[training_samples: training_samples + validation_samples]

In [40]:
# Download the pretrained word embedding -> https://nlp.stanford.edu/projects/glove/
glove_dir = pjoin(os.getcwd(), 'glove.6B')

# Maps each word to its pretrained vector (a float32 NumPy array).
embeddings_index = {}

with open(pjoin(glove_dir, "glove.6B.100d.txt"), encoding='utf-8') as reader:
    for line in reader:
        # Each line is: <word> <coef_1> <coef_2> ... separated by whitespace.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        # Parse the coefficients once, here, into a compact float32 array rather
        # than keeping a list of Python strings for every word.
        coefficients = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefficients

print(f"Found {len(embeddings_index)} word vectors")


Found 400000 word vectors


In [41]:
embedding_dim = 100
word_index = tokenizer.word_index

# Row r receives the pretrained vector of the word whose tokenizer index is r.
# Rows for words without a pretrained vector (and any unused row) stay all-zero.
embedding_matrix = np.zeros(shape=(max_words, embedding_dim))
for token, row in word_index.items():
    if row >= max_words:
        # Index falls outside the matrix: skip this word.
        continue
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained


In [42]:
# Embedding -> Flatten -> 32-unit ReLU layer -> single sigmoid unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()


Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_4 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [43]:
# layers[0] is the Embedding layer added first to the model: replace its weights
# with the prepared embedding_matrix ...
model.layers[0].set_weights([embedding_matrix])
# ... and mark the layer non-trainable so those weights are kept as loaded.
model.layers[0].trainable = False

In [44]:
# Same optimizer / loss / metric configuration as the first model in this notebook.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [45]:
# Train on the small training split and evaluate on the held-out (X_val, Y_val) split each epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, Y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
