## IMDB-movie-review-sentiment-prediction from scratch (generating word embeddings and cleaning data)

In [2]:
import numpy as np
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
token_index = {}
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

In [3]:
token_index

{'The': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'the': 5,
 'mat.': 6,
 'dog': 7,
 'ate': 8,
 'my': 9,
 'homework.': 10}

In [4]:
#this stands for the max no. of words in a sentence
max_length = 10

results = np.zeros(shape = (len(samples), max_length, 1 + max(token_index.values())))

In [5]:
results.shape

(2, 10, 11)

In [6]:
for i, sample in enumerate(samples):
    for word_no, word in list(enumerate(sample.split())):
        results[i, word_no, token_index.get(word)] = 1

In [7]:
results

array([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0

In [8]:
import numpy as np
import string
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable

token_index = dict(zip(characters,  range(1, len(characters) + 1)))

In [9]:
max_length = 50

z_coord = 1 + max(token_index.values())

results2 = np.zeros(( len(samples) , max_length, z_coord))


In [10]:
for i, sample in enumerate(samples):
    for j, character in enumerate(sample):
        index = token_index.get(character)
        results2[i, j, index] = 1.

In [11]:
temp = results2[0]

In [12]:
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)

Using TensorFlow backend.


In [13]:
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [14]:
sam = ['The cat biscuits ate']
print(tokenizer.texts_to_sequences(sam))

[[1, 2, 7]]


In [15]:
tokenizer.word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

In [16]:
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

In [17]:
one_hot_results[1][0:10]

array([0., 1., 0., 0., 0., 0., 1., 1., 1., 1.])

In [18]:
word_index = tokenizer.word_index

In [19]:
print('Found {} unique tokens'.format(len(word_index)))

Found 9 unique tokens


In [20]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))

In [21]:
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split())):
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
        

In [22]:
results[1].argmax(axis = 1)

array([421, 141,  80, 870, 601,   0,   0,   0,   0,   0])

In [23]:
from keras.layers import Embedding

embedding_layer = Embedding(1000, 64)

In [24]:
from keras.datasets import imdb
from keras import preprocessing

In [25]:
max_features = 10000
maxlen = 20

In [26]:
(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words=max_features)

In [27]:
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
                                               

In [28]:
x_train.shape

(25000, 20)

In [29]:
x_test.shape

(25000, 20)

In [30]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [31]:
x_train[0]

array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
        113,  103,   32,   15,   16, 5345,   19,  178,   32], dtype=int32)

In [32]:
history = model.fit(x_train, y_train, epochs=10,
batch_size=32, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
scores = model.evaluate(x_test, y_test)



In [34]:
accuracy_percentage = scores[1]*100
print(accuracy_percentage)

75.66400170326233


In [35]:
x_train.shape

(25000, 20)

In [44]:
#TRYING OUT ON A TOY DATA SET TO GET A HANG OF THE PROBLEM

In [36]:
docs = ['Well done!',
		'Good work',
		'Great effort',
		'nice work',
		'Excellent!',
		'Weak',
		'Poor effort!',
		'not good',
		'poor work',
		'Could have done better.']
# define class labels
labels = np.array([1,1,1,1,1,0,0,0,0,0])

In [37]:
labels

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [38]:
import keras

In [39]:
vocab_size = 50
encoded_docs = [keras.preprocessing.text.one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

[[46, 47], [16, 34], [29, 28], [37, 34], [47], [26], [32, 28], [16, 16], [32, 34], [26, 2, 47, 2]]


In [40]:
max_length = 4
padded_docs = preprocessing.sequence.pad_sequences(encoded_docs, maxlen=max_length, padding = 'post')
print(padded_docs)

[[46 47  0  0]
 [16 34  0  0]
 [29 28  0  0]
 [37 34  0  0]
 [47  0  0  0]
 [26  0  0  0]
 [32 28  0  0]
 [16 16  0  0]
 [32 34  0  0]
 [26  2 47  2]]


In [41]:
model = Sequential()
model.add(Embedding(50, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation  = 'sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_2 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Accuracy: 89.999998
