# Machine Learning - Word2Vec

In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
# Load the movie review dataset. The CSV provides a `review` text column and a
# `sentiment` label column (plus the unnamed index column written on export).
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [4]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Turn every review into a list of lowercase, purely alphabetic tokens with
# English stop words removed. The result (`review_lines`) holds one token list
# per review, in the same order as the rows of `df`.

# Neither the punctuation table nor the stop-word set depends on the review
# being processed, so both are built once instead of once per iteration.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

review_lines = list()
lines = df['review'].values.tolist()

for line in lines:
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic (this also drops the
    # empty strings left behind by pure-punctuation tokens)
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    review_lines.append(words)

In [5]:
len(review_lines)

50000

In [None]:
import gensim 

# Length of each learned word vector.
EMBEDDING_DIM = 100

# Fit word2vec on the cleaned reviews. min_count=1 keeps every token in the
# vocabulary, including tokens that occur only once.
model = gensim.models.Word2Vec(
    sentences=review_lines,
    size=EMBEDDING_DIM,
    window=5,
    workers=4,
    min_count=1,
)

# Report how many distinct tokens received a vector.
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

In [30]:
# Persist the learned vectors as plain text (word2vec format, not binary) so
# they can be read back line by line later in the notebook.
filename = 'imdb_embedding_word2vec.txt'
model.wv.save_word2vec_format(fname=filename, binary=False)

In [7]:
# Try some of gensim's word2vec query helpers, starting with the nearest
# neighbours of a single word (returned as (word, similarity) pairs).

model.wv.most_similar('horrible')

  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.915934145450592),
 ('awful', 0.8536903858184814),
 ('atrocious', 0.7735040783882141),
 ('horrendous', 0.753383457660675),
 ('pathetic', 0.7439737319946289),
 ('horrid', 0.7415372729301453),
 ('dreadful', 0.7396693825721741),
 ('sucks', 0.7365142107009888),
 ('lousy', 0.713822603225708),
 ('bad', 0.7013706564903259)]

In [8]:
# Analogy query "king - man + woman", ranked with gensim's
# `most_similar_cosmul` scoring.
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('princess', 0.8994395136833191),
 ('romeo', 0.8883419632911682),
 ('juliet', 0.8835902214050293),
 ('carlotti', 0.8692097663879395),
 ('thrower', 0.8641826510429382),
 ('bride', 0.8634681105613708),
 ('yustory', 0.848070502281189),
 ('bourroughs', 0.8470751643180847),
 ('queen', 0.8430835604667664),
 ('nearbiographical', 0.8393160104751587)]

In [26]:
# The same "king - man + woman" analogy, this time ranked with the default
# `most_similar` scoring so the two rankings can be compared.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('romeo', 0.6537127494812012),
 ('princess', 0.60003662109375),
 ('gi', 0.5713154673576355),
 ('juliet', 0.5708507895469666),
 ('liann', 0.5608142018318176),
 ('queen', 0.557210385799408),
 ('bride', 0.5557085275650024),
 ('lion', 0.5508358478546143),
 ('rice', 0.5488076210021973),
 ('poe', 0.542876124382019)]

In [10]:
# Odd word out: ask the model which of the given words fits the group least.
print(model.wv.doesnt_match("woman king queen movie".split()))

movie


In [11]:
# Nearest neighbours of "cat" in the learned embedding space.
model.wv.similar_by_word("cat")

[('dog', 0.8088006973266602),
 ('mouse', 0.7698529958724976),
 ('hat', 0.7433608770370483),
 ('mask', 0.7126131057739258),
 ('dude', 0.700873851776123),
 ('bugs', 0.696689248085022),
 ('monkey', 0.6927722096443176),
 ('clown', 0.6868656277656555),
 ('pet', 0.6774500608444214),
 ('snake', 0.6737427711486816)]

In [25]:
# Similarity score between two word vectors. The lookup goes through `model.wv`
# like every other query in this notebook; calling `similarity` directly on the
# Word2Vec model object is deprecated in gensim and emits a warning.
print(model.wv.similarity('boy', 'girl'))

0.8420376


  """Entry point for launching an IPython kernel.


In [8]:
import os

# Read the saved word2vec text file back into a {word: vector} dictionary.
# Each data line is "<word> <v1> <v2> ..."; the first line of the file is a
# "<vocab_size> <vector_size>" header, not a word entry.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line raises.
with open('imdb_embedding_word2vec.txt', encoding="utf-8") as f:
    for line_number, line in enumerate(f):
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        if line_number == 0 and len(values) == 2:
            # skip the header so it is not stored as if it were a word
            continue
        word = values[0]
        # parse the coefficients as numbers rather than keeping them as strings
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [9]:
# Split the frame by row label into two halves. `.loc` slices include the end
# label, so the first slice covers labels up to and including 24999.
first_half = slice(None, 24999)
second_half = slice(25000, None)

X_train, y_train = df.loc[first_half, 'review'].values, df.loc[first_half, 'sentiment'].values
X_test, y_test = df.loc[second_half, 'review'].values, df.loc[second_half, 'sentiment'].values

In [10]:
# All reviews, train followed by test. `X_train + X_test` would add the two
# arrays element-wise (gluing review i of one half onto review i of the other)
# instead of joining them end to end, so concatenate explicitly.
total_reviews = np.concatenate([X_train, X_test])
# Length every token sequence is padded/truncated to.
max_length = 100 # try other options like mean of sentence lengths

In [11]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Fraction of the samples held out for validation/testing.
VALIDATION_SPLIT = 0.2
# Fixed seed so the shuffled split is the same on every run of this cell.
SPLIT_SEED = 42

# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

# token -> integer index mapping learned by the tokenizer
word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

# pad sequences to a common length of `max_length`
review_pad = pad_sequences(sequences, maxlen=max_length)
sentiment =  df['sentiment'].values
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)

# split the data into a training set and a validation set;
# reviews and labels are reordered with the same permutation so they stay aligned
indices = np.arange(review_pad.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

# Slice at an explicit boundary: `[:-n]` would return an empty array for n == 0.
split_at = review_pad.shape[0] - num_validation_samples
X_train_pad = review_pad[:split_at]
y_train = sentiment[:split_at]
X_test_pad = review_pad[split_at:]
y_test = sentiment[split_at:]

Found 134156 unique tokens.
Shape of review tensor: (50000, 2678)
Shape of sentiment tensor: (50000,)


In [12]:
# Show the dimensions of each array produced by the train/test split.
for split_label, split_array in (
    ('Shape of X_train_pad tensor:', X_train_pad),
    ('Shape of y_train tensor:', y_train),
    ('Shape of X_test_pad tensor:', X_test_pad),
    ('Shape of y_test tensor:', y_test),
):
    print(split_label, split_array.shape)

Shape of X_train_pad tensor: (40000, 2678)
Shape of y_train tensor: (40000,)
Shape of X_test_pad tensor: (10000, 2678)
Shape of y_test tensor: (10000,)


In [13]:
# Build the weight matrix for the Keras Embedding layer: row i holds the
# word2vec vector of the token whose tokenizer index is i.
EMBEDDING_DIM = 100
# One extra row because index 0 is not assigned to any word (it is the value
# pad_sequences fills with by default), so valid rows are 0..len(word_index).
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid row indices are 0..num_words-1, so an index equal to num_words must
    # be skipped as well (the previous `>` comparison let it through).
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # words not found in the embedding index keep their all-zeros row

In [14]:
# Number of rows in the embedding matrix (tokenizer vocabulary size + 1).
print(num_words)

134157


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.initializers import Constant

# define model
# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# gensim word2vec model; the `model.wv` queries above no longer work after
# this cell has run.
model = Sequential()
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

# embedding -> 1D convolution -> max pooling -> flatten -> sigmoid output
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model; the held-out split doubles as validation data
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2678, 100)         13415700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2674, 128)         64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1337, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 171136)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 171137    
Total params: 13,650,965
Trainable params: 235,265
Non-trainable params: 13,415,700
_________________________________________________________________
None
Train on 40000 samples, validate on 10000 samples
Epoch 1/25


In [None]:
# Score the trained network on the held-out split and report accuracy as a
# percentage.
loss, accuracy = model.evaluate(x=X_test_pad, y=y_test, batch_size=128)
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

In [69]:
# Let us test some hand-written samples.
# The raw strings are converted with the tokenizer fitted earlier, so any word
# that is not in its vocabulary is dropped from the sequence.

test_sample_1 = "This movie is fantastic! I really like it because it is so good!"
test_sample_2 = "Good movie!"
test_sample_3 = "Maybe I like this movie."
test_sample_4 = "Not to my taste, will skip and watch another movie"
test_sample_5 = "if you like action, then this movie might be good for you."
test_sample_6 = "Bad movie!"
test_sample_7 = "Not a good movie!"
test_sample_8 = "This movie really sucks! Can I get my money back please?"
test_samples = [test_sample_1, test_sample_2, test_sample_3, test_sample_4, test_sample_5, test_sample_6, test_sample_7, test_sample_8]

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
# Pad to the same length the training data was padded to. The previous name,
# MAX_SEQUENCE_LENGTH, is not defined anywhere in this notebook.
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

#predict: one sigmoid output per sample
model.predict(x=test_samples_tokens_pad)

array([[0.9718868 ],
       [0.80231005],
       [0.61845976],
       [0.64929855],
       [0.6253854 ],
       [0.09913273],
       [0.80231005],
       [0.24591438]], dtype=float32)

In [15]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

# define model: frozen pre-trained embeddings -> GRU -> sigmoid output
# NOTE: this rebinds `model`, replacing whatever model the name held before.
model = Sequential()
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Using TensorFlow backend.


Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2678, 100)         13415700  
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 13,428,501
Trainable params: 12,801
Non-trainable params: 13,415,700
_________________________________________________________________
None


In [None]:
print('Train...')

# Train the GRU model, validating on the held-out split after every epoch and
# logging one line per epoch.
model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=25,
    validation_data=(X_test_pad, y_test),
    verbose=2,
)

Train...
Train on 40000 samples, validate on 10000 samples
Epoch 1/25
 - 75204s - loss: 0.5612 - acc: 0.6991 - val_loss: 0.3864 - val_acc: 0.8330
Epoch 2/25
 - 2529s - loss: 0.3911 - acc: 0.8276 - val_loss: 0.3187 - val_acc: 0.8653
Epoch 3/25


In [19]:
print('Testing...')
# evaluate() returns the loss followed by the compiled metric (accuracy).
score, acc = model.evaluate(x=X_test_pad, y=y_test, batch_size=128)

for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

# Same accuracy again, formatted as a percentage with two decimals.
print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.2625866099357605
Test accuracy: 0.892
Accuracy: 89.20%
