In [0]:
import sklearn
import numpy as np

In [0]:
from keras.datasets import imdb
# Load the IMDB reviews (encoded as sequences of word ids) together with their labels;
# num_words=10000 caps the vocabulary that load_data keeps.
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)


In [0]:
# Pool the shipped train/test partitions into one corpus; it is re-split further down.
dataset = np.concatenate([training_data, testing_data])
targets = np.concatenate([training_targets, testing_targets])

In [29]:
# Show the distinct label values present in the combined targets.
print("Categories:", np.unique(targets))

Categories: [0 1]


In [30]:
# Flatten all reviews into one sequence of word ids and report how many distinct ids occur.
print("Number of unique words:", np.unique(np.hstack(dataset)).size)

Number of unique words: 9998


In [0]:
# Keras provides a word -> id mapping; invert it so ids can be mapped back to words.
index = imdb.get_word_index()
reverse_index = {word_id: word for word, word_id in index.items()}

# Turn every encoded review back into a space-separated string.
# The ids stored in `dataset` are shifted by 3 relative to the word index (hence `- 3`);
# any id without a matching word is rendered as "#".
# Despite its name, `word2Vec` holds plain decoded review strings, not embeddings.
word2Vec = [
    " ".join(reverse_index.get(word_id - 3, "#") for word_id in review)
    for review in dataset
]

In [32]:
# Number of decoded reviews.
len(word2Vec)

50000

In [33]:
# Number of encoded reviews; expected to equal the number of decoded reviews.
len(dataset)

50000

## Generating Feature Vector using CountVectorizer

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the decoded reviews and vectorise them.
vectorizer = CountVectorizer()
X= vectorizer.fit_transform(word2Vec)
# Convert to a dense array so it can be sliced and passed to the Keras model later.
# NOTE(review): the dense array is large (shape (50000, 9771) per the output below) —
# confirm the memory footprint is acceptable.
CV = X.toarray() # This is the CountVector


In [35]:
# (number of reviews, number of CountVectorizer features)
CV.shape

(50000, 9771)

## Running CV feature Vector through Neural Network

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Minimal classifier for the count vectors: a 10-unit ReLU hidden layer
# followed by a single sigmoid output unit.
input_dim = CV.shape[1]  # one input per CountVectorizer feature

model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [0]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as an extra metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
epoch = 25
batch = 10000

# Cut the corpus into three contiguous row ranges:
#   [0, 10000)      -> test
#   [10000, 20000)  -> validation
#   [20000, end)    -> training
X_TEST, X_VALID, X_TRAIN = np.split(CV, [10000, 20000])
Y_TEST, Y_VALID, Y_TRAIN = np.split(targets, [10000, 20000])



In [39]:
# Train on the count-vector training rows, passing the validation rows as validation_data.
model.fit(X_TRAIN, Y_TRAIN, validation_data=(X_VALID, Y_VALID), batch_size = batch, epochs=epoch)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f79490e6208>

In [40]:
# Score the count-vector model on the held-out test rows.
# score[1] is the accuracy metric requested at compile time.
score = model.evaluate(X_TEST, Y_TEST, verbose=0)
print("Test Acc: ", score[1])

Test Acc:  0.8787999749183655


## Verifying review with predicted output


In [41]:
# Sigmoid output for the first test row, printed next to the review it was built from
# (X_TEST[0] is CV[0], i.e. the features of word2Vec[0]).
print(model.predict(X_TEST[0:1]))
print(word2Vec[0])

[[0.98156595]]
# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have don

## Generating Feature Vector using TfidfVectorizer


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features built from the same decoded reviews used for the count vectors.
vectorizer1 = TfidfVectorizer()
X1 = vectorizer1.fit_transform(word2Vec)
# Dense copy so the rows can be sliced and fed to Keras.
Tfidf = X1.toarray()


In [43]:
# Print the dense TF-IDF matrix (NumPy abbreviates the display).
print(Tfidf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [0]:


# Same architecture as the count-vector classifier, sized for the TF-IDF features:
# a 10-unit ReLU hidden layer followed by a single sigmoid output unit.
input_dim = Tfidf.shape[1]  # one input per TF-IDF feature

model_Tfidf = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

## Running Tfidf feature Vector through Neural Network

In [0]:
epoch = 25
batch = 10000

# Same three contiguous row ranges as the count-vector experiment:
#   [0, 10000) -> test, [10000, 20000) -> validation, [20000, end) -> training.
X_TEST, X_VALID, X_TRAIN = np.split(Tfidf, [10000, 20000])
Y_TEST, Y_VALID, Y_TRAIN = np.split(targets, [10000, 20000])

In [0]:
# Same loss, optimizer and metric as the count-vector model.
model_Tfidf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [47]:
# Train on the TF-IDF training rows, passing the validation rows as validation_data.
model_Tfidf.fit(X_TRAIN, Y_TRAIN, validation_data=(X_VALID, Y_VALID), batch_size = batch, epochs=epoch)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f7949423b38>

In [48]:
# Score the TF-IDF model on the held-out test rows.
# score[1] is the accuracy metric requested at compile time.
score = model_Tfidf.evaluate(X_TEST, Y_TEST, verbose=0)
print("Test Acc: ", score[1])

Test Acc:  0.8391000032424927


## Verifying predicted output with review


### Creating Feature Vector and Vocab table using GENSIM

In [49]:
# Sigmoid output of the TF-IDF model for the first test row, printed next to its review text
# (X_TEST[0] is Tfidf[0], i.e. the features of word2Vec[0]).
print(model_Tfidf.predict(X_TEST[0:1]))
print(word2Vec[0])

[[0.64997876]]
# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have don

In [0]:
import gensim

In [0]:
# Tokenise every decoded review for gensim (simple_preprocess tokenises, lowercases
# and returns a word list).
# The decoded strings already exist in `word2Vec`, so they are reused here rather than
# decoding `dataset` a second time. This also removes the hard-coded 50000 and stops
# the loop counter from rebinding `index`, which holds the word-index dict.
word_data = [gensim.utils.simple_preprocess(review_text) for review_text in word2Vec]

In [52]:
# Token list produced for the first review.
print(word_data[0])

['this', 'film', 'was', 'just', 'brilliant', 'casting', 'location', 'scenery', 'story', 'direction', 'everyone', 'really', 'suited', 'the', 'part', 'they', 'played', 'and', 'you', 'could', 'just', 'imagine', 'being', 'there', 'robert', 'is', 'an', 'amazing', 'actor', 'and', 'now', 'the', 'same', 'being', 'director', 'father', 'came', 'from', 'the', 'same', 'scottish', 'island', 'as', 'myself', 'so', 'loved', 'the', 'fact', 'there', 'was', 'real', 'connection', 'with', 'this', 'film', 'the', 'witty', 'remarks', 'throughout', 'the', 'film', 'were', 'great', 'it', 'was', 'just', 'brilliant', 'so', 'much', 'that', 'bought', 'the', 'film', 'as', 'soon', 'as', 'it', 'was', 'released', 'for', 'and', 'would', 'recommend', 'it', 'to', 'everyone', 'to', 'watch', 'and', 'the', 'fly', 'fishing', 'was', 'amazing', 'really', 'cried', 'at', 'the', 'end', 'it', 'was', 'so', 'sad', 'and', 'you', 'know', 'what', 'they', 'say', 'if', 'you', 'cry', 'at', 'film', 'it', 'must', 'have', 'been', 'good', 'and'

In [53]:
# Train word embeddings on the tokenised reviews.
# NOTE(review): this rebinds `model`, which until this cell referred to the Keras
# count-vector classifier; earlier cells that use `model` will break if re-run after this one.
# NOTE(review): `word_data` is passed to the constructor and then again to train() —
# presumably the constructor already trains on it; confirm the extra pass is intended.
model = gensim.models.Word2Vec (word_data, size=150, window=10, min_count=2, workers=4)
model.train(word_data,total_examples=len(dataset),epochs=10)
# size: dimensionality of the dense vector that represents each word
# window: the maximum distance between the target word and its neighboring word
# min_count: minimum frequency count a word needs in order to be kept
# workers: number of threads used for processing

(77645370, 104823320)

In [54]:
# Inspect the learned embedding for the word 'funny'.
model.wv['funny']

array([ 1.5404786 ,  4.171441  ,  0.83684874, -0.5592156 ,  3.4315958 ,
       -0.78910124,  3.0296912 , -0.7480217 ,  1.0430611 , -2.3906152 ,
        0.6123959 ,  2.6702044 ,  2.5711122 , -1.7501968 ,  0.5347169 ,
        0.10103285,  0.70169175,  0.70712966, -1.5294005 ,  0.8009657 ,
        0.05802038,  0.31258148, -0.49763247,  0.83170223, -3.285022  ,
        3.0567937 , -3.1388369 ,  1.0510155 , -0.24555437,  4.187261  ,
        1.8701437 , -0.11929373,  1.4839939 ,  0.59966034, -1.7216042 ,
       -2.0753202 , -1.0238128 ,  0.32974148,  4.0418167 , -1.3936002 ,
        2.8693686 ,  1.4628925 ,  0.12152538, -1.2313225 ,  5.3812456 ,
       -0.92557365,  0.57946503,  4.5781355 ,  0.9848257 ,  1.8548976 ,
       -1.5060169 ,  3.2495177 ,  1.8042606 , -1.7090688 ,  1.3695446 ,
       -0.82123613, -2.1511726 ,  1.0209887 , -0.7708824 ,  2.092686  ,
        4.394701  ,  0.5716976 , -2.1374388 , -0.4576588 , -2.9813273 ,
        0.11645153, -0.25342986, -3.1303043 , -0.6599845 , -2.31

In [55]:
# List the words whose embeddings are most similar to the query word.
w1 = "stupid"
model.wv.most_similar (positive=w1)

  if np.issubdtype(vec.dtype, np.int):


[('dumb', 0.8327645063400269),
 ('ridiculous', 0.712376058101654),
 ('lame', 0.7091778516769409),
 ('pathetic', 0.6964638829231262),
 ('moronic', 0.6614812612533569),
 ('silly', 0.6403506994247437),
 ('idiotic', 0.6378515362739563),
 ('bad', 0.6326823830604553),
 ('horrible', 0.6223008036613464),
 ('unrealistic', 0.6211845278739929)]

In [0]:
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
# Wrap each tokenised review in a TaggedDocument whose single tag is the review's
# sentiment label (the tag is later read back as the training target).
sentenceLabels = targets[:50000]

LabeledSentences = [
    TaggedDocument(tokens, [label])
    for tokens, label in zip(word_data, sentenceLabels)
]

In [57]:
# Inspect the second tagged document (its words and its tag).
LabeledSentences[1]

TaggedDocument(words=['big', 'hair', 'big', 'boobs', 'bad', 'music', 'and', 'giant', 'safety', 'pin', 'these', 'are', 'the', 'words', 'to', 'best', 'describe', 'this', 'terrible', 'movie', 'love', 'cheesy', 'horror', 'movies', 'and', 've', 'seen', 'hundreds', 'but', 'this', 'had', 'got', 'to', 'be', 'on', 'of', 'the', 'worst', 'ever', 'made', 'the', 'plot', 'is', 'paper', 'thin', 'and', 'ridiculous', 'the', 'acting', 'is', 'an', 'abomination', 'the', 'script', 'is', 'completely', 'laughable', 'the', 'best', 'is', 'the', 'end', 'showdown', 'with', 'the', 'cop', 'and', 'how', 'he', 'worked', 'out', 'who', 'the', 'killer', 'is', 'it', 'just', 'so', 'damn', 'terribly', 'written', 'the', 'clothes', 'are', 'sickening', 'and', 'funny', 'in', 'equal', 'the', 'hair', 'is', 'big', 'lots', 'of', 'boobs', 'men', 'wear', 'those', 'cut', 'shirts', 'that', 'show', 'off', 'their', 'sickening', 'that', 'men', 'actually', 'wore', 'them', 'and', 'the', 'music', 'is', 'just', 'trash', 'that', 'plays', 'ov

## Build vocab

In [58]:
from tqdm import tqdm
from gensim.models import Doc2Vec
from gensim import utils

# Create the Doc2Vec model and register the vocabulary of the tagged documents.
# NOTE(review): vector_size=9771 equals the CountVectorizer feature count shown earlier —
# presumably copied so the same network input size could be reused; confirm this is
# intended, since the Doc2Vec vector length need not match it.
model_Gensim = Doc2Vec(vector_size = 9771, min_count=1, window=10, workers = 7)
# tqdm only adds a progress bar while the documents are collected into a list.
model_Gensim.build_vocab([x for x in tqdm(LabeledSentences)])

100%|██████████| 50000/50000 [00:00<00:00, 2291017.94it/s]


In [0]:
# Train the Doc2Vec model for 3 epochs over all tagged documents.
model_Gensim.train(LabeledSentences, total_examples=len(LabeledSentences), epochs=3)

## Using a function to create feature vector

In [0]:
def vector_for_learning(model, input_docs):
    """Infer one feature vector per tagged document.

    Args:
        model: Object exposing ``infer_vector(words, steps=...)``; in this
            notebook it is the trained Doc2Vec model.
        input_docs: Iterable of documents that have ``words`` and ``tags``
            attributes. The first tag of each document is used as its target.

    Returns:
        A ``(targets, feature_vectors)`` pair of tuples aligned by position.
        Both tuples are empty when ``input_docs`` is empty.
    """
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, steps=20))
        for doc in input_docs
    ]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise ValueError.
        return (), ()
    targets, feature_vectors = zip(*pairs)
    return targets, feature_vectors





## Create NN using GENSIM Neural Network

**Note:** I used significantly fewer data samples here in order to speed up training. I understand this may reduce accuracy.

In [61]:
# Infer Doc2Vec features for a subset only: documents [0, 1000) for test,
# [1000, 2000) for validation and [2000, 5000) for training.
# vector_for_learning returns (targets, vectors), hence the Y_*, X_* ordering.
Y_TEST, X_TEST = vector_for_learning(model_Gensim, LabeledSentences[:1000])
print("Test Samples Done")
Y_VALID, X_VALID = vector_for_learning(model_Gensim, LabeledSentences[1000:2000])
print("Validation Samples Done")
Y_TRAIN, X_TRAIN = vector_for_learning(model_Gensim, LabeledSentences[2000:5000])
print("Training Samples Done")



Test Samples Done
Validation Samples Done
Training Samples Done


In [0]:


# Size the input layer from the inferred vectors themselves instead of repeating the
# literal 9771, so the network stays in sync if the Doc2Vec vector size is changed.
input_dim = len(X_TRAIN[0])  # Number of features

# Same architecture as the other two experiments: a 10-unit ReLU hidden layer
# followed by a single sigmoid output unit.
model_gen = Sequential()
model_gen.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model_gen.add(layers.Dense(1, activation='sigmoid'))



In [0]:
# Same training settings, loss, optimizer and metric as the other two experiments.
epoch =25
batch = 10000
model_gen.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
# vector_for_learning returns tuples, so each split is converted to a NumPy array before fitting.
model_gen.fit(np.array(X_TRAIN), np.array(Y_TRAIN), validation_data=(np.array(X_VALID), np.array(Y_VALID)), batch_size = batch, epochs=epoch)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f79079ed240>

In [65]:
# Score the Doc2Vec-feature model on its test subset.
# score[1] is the accuracy metric requested at compile time.
score = model_gen.evaluate(np.array(X_TEST), np.array(Y_TEST), verbose=0)
print("Test Acc: ", score[1])

Test Acc:  0.8650000095367432


## verifying predicted output with review

In [70]:
# Print the prediction explicitly: a bare expression is only displayed when it is the
# last statement in a cell, so without print() the predicted value was never shown.
# X_TEST[0] was inferred from LabeledSentences[0], i.e. the review in word2Vec[0].
print(model_gen.predict(np.array(X_TEST[0:1])))
print(word2Vec[0])

# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

## Summary
- CountVectorizer Accuracy: 0.879
- TfidfVectorizer Accuracy: 0.839
- GENSIM Accuracy: 0.865

I believe the expected ranking by efficiency and accuracy should be:
1. GENSIM
2. TfidfVectorizer
3. CountVectorizer

GENSIM should be the best because it keeps track of each word and its relationship/similarity to other words, giving it 'weights'.

TfidfVectorizer should be the second best because it weights how often a word is used in a review by how rare that word is across all reviews.

CountVectorizer should be last as it simply counts how many times each word appears, without weighting how informative the word is.

My results do not reflect this ordering. Since all three approaches use the same neural network, I believe the difference could come from the dataset I am using, which may consist of longer sentences and a wide variety of vocabulary. GENSIM, however, was negatively impacted because I intentionally reduced the number of training samples to speed up the training. I believe that if I trained on the entire dataset, it would produce the best results.


## Reference
I used this tutorial in order to create the GENSIM model. 

https://towardsdatascience.com/implementing-multi-class-text-classification-with-doc2vec-df7c3812824d
