In [1]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.datasets import load_files

In [3]:
train_dir = r'data/S_Classification/aclImdb/train'
test_dir = r'data/S_Classification/aclImdb/test'

In [4]:
movies_train = load_files(train_dir, shuffle=False)
movies_test = load_files(test_dir, shuffle=False)

In [5]:
reviews_train = movies_train.data
reviews_test = movies_test.data

In [6]:
reviews_train_str = [str(i, encoding='utf') for i in reviews_train]
reviews_test_str = [str(j, encoding='utf') for j in reviews_test]

In [7]:
import re

In [8]:
replace_no_space = re.compile(r"[.;:!\'?,\"()\[\]]")
replace_with_space = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

In [9]:
def preprocess_reviews(reviews):
    reviews = [replace_no_space.sub("", line.lower()) for line in reviews]
    reviews = [replace_with_space.sub(" ", line) for line in reviews]
    
    return reviews

In [10]:
reviews_train_clean = preprocess_reviews(reviews_train_str)
reviews_test_clean = preprocess_reviews(reviews_test_str)

In [11]:
from nltk.corpus import stopwords

In [12]:
english_stop_words = stopwords.words('english')

In [13]:
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

In [14]:
no_stop_words_train = remove_stop_words(reviews_train_clean)
no_stop_words_test = remove_stop_words(reviews_test_clean)

In [15]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [16]:
def get_lemmatized_text(corpus):
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

In [17]:
lemmatized_reviews_train = get_lemmatized_text(no_stop_words_train)
lemmatized_reviews_test = get_lemmatized_text(no_stop_words_test)

In [18]:
target_label = [0 if i < 12500 else 1 for i in range(25000)]

In [19]:
import gensim
from gensim.models import Word2Vec

In [20]:
lem_train = lemmatized_reviews_train.copy()

In [21]:
for i, sentence in enumerate(lem_train):
    tokenized_1 = []
    for word in sentence.split(' '):
        word = word.split(',')[0]
        tokenized_1.append(word)
    lem_train[i] = tokenized_1

In [22]:
word2vec_model = Word2Vec(lem_train, workers = 3, size = 50, min_count = 1, window = 3, sg = 1)

In [84]:
print(lem_train[0])

['story', 'man', 'unnatural', 'feeling', 'pig', 'start', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'insane', 'violent', 'mob', 'crazy', 'chanting', 'singer', 'unfortunately', 'stay', 'absurd', 'whole', 'time', 'general', 'narrative', 'eventually', 'making', 'putting', 'even', 'era', 'turned', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 'better', 'might', 'think', 'good', 'cinematography', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'star', 'sally', 'kirkland', 'frederic', 'forrest', 'seen', 'briefly']


In [None]:
import pickle

In [None]:
w2v = pickle.dumps(word2vec_model)

In [None]:
import joblib

In [None]:
joblib.dump(word2vec_model, 'w2v_model')

In [34]:
wk_1_para = 'These tasks were achieved however there were issues with most of the group being unable to access the DIT central server through the Citrix Receiver software. Through our weekly hangout meeting issues which needed further discussing took place.'

In [39]:
wk_2_para = 'The necessary services required for the auditorium was discussed in greater detail and plant room location and size was agreed and incorporated within the design.'

In [40]:
wk_3_para = 'These tasks were achieved however there have been ongoing issues with access to the DIT central server through the Citrix Receiver software. The use of hangout meetings has been very useful in enabling group members to share ideas and make collective decisions.'

In [35]:
wk_1_para

'These tasks were achieved however there were issues with most of the group being unable to access the DIT central server through the Citrix Receiver software. Through our weekly hangout meeting issues which needed further discussing took place.'

In [41]:
wk_2_para

'The necessary services required for the auditorium was discussed in greater detail and plant room location and size was agreed and incorporated within the design.'

In [42]:
wk_3_para

'These tasks were achieved however there have been ongoing issues with access to the DIT central server through the Citrix Receiver software. The use of hangout meetings has been very useful in enabling group members to share ideas and make collective decisions.'

In [81]:
new_data = wk_1_para.split() + wk_2_para.split() + wk_3_para.split()

## Compare document similarity using Spacy API

In [30]:
import spacy

In [24]:
import en_core_web_sm

In [25]:
nlp = en_core_web_sm.load()

In [48]:
doc_1_spacy = nlp(wk_1_para)

In [50]:
doc_2_spacy = nlp(wk_2_para)

In [52]:
doc_3_spacy = nlp(wk_3_para)

In [55]:
doc_1_spacy.similarity(doc_2_spacy)

0.7236580392947892

In [56]:
doc_1_spacy.similarity(doc_3_spacy)

0.8996136023920993

In [57]:
doc_2_spacy.similarity(doc_3_spacy)

0.6779205836307043

## Using Gensim's pre-trained word embeddings from Text8 dataset

In [58]:
import gensim.downloader as api

In [61]:
dataset_text8 = api.load('text8')

In [63]:
data_text8 = [d for d in dataset_text8]

In [65]:
len(data_text8)

1701

In [80]:
len(data_text8[0])

10000

In [85]:
word2vec_model.build_vocab(data_text8, update=True)

In [87]:
word2vec_model.train(data_text8, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.iter)

(64096881, 85026035)

## Doc2vec model

* Create the tagged document needed for Doc2Vec

In [88]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [89]:
train_data_text8 = list(create_tagged_document(data_text8))

* Initialise the Doc2Vec model

In [90]:
d2v_text8_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

* Build the Volabulary

In [91]:
d2v_text8_model.build_vocab(train_data_text8)

* Train the Doc2Vec model

In [92]:
d2v_text8_model.train(train_data_text8, total_examples=d2v_text8_model.corpus_count, epochs=d2v_text8_model.epochs)

In [None]:
# joblib.dump(d2v_text8_model, 'd2v_t8_model')

In [93]:
print(d2v_text8_model.infer_vector(wk_1_para.split()))

[ 0.52938914  0.3504294  -0.5460172   1.0715247  -0.18014637  0.5919325
 -0.65860176 -0.11909512  0.6297231   0.08096617 -0.50141805  0.33079523
  0.26382217 -0.14258502 -0.09361993 -1.1417066  -1.3580422   0.44013304
 -0.21906562 -0.9525415  -1.3623505   0.23453736 -0.7632049   0.9310987
  0.68393     0.88379866  0.3406199   0.04687646  0.750225    1.2809547
 -0.24942893  1.6903898   0.44501072 -0.9188279   0.6768671   1.0363436
 -0.64551604  0.7180843   0.35293645 -0.37312868  0.32528517 -0.32484198
 -0.8942538   0.8002041   0.11280409 -0.11845278  0.06717791  0.30541998
 -0.60858387  0.06778439]


Soft cosine similarity is similar to cosine similarity but in addition considers the semantic relationship between the words through its vector representation.

## Compute soft cosine similarity

In [94]:
from gensim.matutils import softcossim
from gensim import corpora

* Prepare a dictionary of the documents

In [100]:
data_dict = corpora.Dictionary([new_data])

* Convert the documents into bag-of-words vectors

In [102]:
p_1_bow = data_dict.doc2bow(wk_1_para.split())
p_2_bow = data_dict.doc2bow(wk_2_para.split())
p_3_bow = data_dict.doc2bow(wk_3_para.split())

* Prepare the similarity matrix

In [105]:
similarity_matrix_1 = d2v_text8_model.wv.similarity_matrix(data_dict)

In [106]:
print(softcossim(p_1_bow, p_2_bow, similarity_matrix_1))

0.35627752001890833


In [107]:
print(softcossim(p_1_bow, p_3_bow, similarity_matrix_1))

0.6530419165029125


In [108]:
print(softcossim(p_2_bow, p_3_bow, similarity_matrix_1))

0.3220268829972466


## Using Scipy spatial distance to compute cosine of document vectors

In [109]:
from scipy import spatial

In [110]:
wk_1_vec = d2v_text8_model.infer_vector(wk_1_para.split())
wk_2_vec = d2v_text8_model.infer_vector(wk_2_para.split())
wk_3_vec = d2v_text8_model.infer_vector(wk_3_para.split())

In [111]:
spatial.distance.cosine(wk_1_vec, wk_2_vec)

0.42126381397247314

In [112]:
spatial.distance.cosine(wk_1_vec, wk_3_vec)

0.10909676551818848

In [113]:
spatial.distance.cosine(wk_2_vec, wk_3_vec)

0.4548901319503784

## Using Numpy dot product between 2 document vectors

In [115]:
np.dot(wk_1_vec, wk_2_vec)

11.813333

In [116]:
np.dot(wk_1_vec, wk_3_vec)

21.35476

In [117]:
np.dot(wk_2_vec, wk_3_vec)

11.4320755