## Import required modules

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer 

from pymongo import MongoClient

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rojin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load and Explore Data

In [2]:
# Read the IMDB review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Datasets/IMDB.csv')
# Display the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
# Report the overall shape first, then the row and column counts individually.
print("Dimension of the data: ", df.shape)

# df.shape is a (rows, columns) tuple, so both counts can be unpacked at once.
no_of_rows, no_of_columns = df.shape

print("No. of Rows: %d" % no_of_rows)
print("No. of Columns: %d" % no_of_columns)

Dimension of the data:  (50000, 2)
No. of Rows: 50000
No. of Columns: 2


## Extract the review column as a 1D array of documents

In [5]:
# We take the review column for our text analysis.
# Selecting a single column yields a 1-D pandas Series (one entry per review), not a 2-D array.
docs_array = df['review']

print("Dimension of the documents array: ", docs_array.shape)

Dimension of the documents array:  (50000,)


## Pre-process the data

## Tokenize the documents

In [6]:
# Function to convert a list of documents into a list of lists of tokenized words

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def docs_preprocessor(docs, min_token_length=4):
    """Lower-case, tokenize, filter and lemmatize a collection of text documents.

    Each document is converted to lowercase and split into runs of word
    characters. Purely numeric tokens and tokens shorter than
    ``min_token_length`` characters are discarded, and the remaining tokens
    are lemmatized with NLTK's ``WordNetLemmatizer``.

    The input collection is only read, never modified, so the function can be
    called repeatedly on the same data (e.g. when a notebook cell is re-run)
    and works for any iterable of strings regardless of how it is indexed.

    Parameters
    ----------
    docs : iterable of str
        The raw documents, e.g. a list of strings or a pandas Series.
    min_token_length : int, optional
        Minimum number of characters a token must have to be kept.
        The default of 4 matches the original ``len(token) > 3`` filter.

    Returns
    -------
    list of list of str
        One list of processed tokens per input document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')  # Tokens are runs of word characters.
    lemmatizer = WordNetLemmatizer()

    processed_docs = []
    for doc in docs:
        # Build new token lists instead of writing back into `docs`, so the
        # caller's data (and any DataFrame column it came from) stays intact.
        tokens = tokenizer.tokenize(doc.lower())
        processed_docs.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            # Remove numbers (but not words that contain numbers) and
            # tokens shorter than the minimum length.
            if not token.isdigit() and len(token) >= min_token_length
        ])

    return processed_docs

## Convert the 1D array of documents into a 2D list of tokenized words

In [7]:
# Convert the review documents into a list of lists of tokenized words (one token list per review).
# %time reports how long the preprocessing takes.
%time processed_docs = docs_preprocessor(docs_array)

Wall time: 1min 39s


In [8]:
# Sanity check: there should be one token list per review, i.e. as many as rows in df.
print("Length of the 2D Array of Tokenized Documents: ", len(processed_docs))

Length of the 2D Array of Tokenized Documents:  50000


## Training the Word2Vec Model

In [10]:
# Set training parameters
size = 300       # Dimension of the word vector
window_size = 2  # Set to 2 because the sentences weren't too long
epochs = 100     # Number of iterations (epochs) over the corpus.
min_count = 40    # Ignores all words with total frequency lower than this.
workers = 4      # presumably the number of worker threads used for training — confirm against the gensim docs

# Train Word2vec model using gensim (use the skip-gram model: sg = 1)
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names; gensim 4 is believed to
# use `vector_size` and `epochs` instead — confirm the installed gensim version before re-running.
# NOTE(review): `sample=0.01` is presumably the down-sampling threshold for frequent words — verify.
# The recorded run of this cell took roughly 28 minutes of wall time.
%time model = Word2Vec(processed_docs, sg=1,window=window_size,size=size, min_count=min_count,workers=workers,iter=epochs,sample=0.01)

Wall time: 28min 40s


## Save the Model

In [None]:
# Save the trained model to a path relative to the notebook's working directory.
model.save('Models/w2v_model_IMDB')

## Load the Saved Model

In [None]:
# Load the word2vec model from the same path that model.save() wrote to.
model = Word2Vec.load('Models/w2v_model_IMDB')

## Model Evaluation

There is no easy way to evaluate the trained model.

One approach could be to see whether the model has learned the semantic representation of the words.

For this we use the model's `wv` object, which holds the mapping between words and their embeddings. It provides several methods that we can use for our evaluation; see https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.Word2VecKeyedVectors

## Evaluation 1: Find Similar Words

In [None]:
# Query the learned embeddings for the words most similar to 'amazing'.
model.wv.most_similar('amazing')