I have taken only the first 100 rows of my input data, because running on all 25k rows takes a lot of time.

In [1]:
# Note that Google's word2vec performs better on big datasets.
# In this example we are considering only 25000 training examples from the IMDB dataset.
# Therefore, the performance is similar to that of the "bag of words" model.

# Importing libraries
import numpy as np
import pandas as pd
# BeautifulSoup is used to remove html tags from the text
from bs4 import BeautifulSoup 
import re # For regular expressions

# Stopwords can be useful for understanding the semantics of a sentence.
# Therefore stopwords are not removed while creating the word2vec model,
# but they are removed while averaging the feature vectors.
from nltk.corpus import stopwords

import warnings
# Suppress every Python warning raised by later cells.
# NOTE(review): this also hides deprecation and numerical warnings that can
# point at real problems; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the tab-separated review file into a DataFrame.
#   header=0  -> the first line holds the column names
#   quoting=3 -> quote characters are treated as ordinary text
# NOTE(review): the path is absolute and machine-specific, and the file is
# named testData.tsv although it is bound to `train`; a later cell reads
# train["sentiment"], which the column listing shown further down
# (only 'id' and 'review') does not contain -- confirm the intended file.
train = pd.read_csv(
    r"C:\Users\I324158\Downloads\testData.tsv\testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)



In [8]:
# Keep only the first 100 rows (all columns) so the remaining cells run quickly.
train = train.iloc[:100]


100

In [9]:
# This function converts a text to a sequence of words.
def review_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or sentence) into a list of lowercase words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, NLTK's English stopwords are dropped from the result.

    Returns
    -------
    list of str
        Lowercased, purely alphabetic tokens in their original order.
    """
    # 1. Strip HTML tags. The parser is named explicitly so the extracted
    #    text does not depend on which optional parsers happen to be
    #    installed, and so bs4 does not have to guess one.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Replace every non-letter character with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Lowercase and split on whitespace.
    words = review_text.lower().split()
    # 4. Optionally drop stopwords; a set keeps the membership test cheap.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

In [10]:

# word2vec is trained on a list of tokenised sentences (a list of lists of
# words), so each review is first split into sentences with NLTK's Punkt
# English tokenizer, which handles paragraphs better than a naive split.

import nltk.data
# Presumably needed once when the Punkt resource is not installed locally:
#nltk.download('popular')

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


In [11]:
# This function splits a review into sentences
def review_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and tokenise each one.

    The review is stripped of surrounding whitespace, handed to
    ``tokenizer.tokenize`` and every non-empty piece is converted with
    ``review_wordlist``.

    Returns a list of word lists, one per non-empty sentence.
    """
    pieces = tokenizer.tokenize(review.strip())
    return [
        review_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0
    ]


In [12]:

# Flatten every review into its tokenised sentences; stopwords are kept here
# because this list is the training corpus for word2vec.
print("Parsing sentences from training set")
sentences = []
for review in train["review"]:
    sentences.extend(review_sentences(review, tokenizer))
    

Parsing sentences from training set


In [13]:
# Sanity check: show the first tokenised sentence (a list of lowercase words).
# type(sentences)
sentences[0]

['naturally',
 'in',
 'a',
 'film',
 'who',
 's',
 'main',
 'themes',
 'are',
 'of',
 'mortality',
 'nostalgia',
 'and',
 'loss',
 'of',
 'innocence',
 'it',
 'is',
 'perhaps',
 'not',
 'surprising',
 'that',
 'it',
 'is',
 'rated',
 'more',
 'highly',
 'by',
 'older',
 'viewers',
 'than',
 'younger',
 'ones']

In [14]:
# Configure the built-in logging module at INFO level with a timestamped
# format, so that progress messages emitted while training are displayed.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [15]:
# Hyper-parameters for the word2vec model.
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # words seen fewer times than this are dropped
num_workers = 4       # number of parallel training threads
context = 10          # context window size
downsampling = 1e-3   # (0.001) down-sampling threshold for frequent words
# NOTE(review): with only 100 reviews, min_word_count = 40 keeps just 78 very
# frequent words (see the training log), which is why the vocabulary lookups
# in the following cells fail. Revisit this value when working on a sample.

# Build the vocabulary and train the model in one call.
from gensim.models import word2vec
print("Training model....")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Keep only the normalised vectors to make the model memory efficient.
model.init_sims(replace=True)

# Persist the model for later use; it can be reloaded with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)


2020-01-11 16:39:10,866 : INFO : collecting all words and their counts
2020-01-11 16:39:10,867 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-11 16:39:10,889 : INFO : collected 4661 word types from a corpus of 23423 raw words and 991 sentences
2020-01-11 16:39:10,891 : INFO : Loading a fresh vocabulary
2020-01-11 16:39:10,899 : INFO : effective_min_count=40 retains 78 unique words (1% of original 4661, drops 4583)
2020-01-11 16:39:10,901 : INFO : effective_min_count=40 leaves 11476 word corpus (48% of original 23423, drops 11947)
2020-01-11 16:39:10,903 : INFO : deleting the raw counts dictionary of 4661 items
2020-01-11 16:39:10,905 : INFO : sample=0.001 downsamples 78 most-common words
2020-01-11 16:39:10,906 : INFO : downsampling leaves estimated 3736 word corpus (32.6% of prior 11476)
2020-01-11 16:39:10,908 : INFO : estimated required memory for 78 words and 300 dimensions: 226200 bytes
2020-01-11 16:39:10,910 : INFO : resetting layer weights
20

Training model....


2020-01-11 16:39:11,045 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-11 16:39:11,050 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-11 16:39:11,051 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-11 16:39:11,054 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-01-11 16:39:11,055 : INFO : EPOCH - 5 : training on 23423 raw words (3654 effective words) took 0.0s, 263458 effective words/s
2020-01-11 16:39:11,056 : INFO : training on a 117115 raw words (18578 effective words) took 0.1s, 139927 effective words/s
2020-01-11 16:39:11,059 : INFO : precomputing L2-norms of word weight vectors
2020-01-11 16:39:11,081 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2020-01-11 16:39:11,082 : INFO : not storing attribute vectors_norm
2020-01-11 16:39:11,083 : INFO : not storing attribute cum_table
2020-01-11 16:39:11,097 : INFO : saved 300features_40minw

In [16]:
# Quick test: this should return the odd word out among the given ones.
# On the 100-review sample it raises ValueError instead, because none of
# these words is among the 78 words kept in the vocabulary.
model.wv.doesnt_match("man woman dog child kitchen".split())





ValueError: cannot select a word from an empty list

In [17]:
# Same odd-one-out test with place names; it also raises ValueError on the
# 100-review sample because none of these words is in the vocabulary.
model.wv.doesnt_match("france england germany berlin".split())




ValueError: cannot select a word from an empty list

In [18]:
# Show the words most similar to "man" according to the model.
# Raises KeyError on the 100-review sample: "man" is not in the vocabulary.
model.wv.most_similar("man")


KeyError: "word 'man' not in vocabulary"

In [19]:
# Show the words most similar to "awful".
# Raises KeyError on the 100-review sample: "awful" is not in the vocabulary.
model.wv.most_similar("awful")


KeyError: "word 'awful' not in vocabulary"

In [20]:
# Shape of the word-vector matrix: (number of words in the vocabulary,
# vector dimensionality).
model.wv.syn0.shape
# set(model.wv.index2word)

(78, 300)

In [43]:
# Look up the learned vector of a word that is present in the vocabulary.
model.wv['film']

array([ 2.61664037e-02,  8.24532807e-02,  3.76179861e-03,  5.81793138e-04,
        3.64488550e-02,  5.70453629e-02,  6.94968458e-03,  9.85511839e-02,
        3.93022858e-02,  4.59852293e-02, -1.03451647e-01,  1.65633857e-02,
       -4.80380058e-02,  8.45082924e-02, -4.23093960e-02, -7.86713809e-02,
       -1.45126462e-01,  1.10818952e-01, -2.29660925e-02,  4.75815535e-02,
       -2.11016741e-02,  2.50058789e-02,  4.41681854e-02, -4.76989802e-03,
       -5.67844845e-02,  3.12639922e-02,  2.37833560e-02,  2.79293414e-02,
        2.40612868e-02,  4.30039410e-03,  9.33803394e-02,  1.56919323e-02,
       -7.38396868e-02,  3.92112508e-03,  4.32204232e-02, -8.48979130e-02,
        7.38167986e-02,  7.16893524e-02,  3.33599672e-02, -5.46611957e-02,
        4.71505560e-02, -8.08743862e-05,  4.48456109e-02, -2.93611782e-03,
       -1.40388580e-02, -6.77136406e-02, -3.75110917e-02, -5.26984185e-02,
       -5.28386608e-02,  3.98561135e-02,  4.44052145e-02,  3.43946703e-02,
       -5.58889396e-02, -

In [45]:
# List every word kept in the model's vocabulary.
model.wv.index2word

['the',
 'a',
 'and',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 's',
 'as',
 'with',
 'but',
 'for',
 'was',
 'movie',
 'you',
 'he',
 'film',
 'are',
 'on',
 't',
 'his',
 'not',
 'one',
 'have',
 'be',
 'at',
 'all',
 'there',
 'by',
 'her',
 'who',
 'from',
 'they',
 'so',
 'if',
 'like',
 'some',
 'has',
 'just',
 'an',
 'or',
 'story',
 'when',
 'can',
 'very',
 'what',
 'about',
 'my',
 'out',
 'see',
 'she',
 'more',
 'me',
 'do',
 'we',
 'good',
 'really',
 'well',
 'would',
 'had',
 'into',
 'which',
 'movies',
 'its',
 'only',
 'time',
 'first',
 'up',
 'no',
 'character',
 'were',
 'then',
 'him',
 'get']

In [58]:
import pdb
# Function to average all word vectors in a paragraph
def featureVecMethod(words, model, num_features):
    """Average the vectors of all in-vocabulary words of one text.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    model : object exposing ``model.wv``
        Trained word2vec model; ``model.wv.index2word`` lists the known
        words and ``model.wv[word]`` returns a word's vector.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_features``. When none of the words
        is in the vocabulary the zero vector is returned instead of the
        NaN vector that a division by zero would produce.
    """
    # Accumulator for the sum of the word vectors.
    featureVec = np.zeros(num_features, dtype="float32")
    nwords = 0

    # index2word is a list; a set makes the membership test O(1).
    index2word_set = set(model.wv.index2word)

    for word in words:
        if word in index2word_set:
            nwords += 1
            # Index through model.wv, like the rest of the notebook, rather
            # than through the deprecated model[word] shortcut.
            featureVec = np.add(featureVec, model.wv[word])

    # Nothing to average: avoid 0/0, which would fill the vector with NaN.
    if nwords == 0:
        return featureVec

    # Divide the sum by the number of contributing words to get the mean.
    return np.divide(featureVec, nwords)



In [61]:
# Function for calculating the average feature vector
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Each element of ``reviews`` is passed to ``featureVecMethod`` and the
    result is stored in the corresponding row of a float32 array of shape
    ``(len(reviews), num_features)``. A progress line is printed for every
    1000th review, starting with the first.
    """
    total = len(reviews)
    averaged = np.zeros((total, num_features), dtype="float32")
    for idx, tokens in enumerate(reviews):
        # Status message on every 1000th review.
        if idx % 1000 == 0:
            print("Review %d of %d" % (idx, total))
        averaged[idx] = featureVecMethod(tokens, model, num_features)
    return averaged


In [62]:
# Tokenise every training review with stopwords removed, then turn each
# review into the average of its word vectors.
clean_train_reviews = [
    review_wordlist(review, remove_stopwords=True) for review in train['review']
]

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 100


In [50]:
# Confirm that the averaged features come back as a NumPy array.
type(trainDataVecs)

numpy.ndarray

In [28]:
# Calculating average feature vectors for the test set
# clean_test_reviews = []
# for review in test["review"]:
#     clean_test_reviews.append(review_wordlist(review,remove_stopwords=True))
    
# testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

In [54]:
# List the columns available in the loaded frame.
train.columns

Index(['id', 'review'], dtype='object')

In [18]:
# Train a 100-tree random forest on the averaged word vectors.
# NOTE(review): the column listing shown earlier contains only 'id' and
# 'review', so train["sentiment"] will fail unless the labelled file is loaded.
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)
print("Fitting random forest to training data....")
# fit() returns the estimator itself, so `forest` stays bound to the fitted model.
forest = forest.fit(trainDataVecs, train["sentiment"])
    

Fitting random forest to training data....


In [19]:
# Predict sentiment for the test vectors and write id/prediction pairs to CSV.
# NOTE(review): `testDataVecs` and `test` are not defined by any active cell
# shown here (the cell computing testDataVecs is commented out) -- confirm
# they are created before running this cell.
result = forest.predict(testDataVecs)
output = pd.DataFrame({"id": test["id"], "sentiment": result})
output.to_csv("output.csv", index=False, quoting=3)