In [1]:
import pandas as pd

# The three TSV files are read with identical options: first row is the
# header, columns are tab-separated, and quoting=3 (csv.QUOTE_NONE) keeps
# quote characters inside the review text as ordinary characters.
train = pd.read_csv('labeledTrainData.tsv', sep='\t', header=0, quoting=3)
test = pd.read_csv('testData.tsv', sep='\t', header=0, quoting=3)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', sep='\t', header=0, quoting=3)

# Report how many reviews each frame holds (labeled, test, unlabeled).
review_counts = [len(frame["review"]) for frame in (train, test, unlabeled_train)]
print(" Reading {} labeled reviews, {} test reviews, {} unlabeled reviews".format(*review_counts))

 Reading 25000 labeled reviews, 25000 test reviews, 50000 unlabeled reviews


In [2]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or a single sentence) into a list of words.

    The text is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, and the result is lowercased and
    split on whitespace.

    Args:
        review: Raw text, possibly containing HTML tags.
        remove_stopwords: When True, NLTK's English stop words are dropped
            from the result. Defaults to False, which keeps every word.

    Returns:
        A list of lowercase words in their original order.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed alongside Beautiful Soup.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # Digits and punctuation become spaces, so "i've" yields "i" and "ve".
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()

    if remove_stopwords:
        # This flag used to be accepted but silently ignored.
        # Requires the NLTK "stopwords" corpus to be available locally.
        stops = set(stopwords.words("english"))
        words = [word for word in words if word not in stops]

    return words

In [3]:
import nltk.data
# Fetch the Punkt sentence-tokenizer data; the tokenizer loaded in a later
# cell reads from the 'tokenizers/punkt' resource this download provides.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/rahul/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Load the pre-trained English Punkt sentence tokenizer from the NLTK data
# directory; it is used to split each review into sentences.
# NOTE(review): this resource is a pickle file, and newer NLTK releases
# appear to ship Punkt as 'punkt_tab' instead - confirm against the
# installed NLTK version before re-running.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text; leading/trailing whitespace is stripped
            before sentence splitting.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences found in ``text``.
        remove_stopwords: Passed through unchanged to
            ``review_to_wordlist`` for every sentence.

    Returns:
        A list with one entry per non-empty sentence, where each entry is
        whatever ``review_to_wordlist`` returns for that sentence.
    """
    return [
        review_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0  # skip empty fragments produced by the tokenizer
    ]

In [9]:
# Build one flat list of tokenized sentences from both review sets;
# the labeled training reviews come first, then the unlabeled ones.
sentences = []

review_sources = (
    ("Parsing sentences from training set", train["review"]),
    ("Parsing sentences from unlabeled data", unlabeled_train["review"]),
)

for message, reviews in review_sources:
    print(message)
    for review in reviews:
        # Each review contributes zero or more word lists (one per sentence).
        sentences.extend(review_to_sentences(review, tokenizer))
    


Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled data


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [10]:
# Sanity check: total number of tokenized sentences collected so far.
print(len(sentences))

795538


In [12]:
# Spot-check the first entry: each sentence is stored as a list of
# lowercase word strings.
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [14]:
import logging

# Configure logging first so that gensim's INFO-level progress messages
# show up in the cell output during vocabulary building and training.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

from gensim.models import word2vec

# Training hyperparameters, passed to Word2Vec below.
num_features = 300     # -> size: dimensionality of the word vectors
min_word_count = 40    # -> min_count: words rarer than this are dropped
num_workers = 4        # -> workers: number of training threads
context = 10           # -> window: context window size
downsampling = 1e-3    # -> sample: downsampling threshold for frequent words

print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# NOTE(review): per the gensim docs, init_sims(replace=True) keeps only the
# normalized vectors to save memory, after which the model presumably cannot
# be trained further - confirm that no further training is intended.
model.init_sims(replace=True)

# Persist the model; the file name encodes the main hyperparameters.
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-10-26 16:11:47,689 : INFO : collecting all words and their counts
2018-10-26 16:11:47,690 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-26 16:11:47,760 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-10-26 16:11:47,824 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-10-26 16:11:47,881 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types


Training model...


2018-10-26 16:11:47,941 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-10-26 16:11:48,015 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-10-26 16:11:48,074 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-10-26 16:11:48,132 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-10-26 16:11:48,196 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-10-26 16:11:48,254 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2018-10-26 16:11:48,314 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2018-10-26 16:11:48,374 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2018-10-26 16:11:48,439 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keepin

2018-10-26 16:11:52,471 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2018-10-26 16:11:52,537 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2018-10-26 16:11:52,601 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2018-10-26 16:11:52,662 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2018-10-26 16:11:52,696 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795538 sentences
2018-10-26 16:11:52,696 : INFO : Loading a fresh vocabulary
2018-10-26 16:11:52,787 : INFO : effective_min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2018-10-26 16:11:52,788 : INFO : effective_min_count=40 leaves 17238940 word corpus (96% of original 17798082, drops 559142)
2018-10-26 16:11:52,846 : INFO : deleting the raw counts dictionary of 123504 items
2018-10-26 16:11:52,851 : I

2018-10-26 16:12:46,578 : INFO : EPOCH - 2 : training on 17798082 raw words (12750380 effective words) took 26.4s, 483594 effective words/s
2018-10-26 16:12:47,615 : INFO : EPOCH 3 - PROGRESS: at 3.81% examples, 472749 words/s, in_qsize 6, out_qsize 1
2018-10-26 16:12:48,655 : INFO : EPOCH 3 - PROGRESS: at 7.88% examples, 483752 words/s, in_qsize 8, out_qsize 2
2018-10-26 16:12:49,668 : INFO : EPOCH 3 - PROGRESS: at 11.95% examples, 491995 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:12:50,683 : INFO : EPOCH 3 - PROGRESS: at 15.84% examples, 490716 words/s, in_qsize 6, out_qsize 1
2018-10-26 16:12:51,703 : INFO : EPOCH 3 - PROGRESS: at 19.71% examples, 487813 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:12:52,720 : INFO : EPOCH 3 - PROGRESS: at 23.21% examples, 479350 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:12:53,724 : INFO : EPOCH 3 - PROGRESS: at 27.36% examples, 485984 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:12:54,731 : INFO : EPOCH 3 - PROGRESS: at 31.20% examples

2018-10-26 16:13:52,177 : INFO : EPOCH 5 - PROGRESS: at 53.83% examples, 520756 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:53,181 : INFO : EPOCH 5 - PROGRESS: at 57.90% examples, 521166 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:54,181 : INFO : EPOCH 5 - PROGRESS: at 61.97% examples, 521235 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:55,197 : INFO : EPOCH 5 - PROGRESS: at 66.12% examples, 521203 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:56,212 : INFO : EPOCH 5 - PROGRESS: at 70.22% examples, 520823 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:57,224 : INFO : EPOCH 5 - PROGRESS: at 74.40% examples, 521357 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:58,253 : INFO : EPOCH 5 - PROGRESS: at 78.61% examples, 521345 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:13:59,261 : INFO : EPOCH 5 - PROGRESS: at 82.77% examples, 521524 words/s, in_qsize 7, out_qsize 0
2018-10-26 16:14:00,265 : INFO : EPOCH 5 - PROGRESS: at 86.81% examples, 521126 words/s, in_qsiz

In [15]:
# Query through model.wv (the KeyedVectors): calling doesnt_match directly
# on the Word2Vec object is deprecated in gensim 3.x and removed in 4.0.
# Returns the word that fits least with the others in the list.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


'kitchen'

In [16]:
# Query through model.wv (the KeyedVectors): calling doesnt_match directly
# on the Word2Vec object is deprecated in gensim 3.x and removed in 4.0.
# Three capital cities and one country: the country should be the odd one out.
model.wv.doesnt_match("paris berlin london austria".split())

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


'austria'

In [23]:
# Query through model.wv (the KeyedVectors): calling most_similar directly
# on the Word2Vec object is deprecated in gensim 3.x and removed in 4.0.
# Returns (word, similarity) pairs for the nearest neighbours of "man".
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.
2018-10-26 18:01:25,442 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.6087762713432312),
 ('lady', 0.5881526470184326),
 ('lad', 0.5773169994354248),
 ('farmer', 0.5383074283599854),
 ('millionaire', 0.5143644213676453),
 ('guy', 0.5108907222747803),
 ('monk', 0.5084243416786194),
 ('person', 0.508341908454895),
 ('men', 0.5066123008728027),
 ('soldier', 0.4977167248725891)]

In [30]:
import numpy as np
from gensim.models import Word2Vec
# Reload the model from disk, using the same file name the training cell
# saved it under, so later cells can run without retraining.
# NOTE(review): gensim's load appears to rely on pickle under the hood -
# only load model files from a trusted source.
model = Word2Vec.load("300features_40minwords_10context")

# Display the loaded object's class as a quick check that loading worked.
type(model)


2018-10-26 18:26:42,504 : INFO : loading Word2Vec object from 300features_40minwords_10context
2018-10-26 18:26:42,864 : INFO : loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2018-10-26 18:26:42,865 : INFO : setting ignored attribute vectors_norm to None
2018-10-26 18:26:42,866 : INFO : loading vocabulary recursively from 300features_40minwords_10context.vocabulary.* with mmap=None
2018-10-26 18:26:42,866 : INFO : loading trainables recursively from 300features_40minwords_10context.trainables.* with mmap=None
2018-10-26 18:26:42,867 : INFO : setting ignored attribute cum_table to None
2018-10-26 18:26:42,869 : INFO : loaded 300features_40minwords_10context


gensim.models.word2vec.Word2Vec

In [None]:
def makeFeatureVec(words, model, num_features):
    