In [17]:
import pandas as pd
import numpy as np

In [18]:
# All three files are tab-separated with a header line; quoting=3 is
# csv.QUOTE_NONE, so double quotes inside a review are kept as literal text.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv('labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('testData.tsv', **tsv_options)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', **tsv_options)

# Display the (rows, columns) shape of the unlabeled set as the cell output.
unlabeled_train.shape

(50000, 2)

In [19]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordList(raw_review):
    """Convert one raw review (or a single sentence) into lowercase words.

    HTML markup is stripped, every character outside a-z / A-Z is replaced
    by a space, and the remaining text is lower-cased and split on
    whitespace.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML tags.

    Returns
    -------
    list of str
        The words in their original order. Stop words are NOT removed.
    """
    # Name the parser explicitly: without it Beautiful Soup uses whichever
    # parser happens to be installed (and emits a warning), so the extracted
    # text could differ from one machine to another.
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()

    # Digits and punctuation become spaces, so they act as word separators.
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)

    return letters_only.lower().split()

In [26]:
import nltk.data

# Pickled Punkt tokenizer for English, used to split text into sentences.
punkt_resource = 'tokenizers/punkt/english.pickle'
tokenizer = nltk.data.load(punkt_resource)

def review_to_sentences(review, tokenizer):
    """Split a review into sentences, each returned as a list of words.

    `tokenizer` must provide a `tokenize(text)` method that returns the
    sentence strings found in `text`. Every non-empty sentence is converted
    to a word list with `review_to_wordList`.

    Returns a list of word lists, one per sentence, in document order.
    """
    # Leading/trailing whitespace is removed before sentence splitting.
    return [
        review_to_wordList(chunk)
        for chunk in tokenizer.tokenize(review.strip())
        if len(chunk) > 0
    ]

In [21]:
# Preview how the first training review splits into sentences once its HTML
# is stripped. The parser is named explicitly so the result does not depend
# on which optional parsers are installed.
tokenizer.tokenize(BeautifulSoup(train['review'][0], 'html.parser').get_text())

[u'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.',
 u'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.',
 u'Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.',
 u"Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring.",
 u'Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally sta

In [33]:
sentences = []  # Initialize an empty list of sentences

# The labeled and the unlabeled reviews both feed the same sentence corpus,
# so they share one loop instead of two copies of it.
for reviews in (train["review"], unlabeled_train["review"]):
    for review in reviews:
        # Decode byte strings only. Text that is already unicode must not be
        # decoded again: on Python 2 that triggers an implicit ASCII encode
        # that fails on non-ASCII characters, and on Python 3 `str` has no
        # .decode method at all.
        if isinstance(review, bytes):
            review = review.decode('utf8')
        sentences.extend(review_to_sentences(review, tokenizer))

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [39]:
# Number of sentences collected; the call form of print works on both
# Python 2 and Python 3.
print(len(sentences))

795538


In [40]:
import logging
# Logging is configured before gensim is imported so that gensim's
# INFO-level messages appear in the cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Word2Vec hyper-parameters
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # minimum count for a word to be kept
num_workers = 4       # number of parallel worker threads
context = 10          # size of the context window
downsampling = 1e-3   # down-sampling setting for frequent words


from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Replace the raw vectors with their normalized form to save memory; per the
# gensim documentation the model cannot be trained further after this.
model.init_sims(replace=True)

# The file name records the settings chosen above.
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-01-07 18:05:03,862 : INFO : 'pattern' package not found; tag filters are not available for English
2017-01-07 18:05:04,087 : INFO : collecting all words and their counts
2017-01-07 18:05:04,088 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-07 18:05:04,264 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-01-07 18:05:04,347 : INFO : PROGRESS: at sentence #20000, processed 451887 words, keeping 24948 word types
2017-01-07 18:05:04,421 : INFO : PROGRESS: at sentence #30000, processed 671310 words, keeping 30034 word types
2017-01-07 18:05:04,499 : INFO : PROGRESS: at sentence #40000, processed 897810 words, keeping 34348 word types
2017-01-07 18:05:04,593 : INFO : PROGRESS: at sentence #50000, processed 1116958 words, keeping 37761 word types
2017-01-07 18:05:04,678 : INFO : PROGRESS: at sentence #60000, processed 1338399 words, keeping 40723 word types
2017-01-07 18:05:04,841 : INFO : PROGRESS: at senten

In [42]:
# Sanity check: ask the model which of these words fits least with the rest.
model.doesnt_match('man woman child kitchen'.split())

'kitchen'

In [45]:
# Odd-one-out query on place names (the list mixes countries and cities).
model.doesnt_match('india paris england berlin'.split())

'berlin'

In [50]:
# Words the model rates as most similar to 'man', as (word, score) pairs.
model.most_similar('man')

[(u'woman', 0.6119527816772461),
 (u'lady', 0.5812598466873169),
 (u'lad', 0.5498437881469727),
 (u'millionaire', 0.5329727530479431),
 (u'guy', 0.5293763875961304),
 (u'monk', 0.5229437351226807),
 (u'men', 0.514629602432251),
 (u'farmer', 0.5098802447319031),
 (u'businessman', 0.5018507838249207),
 (u'soldier', 0.4967585504055023)]

In [52]:
# Words the model rates as most similar to 'terrible', as (word, score) pairs.
model.most_similar('terrible')

[(u'horrible', 0.8846541047096252),
 (u'atrocious', 0.79456627368927),
 (u'dreadful', 0.7850203514099121),
 (u'awful', 0.7802629470825195),
 (u'horrendous', 0.7797378301620483),
 (u'horrid', 0.7629272937774658),
 (u'lousy', 0.7255361080169678),
 (u'abysmal', 0.7225754261016846),
 (u'laughable', 0.670087993144989),
 (u'bad', 0.6683333516120911)]