In [5]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import logging
from gensim.models import word2vec

In [6]:
# The three TSV files are read with identical parser settings: the first row
# is the header, fields are tab-separated, and quoting=3 (csv.QUOTE_NONE)
# makes pandas treat double quotes as ordinary characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", **tsv_options)

In [7]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or sentence) into a list of lower-case tokens.

    Args:
        review: Text that may contain HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words from the
            result. Defaults to False.

    Returns:
        list: The lower-cased tokens, in their original order.
    """
    # 1. Strip HTML. The parser is named explicitly so the extracted text does
    #    not depend on which optional parsers (e.g. lxml) happen to be
    #    installed, and so Beautiful Soup does not warn about guessing one.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every character that is not an ASCII letter or digit with a
    #    space (digits are kept on purpose by the character class).
    review_text = re.sub("[^a-zA-Z0-9]", " ", review_text)

    # 3. Convert to lower case and split on whitespace
    words = review_text.lower().split()

    # 4. Optionally remove stop words (off by default)
    if remove_stopwords:
        # A set gives constant-time membership tests in the filter below.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

In [8]:
# Resource path of the English Punkt sentence tokenizer inside the NLTK data
# directory; the loaded object is passed to review_to_sentences.
punkt_english_path = 'tokenizers/punkt/english.pickle'
tokenizer = nltk.data.load(punkt_english_path)

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and tokenise each sentence into words.

    Args:
        review: The review text; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences found in ``text``.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        list: One word list (as produced by ``review_to_wordlist``) per
        non-empty sentence, in the order the tokenizer returned them.
    """
    stripped_review = review.strip()
    return [
        review_to_wordlist(sentence, remove_stopwords)
        for sentence in tokenizer.tokenize(stripped_review)
        if len(sentence) > 0  # skip empty fragments from the tokenizer
    ]

In [14]:
# Collect the tokenised sentences of both the labelled and the unlabelled
# reviews into one flat list (one word list per sentence).
sentences = []

for message, reviews in (
        ("Parsing sentences from training set", train["review"]),
        ("Parsing sentences from unlabeled set", unlabeled_train["review"])):
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # matching the print(...) calls used later in the notebook.
    print(message)
    for review in reviews:
        # On Python 2 the column holds byte strings that must be decoded
        # before tokenising; text that is already unicode is passed through
        # unchanged instead of failing on .decode().
        if isinstance(review, bytes):
            review = review.decode("utf8")
        sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [16]:
# Sanity check: how many sentences were collected, then the first one.
sentence_count = len(sentences)
print(sentence_count)
first_sentence = sentences[0]
print(first_sentence)

795538
[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']


In [17]:
# Emit timestamped INFO-level log records (the training progress lines).
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Hyperparameters handed to Word2Vec in the training cell.
num_features = 400     # dimensionality of each word vector
min_word_count = 40    # words seen fewer times than this are ignored
num_workers = 4        # number of worker threads used for training
context = 12           # size of the context window, in words
downsampling = 1e-3    # down-sampling threshold for very frequent words

In [18]:
# print(...) with a single argument behaves the same on Python 2 and 3,
# matching the print(...) calls used elsewhere in the notebook.
print("Training model...")

# Train word vectors on the collected sentences using the hyperparameters
# defined in the configuration cell.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` and deprecated
# `init_sims`; these calls match the older gensim API this notebook was run
# with -- confirm the installed version before upgrading.
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# Persist the trained model so it can be loaded later without retraining.
model_name = "w2v"
model.save(model_name)

2017-11-02 17:46:59,627 : INFO : collecting all words and their counts
2017-11-02 17:46:59,628 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2017-11-02 17:47:00,173 : INFO : PROGRESS: at sentence #10000, processed 227240 words, keeping 18038 word types
2017-11-02 17:47:00,305 : INFO : PROGRESS: at sentence #20000, processed 454577 words, keeping 25324 word types
2017-11-02 17:47:00,464 : INFO : PROGRESS: at sentence #30000, processed 675275 words, keeping 30478 word types
2017-11-02 17:47:00,570 : INFO : PROGRESS: at sentence #40000, processed 903015 words, keeping 34863 word types
2017-11-02 17:47:00,659 : INFO : PROGRESS: at sentence #50000, processed 1123504 words, keeping 38329 word types
2017-11-02 17:47:00,781 : INFO : PROGRESS: at sentence #60000, processed 1346265 words, keeping 41338 word types
2017-11-02 17:47:00,901 : INFO : PROGRESS: at sentence #70000, processed 1570739 words, keeping 43986 word types
2017-11-02 17:47:01,047 : INFO : PROGRESS: at sentence #80000, processed 1791249 words, keeping 46400 word types
2017-11-02 17:47:01,161 : INFO : PROGRESS: at sentence #90000, processed 2016723 words, keeping 4886

2017-11-02 17:47:09,707 : INFO : PROGRESS: at sentence #730000, processed 16426860 words, keeping 121513 word types
2017-11-02 17:47:09,811 : INFO : PROGRESS: at sentence #740000, processed 16649236 words, keeping 122242 word types
2017-11-02 17:47:09,884 : INFO : PROGRESS: at sentence #750000, processed 16868896 words, keeping 122892 word types
2017-11-02 17:47:10,016 : INFO : PROGRESS: at sentence #760000, processed 17089573 words, keeping 123538 word types
2017-11-02 17:47:10,133 : INFO : PROGRESS: at sentence #770000, processed 17318060 words, keeping 124325 word types
2017-11-02 17:47:10,243 : INFO : PROGRESS: at sentence #780000, processed 17549563 words, keeping 125051 word types
2017-11-02 17:47:10,324 : INFO : PROGRESS: at sentence #790000, processed 17777883 words, keeping 125739 word types
2017-11-02 17:47:10,386 : INFO : collected 126186 word types from a corpus of 17901685 raw words and 795538 sentences
2017-11-02 17:47:10,391 : INFO : Loading a fresh vocabulary
2017-11-02

2017-11-02 17:48:18,022 : INFO : PROGRESS: at 26.35% examples, 258334 words/s, in_qsize 7, out_qsize 0
2017-11-02 17:48:19,026 : INFO : PROGRESS: at 26.95% examples, 260240 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:48:20,058 : INFO : PROGRESS: at 27.55% examples, 262001 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:48:21,068 : INFO : PROGRESS: at 28.14% examples, 263688 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:48:22,074 : INFO : PROGRESS: at 28.70% examples, 265134 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:48:23,079 : INFO : PROGRESS: at 29.29% examples, 266641 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:48:24,086 : INFO : PROGRESS: at 29.86% examples, 268104 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:48:25,095 : INFO : PROGRESS: at 30.46% examples, 269617 words/s, in_qsize 7, out_qsize 0
2017-11-02 17:48:26,111 : INFO : PROGRESS: at 31.04% examples, 271059 words/s, in_qsize 7, out_qsize 0
2017-11-02 17:48:27,136 : INFO : PROGRESS: at 31.63% examples, 272523 wor

2017-11-02 17:49:39,639 : INFO : PROGRESS: at 66.37% examples, 289987 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:40,653 : INFO : PROGRESS: at 66.71% examples, 289450 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:41,702 : INFO : PROGRESS: at 67.05% examples, 288904 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:42,716 : INFO : PROGRESS: at 67.40% examples, 288436 words/s, in_qsize 7, out_qsize 0
2017-11-02 17:49:43,727 : INFO : PROGRESS: at 67.78% examples, 288123 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:44,756 : INFO : PROGRESS: at 68.14% examples, 287729 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:45,781 : INFO : PROGRESS: at 68.51% examples, 287347 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:46,796 : INFO : PROGRESS: at 68.85% examples, 286900 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:47,831 : INFO : PROGRESS: at 69.19% examples, 286375 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:49:48,881 : INFO : PROGRESS: at 69.56% examples, 286009 wor

2017-11-02 17:51:01,662 : INFO : PROGRESS: at 94.37% examples, 264865 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:51:02,677 : INFO : PROGRESS: at 94.70% examples, 264631 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:51:03,753 : INFO : PROGRESS: at 95.05% examples, 264395 words/s, in_qsize 6, out_qsize 2
2017-11-02 17:51:04,766 : INFO : PROGRESS: at 95.41% examples, 264199 words/s, in_qsize 8, out_qsize 2
2017-11-02 17:51:05,765 : INFO : PROGRESS: at 95.76% examples, 264020 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:51:06,782 : INFO : PROGRESS: at 96.11% examples, 263851 words/s, in_qsize 7, out_qsize 0
2017-11-02 17:51:07,864 : INFO : PROGRESS: at 96.48% examples, 263646 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:51:08,866 : INFO : PROGRESS: at 96.84% examples, 263498 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:51:09,899 : INFO : PROGRESS: at 97.18% examples, 263290 words/s, in_qsize 8, out_qsize 0
2017-11-02 17:51:10,900 : INFO : PROGRESS: at 97.52% examples, 263087 wor