In [3]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [4]:
# Reader options shared by all three TSV files: the first row holds the column
# names, fields are tab-separated, and quoting=3 (the value of csv.QUOTE_NONE)
# makes pandas treat double quotes as ordinary characters.
_tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("../BagofWords/labeledTrainData.tsv", **_tsv_options)
unlabeled_train = pd.read_csv("../BagofWords/unlabeledTrainData.tsv", **_tsv_options)
test = pd.read_csv("../BagofWords/testData.tsv", **_tsv_options)

In [20]:
_ENGLISH_STOPWORDS = None  # filled in lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(raw_review,remove_stopwords=False):
    """Convert a raw piece of review text into a list of lower-case words.

    Parameters
    ----------
    raw_review : str
        Raw text (a whole review or a single sentence); may contain HTML markup.
    remove_stopwords : bool, default False
        When True, drop every word found in NLTK's English stop-word list.

    Returns
    -------
    list of str
        The words of the text, in their original order, lower-cased and
        containing only the letters a-z.
    """
    # 1. Remove HTML. The parser is named explicitly so that the result does not
    #    depend on which optional parsers happen to be installed, and so that
    #    Beautiful Soup does not warn about a missing parser argument on every
    #    call. "html.parser" ships with Python, so no extra package is required.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace everything that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. Optionally remove stop words (the set is cached, not rebuilt per call).
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    return words

 Word2Vec expects single sentences, each one as a list of words.

We will use NLTK's punkt tokenizer for sentence splitting.

In [21]:
#Function to split a review into parsed sentences. Returns list of sentences, where each sentence is a list of words
def review_to_sentence(review,tokenizer,remove_stopwords=False):
    """Split one review into sentences and turn each sentence into a word list.

    `tokenizer` must provide a ``tokenize(text)`` method that returns the
    sentences of ``text``. Leading/trailing whitespace is stripped from the
    review before it is tokenized, and empty sentences are skipped.

    Returns a list with one entry per sentence; each entry is whatever
    ``review_to_words`` returns for that sentence (``remove_stopwords`` is
    passed straight through to it).
    """
    return [
        review_to_words(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 0
    ]

In [22]:
def build_dataset(data,tokenizer):
    """Collect the parsed sentences of every review in ``data`` into one flat list.

    ``data`` is indexed with ``data["review"]`` and the result is iterated, one
    review at a time. Each review is split with ``review_to_sentence`` (stop
    words are kept, since its default is used) and the per-review sentence
    lists are concatenated in order.
    """
    return [
        sentence
        for review in data["review"]
        for sentence in review_to_sentence(review, tokenizer)
    ]
        

In [23]:
import nltk

# Load the English punkt sentence tokenizer from the NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Parse the labeled and the unlabeled training reviews (in that order) ...
labeled_sentences, unlabeled_sentences = (
    build_dataset(reviews, tokenizer) for reviews in (train, unlabeled_train)
)

# ... and join them into the single corpus that Word2Vec is trained on.
sentences = labeled_sentences + unlabeled_sentences



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


<b> Hyperparameters for word2vec </b>
<p>Architecture: Architecture options are skip-gram (default) or continuous bag of words. We found that skip-gram was very slightly slower but produced better results.</p>
<p>Training algorithm: Hierarchical softmax (default) or negative sampling. For us, the default worked well.</p>
<p>Downsampling of frequent words: The Google documentation recommends values between .00001 and .001. For us, values closer to 0.001 seemed to improve the accuracy of the final model.</p>
<p>Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better models. Reasonable values can be in the tens to hundreds; we used 300.</p>
<p>Context / window size: How many words of context should the training algorithm take into account? 10 seems to work well for hierarchical softmax (more is better, up to a point).</p>
<p>Worker threads: Number of parallel processes to run. This is computer-specific, but between 4 and 6 should work on most systems.</p>
<p>Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100. In this case, since each movie occurs 30 times, we set the minimum word count to 40, to avoid attaching too much importance to individual movie titles. This resulted in an overall vocabulary size of around 15,000 words. Higher values also help limit run time.</p>

In [24]:
# Configure the standard logging module so that INFO-level messages (such as
# Word2Vec's progress reports) are shown with a timestamp and severity level.
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [25]:
# Values for the Word2Vec training parameters.

# Word vector dimensionality
num_features = 300

# Minimum word count
min_word_count = 40

# Number of threads to run in parallel
num_workers = 4

# Context window size
context = 10

# Downsample setting for frequent words
downsampling = 1e-3


In [26]:
from gensim.models import word2vec

print("training model..")

# Train on the combined sentence corpus using the parameters set earlier.
# NOTE(review): `size` is the keyword used by the gensim release this notebook
# was run with; newer gensim releases are believed to call it `vector_size` -
# confirm against the installed version before upgrading.
# NOTE(review): `sg` and `hs` are not passed, so gensim's own defaults decide
# the architecture and training algorithm - confirm they are the ones intended.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

2018-01-11 07:33:43,372 : INFO : collecting all words and their counts
2018-01-11 07:33:43,375 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-11 07:33:43,440 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-01-11 07:33:43,489 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-01-11 07:33:43,537 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types


training model..


2018-01-11 07:33:43,586 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2018-01-11 07:33:43,648 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2018-01-11 07:33:43,709 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2018-01-11 07:33:43,772 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2018-01-11 07:33:43,831 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2018-01-11 07:33:43,898 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2018-01-11 07:33:43,959 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2018-01-11 07:33:44,017 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2018-01-11 07:33:44,076 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keepin

2018-01-11 07:33:48,281 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2018-01-11 07:33:48,365 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2018-01-11 07:33:48,427 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2018-01-11 07:33:48,487 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2018-01-11 07:33:48,518 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795538 sentences
2018-01-11 07:33:48,519 : INFO : Loading a fresh vocabulary
2018-01-11 07:33:49,782 : INFO : min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2018-01-11 07:33:49,783 : INFO : min_count=40 leaves 17238940 word corpus (96% of original 17798082, drops 559142)
2018-01-11 07:33:49,902 : INFO : deleting the raw counts dictionary of 123504 items
2018-01-11 07:33:49,908 : INFO : sample=0.001 d

2018-01-11 07:34:57,830 : INFO : PROGRESS: at 57.05% examples, 538473 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:34:58,853 : INFO : PROGRESS: at 57.93% examples, 538676 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:34:59,854 : INFO : PROGRESS: at 58.78% examples, 538741 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:00,855 : INFO : PROGRESS: at 59.65% examples, 538894 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:01,862 : INFO : PROGRESS: at 60.51% examples, 539108 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:02,871 : INFO : PROGRESS: at 61.39% examples, 539294 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:03,898 : INFO : PROGRESS: at 62.28% examples, 539340 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:04,907 : INFO : PROGRESS: at 63.16% examples, 539540 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:05,918 : INFO : PROGRESS: at 64.04% examples, 539689 words/s, in_qsize 7, out_qsize 0
2018-01-11 07:35:06,926 : INFO : PROGRESS: at 64.90% examples, 539696 wor

In [27]:
# A descriptive file name makes the saved model easy to identify later
# (here: vector size, minimum word count and context window).
model_name = "300features_40minwords_10context"

# No further training is planned, so let init_sims replace the vectors with
# their normalised form; this makes the model much more memory-efficient.
model.init_sims(replace=True)

# Persist the model; it can be loaded again later with Word2Vec.load().
model.save(model_name)

2018-01-11 07:37:21,331 : INFO : precomputing L2-norms of word weight vectors
2018-01-11 07:37:21,516 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2018-01-11 07:37:21,517 : INFO : not storing attribute syn0norm
2018-01-11 07:37:21,518 : INFO : not storing attribute cum_table
2018-01-11 07:37:23,583 : INFO : saved 300features_40minwords_10context


In [28]:
# Query through the model's KeyedVectors (`model.wv`): calling most_similar
# directly on the Word2Vec object is deprecated and emits a warning.
model.wv.most_similar("good")

  """Entry point for launching an IPython kernel.


[('decent', 0.6780563592910767),
 ('bad', 0.6295445561408997),
 ('great', 0.6052792072296143),
 ('nice', 0.5935851335525513),
 ('lousy', 0.5736414790153503),
 ('cool', 0.5626237988471985),
 ('mediocre', 0.5481599569320679),
 ('fine', 0.5412446856498718),
 ('passable', 0.533780574798584),
 ('solid', 0.5323073267936707)]

In [30]:
# Query through the model's KeyedVectors (`model.wv`): calling doesnt_match
# directly on the Word2Vec object is deprecated and emits a warning.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [32]:
# Query through the model's KeyedVectors (`model.wv`): calling most_similar
# directly on the Word2Vec object is deprecated and emits a warning.
model.wv.most_similar("boring")

  """Entry point for launching an IPython kernel.


[('dull', 0.8046208620071411),
 ('tedious', 0.7967912554740906),
 ('pointless', 0.7260136604309082),
 ('uninteresting', 0.685673713684082),
 ('repetitive', 0.6551077365875244),
 ('confusing', 0.6518893241882324),
 ('tiresome', 0.6437580585479736),
 ('predictable', 0.6389776468276978),
 ('bland', 0.6065695285797119),
 ('tiring', 0.6009296178817749)]