In [2]:
import pandas as pd

# Read data from files
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Verify the number of reviews that were read (100,000 in total)
print("Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n" % (train["review"].size,  
 test["review"].size, unlabeled_train["review"].size ))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [10]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def review_to_wordlist(raw_reviews, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML Tags
    review_text = BeautifulSoup(raw_reviews, "lxml").get_text()
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    # 6. Lemmatize the words
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words]
    
    # 5. Return a list of words
    return(words)

In [11]:
import nltk
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [12]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer, True)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer, True)

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [13]:
print(len(sentences))
print(sentences[0])
print(sentences[1])

795538
['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker']
['maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent']


In [15]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-08-07 14:48:53,200 : INFO : 'pattern' package not found; tag filters are not available for English
2017-08-07 14:48:53,223 : INFO : collecting all words and their counts
2017-08-07 14:48:53,225 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-07 14:48:53,278 : INFO : PROGRESS: at sentence #10000, processed 114931 words, keeping 17627 word types
2017-08-07 14:48:53,323 : INFO : PROGRESS: at sentence #20000, processed 228988 words, keeping 24797 word types
2017-08-07 14:48:53,352 : INFO : PROGRESS: at sentence #30000, processed 339533 words, keeping 29883 word types
2017-08-07 14:48:53,393 : INFO : PROGRESS: at sentence #40000, processed 453983 words, keeping 34196 word types


Training model...


2017-08-07 14:48:53,431 : INFO : PROGRESS: at sentence #50000, processed 565006 words, keeping 37609 word types
2017-08-07 14:48:53,476 : INFO : PROGRESS: at sentence #60000, processed 676637 words, keeping 40571 word types
2017-08-07 14:48:53,512 : INFO : PROGRESS: at sentence #70000, processed 789005 words, keeping 43180 word types
2017-08-07 14:48:53,540 : INFO : PROGRESS: at sentence #80000, processed 899771 words, keeping 45561 word types
2017-08-07 14:48:53,587 : INFO : PROGRESS: at sentence #90000, processed 1013453 words, keeping 47982 word types
2017-08-07 14:48:53,625 : INFO : PROGRESS: at sentence #100000, processed 1125135 words, keeping 50054 word types
2017-08-07 14:48:53,663 : INFO : PROGRESS: at sentence #110000, processed 1236261 words, keeping 51928 word types
2017-08-07 14:48:53,700 : INFO : PROGRESS: at sentence #120000, processed 1348541 words, keeping 53966 word types
2017-08-07 14:48:53,737 : INFO : PROGRESS: at sentence #130000, processed 1461911 words, keeping 

2017-08-07 14:48:56,335 : INFO : PROGRESS: at sentence #770000, processed 8700018 words, keeping 121550 word types
2017-08-07 14:48:56,369 : INFO : PROGRESS: at sentence #780000, processed 8816138 words, keeping 122249 word types
2017-08-07 14:48:56,414 : INFO : PROGRESS: at sentence #790000, processed 8930668 words, keeping 122913 word types
2017-08-07 14:48:56,446 : INFO : collected 123351 word types from a corpus of 8993057 raw words and 795538 sentences
2017-08-07 14:48:56,448 : INFO : Loading a fresh vocabulary
2017-08-07 14:48:56,639 : INFO : min_count=40 retains 16340 unique words (13% of original 123351, drops 107011)
2017-08-07 14:48:56,640 : INFO : min_count=40 leaves 8433955 word corpus (93% of original 8993057, drops 559102)
2017-08-07 14:48:56,733 : INFO : deleting the raw counts dictionary of 123351 items
2017-08-07 14:48:56,744 : INFO : sample=0.001 downsamples 24 most-common words
2017-08-07 14:48:56,745 : INFO : downsampling leaves estimated 8064172 word corpus (95.6% 

2017-08-07 14:50:05,999 : INFO : PROGRESS: at 88.92% examples, 520341 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:07,002 : INFO : PROGRESS: at 90.20% examples, 520310 words/s, in_qsize 8, out_qsize 0
2017-08-07 14:50:08,010 : INFO : PROGRESS: at 91.46% examples, 520108 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:09,062 : INFO : PROGRESS: at 92.65% examples, 519216 words/s, in_qsize 7, out_qsize 1
2017-08-07 14:50:10,070 : INFO : PROGRESS: at 93.69% examples, 517813 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:11,084 : INFO : PROGRESS: at 94.75% examples, 516532 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:12,094 : INFO : PROGRESS: at 95.99% examples, 516248 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:13,113 : INFO : PROGRESS: at 97.24% examples, 515935 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:14,129 : INFO : PROGRESS: at 98.50% examples, 515771 words/s, in_qsize 7, out_qsize 0
2017-08-07 14:50:15,143 : INFO : PROGRESS: at 99.83% examples, 515946 wor

In [16]:
model.doesnt_match("man woman child kitchen".split())

'kitchen'

In [17]:
model.doesnt_match("france england germany berlin".split())

'berlin'

In [18]:
model.doesnt_match("paris berlin london austria".split())

'london'

In [19]:
model.most_similar("man")

[('lady', 0.5552449226379395),
 ('men', 0.5232553482055664),
 ('woman', 0.5166194438934326),
 ('lad', 0.47608307003974915),
 ('mans', 0.46191689372062683),
 ('monk', 0.4449968934059143),
 ('guy', 0.42385101318359375),
 ('person', 0.4234864115715027),
 ('farmer', 0.41675615310668945),
 ('widow', 0.40869805216789246)]

In [20]:
model.most_similar("queen")

[('latifah', 0.6391854286193848),
 ('princess', 0.6235027313232422),
 ('bride', 0.5827382802963257),
 ('prince', 0.5814833641052246),
 ('heiress', 0.5804899334907532),
 ('goddess', 0.5770130157470703),
 ('mistress', 0.5761330127716064),
 ('monarch', 0.55811607837677),
 ('auntie', 0.5540074110031128),
 ('queens', 0.5527753829956055)]

In [21]:
model.most_similar("awful")

[('terrible', 0.7863554954528809),
 ('atrocious', 0.7167770862579346),
 ('dreadful', 0.7135769724845886),
 ('horrible', 0.7100532054901123),
 ('horrid', 0.7070688605308533),
 ('abysmal', 0.7010518312454224),
 ('horrendous', 0.6881977319717407),
 ('appalling', 0.6536094546318054),
 ('lousy', 0.6527203321456909),
 ('crappy', 0.6411494016647339)]