In [None]:
import pandas as pd

# Load the three tab-separated review files. quoting=3 is csv.QUOTE_NONE,
# so double quotes inside the review text are kept as ordinary characters.
train = pd.read_csv(
    "../input/word2vec-tutorial-suite/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
test = pd.read_csv(
    "../input/word2vec-tutorial-suite/testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
unlabeled_train = pd.read_csv(
    "../input/word2vec-tutorial-suite/unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Sanity check on the row counts (the three files should total 100,000 reviews)
print(
    "Read %d labeled train reviews, %d labeled test reviews, "
    "and %d unlabeled reviews\n"
    % (train["review"].size, test["review"].size, unlabeled_train["review"].size)
)

In [None]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    """Convert a document to a list of lower-case words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case the text and split it on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, words found in NLTK's English stop-word list are dropped.
        Defaults to False.

    Returns
    -------
    list of str
        The cleaned words, in their original order.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed, and so
    #    BeautifulSoup does not warn about an unspecified parser on every call.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #
    # 2. Replace anything that is not an ASCII letter with a space
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (off by default)
    if remove_stopwords:
        # A set gives constant-time membership tests
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words

In [None]:
# Sentence splitting uses NLTK's punkt tokenizer. This cell only loads the
# tokenizer; it does not download anything.
# NOTE(review): the punkt data presumably has to be installed already
# (e.g. via nltk.download('punkt')) for the load below to succeed -- confirm.
import nltk.data


# Load the English punkt tokenizer from the NLTK data path
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)``, which returns the
        sentences of ``text`` as strings.
    remove_stopwords : bool, optional
        Forwarded to ``review_to_wordlist``. Defaults to False.

    Returns
    -------
    list of list of str
        One word list per sentence that contains at least one word.
    """
    # 1. Use the tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    #
    # 2. Clean each sentence into a word list
    sentences = []
    for raw_sentence in raw_sentences:
        # Skip empty strings outright
        if not raw_sentence:
            continue
        words = review_to_wordlist(raw_sentence, remove_stopwords)
        # A sentence made only of digits, punctuation or markup cleans down
        # to no words at all; skip it so the result has no empty sentences.
        if words:
            sentences.append(words)
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences

In [None]:
# Build the training corpus: every sentence from the labeled reviews first,
# then every sentence from the unlabeled reviews.
sentences = []

for reviews in (train["review"], unlabeled_train["review"]):
    for review in reviews:
        sentences.extend(review_to_sentences(review, tokenizer))

In [None]:
# Configure the built-in logging module first so that Word2Vec's progress
# messages are printed as timestamped lines
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Training parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Build and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# No further training is planned, so let init_sims replace the vectors
# in place, which makes the model much more memory-efficient.
model.init_sims(replace=True)

# Save under a name that records the settings used; the model can be
# reloaded later with Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

In [16]:
# Query the word vectors via model.wv; the model-level doesnt_match shortcut
# is deprecated in gensim and removed in 4.0.
model.wv.doesnt_match("man woman child children".split())

  """Entry point for launching an IPython kernel.


'children'

In [17]:
# Query the word vectors via model.wv; the model-level doesnt_match shortcut
# is deprecated in gensim and removed in 4.0.
model.wv.doesnt_match("paris london germany berlin".split())

  """Entry point for launching an IPython kernel.


'germany'

In [18]:
# Query the word vectors via model.wv; the model-level doesnt_match shortcut
# is deprecated in gensim and removed in 4.0.
model.wv.doesnt_match("paris berlin london austria".split())

  """Entry point for launching an IPython kernel.


'paris'

In [15]:
# Nearest neighbours are looked up on model.wv; the model-level most_similar
# shortcut is deprecated in gensim and removed in 4.0.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7620301842689514),
 ('horrible', 0.7423584461212158),
 ('atrocious', 0.7201402187347412),
 ('abysmal', 0.6973247528076172),
 ('dreadful', 0.6787852048873901),
 ('horrendous', 0.6770411133766174),
 ('appalling', 0.6730390191078186),
 ('horrid', 0.6381957530975342),
 ('lousy', 0.6321741342544556),
 ('amateurish', 0.619769811630249)]

In [20]:
# Look up the raw vector on model.wv; indexing the model object directly
# is deprecated in gensim and removed in 4.0.
model.wv['king']

  """Entry point for launching an IPython kernel.


array([-0.01909495,  0.13282436, -0.02914871,  0.02843413,  0.06585496,
        0.02988585,  0.01870233, -0.07107653, -0.01863946, -0.01044889,
       -0.02428217, -0.0400607 , -0.02384896, -0.03962354, -0.03405902,
       -0.0258411 , -0.00334333,  0.05864729,  0.00109282,  0.08652201,
       -0.01850603, -0.05783879,  0.03366202, -0.07850429, -0.01775524,
        0.02200716,  0.07836562, -0.01178716,  0.0265398 , -0.09631009,
        0.0282292 , -0.05130303, -0.04330629,  0.01755002, -0.07758372,
        0.03575379, -0.03498399, -0.00714112, -0.01436149, -0.0298424 ,
       -0.0798012 , -0.01896616, -0.03894292,  0.02342592, -0.03187858,
       -0.01620878, -0.06782202,  0.08301659, -0.00839741,  0.03056633,
        0.02305232, -0.03868194, -0.00081288, -0.08723932,  0.06564996,
       -0.00339666,  0.07917542, -0.10960374, -0.03831628,  0.06374227,
        0.05403523,  0.05580298,  0.00472564, -0.02740202,  0.01157806,
       -0.18899375,  0.06230469,  0.03150084,  0.08663816,  0.02