In [1]:
import cPickle as pickle
import os
import sys
import tarfile
import urllib
from os.path import isfile, isdir
from tqdm import tqdm
import glob
import csv

import math
import numpy as np
import pandas as pd
from scipy import misc
import random

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import nltk.data
from gensim.models import word2vec

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Random seed handed to every Word2Vec model trained in the cells below
# (passed as `seed=SEED`).
# NOTE(review): gensim documents that a seed alone only gives repeatable
# results with a single worker thread; the models here use 4 workers —
# confirm whether exact run-to-run reproducibility is required.
SEED = 3

# Load dataset

In [9]:
# Working directories: `data_root` holds the pickled corpus that is loaded
# below, `model_root` receives the pickled word2vec models.
data_root = 'data/'
model_root = 'model/'

# Create both directories up front. The original second call created
# `data_root` again, so `model_root` never existed and saving a model failed.
for directory in (data_root, model_root):
    try:
        os.mkdir(directory)
    except OSError:
        # "Already exists" is fine; any other failure (permissions, a regular
        # file in the way, ...) must not be silently ignored.
        if not os.path.isdir(directory):
            raise

In [3]:
# Load the pickled training corpus used by every model below.
# Judging by the sample printed in the next cell, each entry is one
# sentence as a list of unicode tokens.
# NOTE: unpickling executes arbitrary code — only load files you produced.
train_x_w2v_path = os.path.join(data_root, 'train_x_w2v.pkl')
with open(train_x_w2v_path, 'rb') as f:
    train_x_w2v = pickle.load(f)

In [4]:
# Sanity check: number of entries in the corpus, then one sample entry.
print(len(train_x_w2v))
print(train_x_w2v[1001])

808700
[u'something', u'should', u'be', u'done', u'about', u'this', u'as', u'it', u'is', u'a', u'must', u'see', u'film', u'not', u'to', u'mention', u'the', u'good', u'it', u'could', u'have', u'done', u'for', u'our', u'tourism', u'industry']


# Train and explore word2vec model 1 (baseline: 300 dimensions, window 10, downsampling 1e-3)

In [24]:
def train_w2v1(sentences=None,
               num_features=300,
               min_word_count=40,
               num_workers=4,
               context=10,
               downsampling=1e-3,
               seed=None):
    """Train word2vec model 1 (the baseline configuration).

    Called without arguments this behaves exactly as before: it trains on the
    notebook-level ``train_x_w2v`` with ``SEED`` and the defaults below. Every
    setting can now be overridden per call instead of editing the function.

    Args:
        sentences: Iterable of tokenised sentences. Defaults to the
            notebook-level ``train_x_w2v``.
        num_features: Word vector dimensionality (gensim ``size``).
        min_word_count: Minimum word count (gensim ``min_count``).
        num_workers: Number of threads to run in parallel (``workers``).
        context: Context window size (``window``).
        downsampling: Downsample setting for frequent words (``sample``).
        seed: Random seed. Defaults to the notebook-level ``SEED``.

    Returns:
        The ``word2vec.Word2Vec`` instance built from ``sentences``.
    """
    # Resolve notebook globals at call time so a no-argument call keeps
    # picking up whatever the notebook currently defines.
    if sentences is None:
        sentences = train_x_w2v
    if seed is None:
        seed = SEED

    # NOTE(review): gensim documents that a fixed seed is only fully
    # repeatable with a single worker; with several workers results may
    # differ slightly between runs — confirm if that matters here.
    return word2vec.Word2Vec(sentences,
                             workers=num_workers,
                             size=num_features,
                             min_count=min_word_count,
                             window=context,
                             sample=downsampling,
                             seed=seed)

In [25]:
# Train model 1 with its default settings. There is no cache guard here, so
# this cell retrains from scratch every time it is executed.
model_w2v1 = train_w2v1()

In [26]:
# Pickle model 1 to <model_root>/model_w2v1.pkl.
model_path = os.path.join(model_root, 'model_w2v1.pkl')
model_is_cached = isfile(model_path)

# Write-once: an existing pickle is never overwritten.
# NOTE(review): a file left by an earlier run with different settings will
# therefore stay on disk — delete it by hand to refresh.
if not model_is_cached:
    with open(model_path, 'wb') as f:
        pickle.dump(model_w2v1, f)

In [33]:
# Number of words in model 1's vocabulary.
print(len(model_w2v1.wv.index2word))

16490


In [27]:
# Nearest neighbours of "man" in model 1. Queried through the word vectors
# (`.wv`), as the vocabulary-size cell already does; calling `most_similar`
# on the model object itself is deprecated in gensim.
model_w2v1.wv.most_similar("man")

[(u'woman', 0.6248248219490051),
 (u'lady', 0.5977152585983276),
 (u'lad', 0.5547207593917847),
 (u'millionaire', 0.5305410027503967),
 (u'guy', 0.5232149958610535),
 (u'monk', 0.5214899778366089),
 (u'men', 0.52138751745224),
 (u'person', 0.5201945304870605),
 (u'soldier', 0.5190520882606506),
 (u'chap', 0.5061797499656677)]

In [28]:
# Nearest neighbours of "woman" in model 1. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v1.wv.most_similar("woman")

[(u'lady', 0.6905761957168579),
 (u'prostitute', 0.676874041557312),
 (u'girl', 0.674848735332489),
 (u'widow', 0.6375435590744019),
 (u'nun', 0.6270597577095032),
 (u'man', 0.6248247623443604),
 (u'housewife', 0.592847466468811),
 (u'heiress', 0.5769104957580566),
 (u'waitress', 0.5751146078109741),
 (u'whore', 0.5649911165237427)]

In [29]:
# Nearest neighbours of "awful" in model 1. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v1.wv.most_similar("awful")

[(u'terrible', 0.7772277593612671),
 (u'atrocious', 0.7456122636795044),
 (u'horrible', 0.7306403517723083),
 (u'dreadful', 0.7120442986488342),
 (u'abysmal', 0.7037981748580933),
 (u'horrid', 0.687732458114624),
 (u'horrendous', 0.6795014142990112),
 (u'appalling', 0.6652437448501587),
 (u'amateurish', 0.6254571080207825),
 (u'laughable', 0.6153308153152466)]

# Train and explore word2vec model 2 (600 dimensions, otherwise as model 1)

In [6]:
def train_w2v2(sentences=None,
               num_features=600,
               min_word_count=40,
               num_workers=4,
               context=10,
               downsampling=1e-3,
               seed=None):
    """Train word2vec model 2 (600-dimensional word vectors).

    Called without arguments this behaves exactly as before: it trains on the
    notebook-level ``train_x_w2v`` with ``SEED`` and the defaults below. Every
    setting can now be overridden per call instead of editing the function.

    Args:
        sentences: Iterable of tokenised sentences. Defaults to the
            notebook-level ``train_x_w2v``.
        num_features: Word vector dimensionality (gensim ``size``).
        min_word_count: Minimum word count (gensim ``min_count``).
        num_workers: Number of threads to run in parallel (``workers``).
        context: Context window size (``window``).
        downsampling: Downsample setting for frequent words (``sample``).
        seed: Random seed. Defaults to the notebook-level ``SEED``.

    Returns:
        The ``word2vec.Word2Vec`` instance built from ``sentences``.
    """
    # Resolve notebook globals at call time so a no-argument call keeps
    # picking up whatever the notebook currently defines.
    if sentences is None:
        sentences = train_x_w2v
    if seed is None:
        seed = SEED

    # NOTE(review): gensim documents that a fixed seed is only fully
    # repeatable with a single worker — confirm if that matters here.
    return word2vec.Word2Vec(sentences,
                             workers=num_workers,
                             size=num_features,
                             min_count=min_word_count,
                             window=context,
                             sample=downsampling,
                             seed=seed)

In [7]:
# Train model 2 with its default settings. There is no cache guard here, so
# this cell retrains from scratch every time it is executed.
model_w2v2 = train_w2v2()

In [10]:
# Pickle model 2 to <model_root>/model_w2v2.pkl.
model_path = os.path.join(model_root, 'model_w2v2.pkl')
model_is_cached = isfile(model_path)

# Write-once: an existing pickle is never overwritten.
# NOTE(review): a file left by an earlier run with different settings will
# therefore stay on disk — delete it by hand to refresh.
if not model_is_cached:
    with open(model_path, 'wb') as f:
        pickle.dump(model_w2v2, f)

In [13]:
# Number of words in model 2's vocabulary.
print(len(model_w2v2.wv.index2word))

16490


In [22]:
# Nearest neighbours of "man" in model 2. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v2.wv.most_similar("man")

[(u'woman', 0.6183376312255859),
 (u'lady', 0.6040821671485901),
 (u'lad', 0.5557723045349121),
 (u'farmer', 0.524786651134491),
 (u'soldier', 0.5216670036315918),
 (u'guy', 0.514202892780304),
 (u'millionaire', 0.5115393400192261),
 (u'person', 0.5098527669906616),
 (u'men', 0.5051754713058472),
 (u'businessman', 0.5040754079818726)]

In [23]:
# Nearest neighbours of "woman" in model 2. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v2.wv.most_similar("woman")

[(u'lady', 0.6834504008293152),
 (u'prostitute', 0.6771471500396729),
 (u'girl', 0.6627585291862488),
 (u'widow', 0.6605029106140137),
 (u'man', 0.6183376312255859),
 (u'nun', 0.6017443537712097),
 (u'waitress', 0.5932685136795044),
 (u'housewife', 0.5884047746658325),
 (u'whore', 0.5780031681060791),
 (u'nurse', 0.5773943662643433)]

In [11]:
# Nearest neighbours of "awful" in model 2. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v2.wv.most_similar("awful")

[(u'terrible', 0.7653821706771851),
 (u'atrocious', 0.7270001173019409),
 (u'abysmal', 0.7128437757492065),
 (u'horrible', 0.7084749937057495),
 (u'dreadful', 0.7037884593009949),
 (u'horrendous', 0.6744968295097351),
 (u'horrid', 0.6570312976837158),
 (u'appalling', 0.6473177075386047),
 (u'lousy', 0.6323243379592896),
 (u'amateurish', 0.6225792169570923)]

# Train and explore word2vec model 3 (downsampling 1e-5 instead of 1e-3, otherwise as model 1)

In [14]:
def train_w2v3(sentences=None,
               num_features=300,
               min_word_count=40,
               num_workers=4,
               context=10,
               downsampling=1e-5,
               seed=None):
    """Train word2vec model 3 (downsampling threshold 1e-5 instead of 1e-3).

    Called without arguments this behaves exactly as before: it trains on the
    notebook-level ``train_x_w2v`` with ``SEED`` and the defaults below. Every
    setting can now be overridden per call instead of editing the function.

    Args:
        sentences: Iterable of tokenised sentences. Defaults to the
            notebook-level ``train_x_w2v``.
        num_features: Word vector dimensionality (gensim ``size``).
        min_word_count: Minimum word count (gensim ``min_count``).
        num_workers: Number of threads to run in parallel (``workers``).
        context: Context window size (``window``).
        downsampling: Downsample setting for frequent words (``sample``).
        seed: Random seed. Defaults to the notebook-level ``SEED``.

    Returns:
        The ``word2vec.Word2Vec`` instance built from ``sentences``.
    """
    # Resolve notebook globals at call time so a no-argument call keeps
    # picking up whatever the notebook currently defines.
    if sentences is None:
        sentences = train_x_w2v
    if seed is None:
        seed = SEED

    # NOTE(review): gensim documents that a fixed seed is only fully
    # repeatable with a single worker — confirm if that matters here.
    return word2vec.Word2Vec(sentences,
                             workers=num_workers,
                             size=num_features,
                             min_count=min_word_count,
                             window=context,
                             sample=downsampling,
                             seed=seed)

In [15]:
# Train model 3 with its default settings. There is no cache guard here, so
# this cell retrains from scratch every time it is executed.
model_w2v3 = train_w2v3()

In [16]:
# Pickle model 3 to <model_root>/model_w2v3.pkl.
model_path = os.path.join(model_root, 'model_w2v3.pkl')
model_is_cached = isfile(model_path)

# Write-once: an existing pickle is never overwritten.
# NOTE(review): a file left by an earlier run with different settings will
# therefore stay on disk — delete it by hand to refresh.
if not model_is_cached:
    with open(model_path, 'wb') as f:
        pickle.dump(model_w2v3, f)

In [17]:
# Number of words in model 3's vocabulary.
print(len(model_w2v3.wv.index2word))

16490


In [18]:
# Nearest neighbours of "awful" in model 3. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v3.wv.most_similar("awful")

[(u'terrible', 0.9600377082824707),
 (u'alright', 0.9567333459854126),
 (u'horrible', 0.9562368392944336),
 (u'horrid', 0.9512249827384949),
 (u'abysmal', 0.9423476457595825),
 (u'ok', 0.9409952163696289),
 (u'lousy', 0.9353755712509155),
 (u'expected', 0.9288095831871033),
 (u'sucks', 0.9253652095794678),
 (u'worse', 0.9225075244903564)]

In [19]:
# Nearest neighbours of "alright" in model 3. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v3.wv.most_similar("alright")

[(u'sucks', 0.9747377634048462),
 (u'horrible', 0.9698833227157593),
 (u'worse', 0.9693022966384888),
 (u'darn', 0.9692809581756592),
 (u'ok', 0.9674944877624512),
 (u'horrid', 0.9612395763397217),
 (u'stinks', 0.9589479565620422),
 (u'awful', 0.9567333459854126),
 (u'okay', 0.9544602036476135),
 (u'mess', 0.9532283544540405)]

In [20]:
# Nearest neighbours of "man" in model 3. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v3.wv.most_similar("man")

[(u'doctor', 0.9495570063591003),
 (u'patient', 0.9429340362548828),
 (u'himself', 0.9423059821128845),
 (u'priest', 0.94161057472229),
 (u'meets', 0.9388492107391357),
 (u'prostitute', 0.9387885332107544),
 (u'determined', 0.9367220401763916),
 (u'convinces', 0.93588787317276),
 (u'policeman', 0.9348610639572144),
 (u'blames', 0.9339728951454163)]

In [21]:
# Nearest neighbours of "woman" in model 3. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v3.wv.most_similar("woman")

[(u'herself', 0.9797608852386475),
 (u'lonely', 0.9628087878227234),
 (u'mother', 0.9594742059707642),
 (u'abusive', 0.9552136063575745),
 (u'daughter', 0.9513610601425171),
 (u'husband', 0.9502469301223755),
 (u'abused', 0.9495465755462646),
 (u'finds', 0.947529137134552),
 (u'blind', 0.9458128213882446),
 (u'father', 0.9452913999557495)]

# Train and explore word2vec model 4 (100 dimensions, window 5, negative sampling 1)

In [31]:
def train_w2v4(sentences=None,
               num_features=100,
               min_word_count=40,
               num_workers=4,
               context=5,
               downsampling=1e-3,
               negative_sampling=1,
               seed=None):
    """Train word2vec model 4 (100 dimensions, window 5, ``negative=1``).

    Called without arguments this behaves exactly as before: it trains on the
    notebook-level ``train_x_w2v`` with ``SEED`` and the defaults below. Every
    setting can now be overridden per call instead of editing the function.

    Args:
        sentences: Iterable of tokenised sentences. Defaults to the
            notebook-level ``train_x_w2v``.
        num_features: Word vector dimensionality (gensim ``size``).
        min_word_count: Minimum word count (gensim ``min_count``).
        num_workers: Number of threads to run in parallel (``workers``).
        context: Context window size (``window``).
        downsampling: Downsample setting for frequent words (``sample``).
        negative_sampling: Value passed as gensim's ``negative`` argument.
        seed: Random seed. Defaults to the notebook-level ``SEED``.

    Returns:
        The ``word2vec.Word2Vec`` instance built from ``sentences``.
    """
    # Resolve notebook globals at call time so a no-argument call keeps
    # picking up whatever the notebook currently defines.
    if sentences is None:
        sentences = train_x_w2v
    if seed is None:
        seed = SEED

    # NOTE(review): gensim documents that a fixed seed is only fully
    # repeatable with a single worker — confirm if that matters here.
    return word2vec.Word2Vec(sentences,
                             negative=negative_sampling,
                             workers=num_workers,
                             size=num_features,
                             min_count=min_word_count,
                             window=context,
                             sample=downsampling,
                             seed=seed)

In [32]:
# Train model 4 with its default settings. There is no cache guard here, so
# this cell retrains from scratch every time it is executed.
model_w2v4 = train_w2v4()

In [34]:
# Pickle model 4 to <model_root>/model_w2v4.pkl.
model_path = os.path.join(model_root, 'model_w2v4.pkl')
model_is_cached = isfile(model_path)

# Write-once: an existing pickle is never overwritten.
# NOTE(review): a file left by an earlier run with different settings will
# therefore stay on disk — delete it by hand to refresh.
if not model_is_cached:
    with open(model_path, 'wb') as f:
        pickle.dump(model_w2v4, f)

In [38]:
# Number of words in model 4's vocabulary.
print(len(model_w2v4.wv.index2word))

16490


In [35]:
# Nearest neighbours of "awful" in model 4. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v4.wv.most_similar("awful")

[(u'horrible', 0.815282940864563),
 (u'terrible', 0.8080891966819763),
 (u'dreadful', 0.7506495714187622),
 (u'atrocious', 0.7299242615699768),
 (u'abysmal', 0.7106142044067383),
 (u'horrendous', 0.6966500282287598),
 (u'appalling', 0.6843113899230957),
 (u'awesome', 0.674087643623352),
 (u'horrid', 0.6640075445175171),
 (u'laughable', 0.6600688695907593)]

In [37]:
# Nearest neighbours of "man" in model 4. Queried through `.wv` because
# `most_similar` on the model object itself is deprecated in gensim.
model_w2v4.wv.most_similar("man")

[(u'woman', 0.6991299390792847),
 (u'person', 0.6904463768005371),
 (u'guy', 0.6899217367172241),
 (u'boy', 0.6781793236732483),
 (u'men', 0.5987793207168579),
 (u'girl', 0.5699436664581299),
 (u'lady', 0.5672974586486816),
 (u'lad', 0.5545995235443115),
 (u'soldier', 0.5438764691352844),
 (u'sailor', 0.536298394203186)]