In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


In [2]:
## All three Bag-of-Words files share the same parser settings: tab-separated,
## header on the first row, and quoting=3 (the value of csv.QUOTE_NONE) so that
## quote characters are kept as part of the field text.
unlabeled_train = pd.read_csv(
    '../../Dataset/Bag Of Words/unlabeledTrainData.tsv',
    delimiter='\t', header=0, quoting=3
)
labeled_train = pd.read_csv(
    '../../Dataset/Bag Of Words/labeledTrainData.tsv',
    delimiter='\t', header=0, quoting=3
)
test = pd.read_csv(
    '../../Dataset/Bag Of Words/testData.tsv',
    delimiter='\t', header=0, quoting=3
)

In [3]:
## Preview the first rows of the unlabeled reviews to check the file parsed as expected
unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [31]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
def convert_to_wordlist(review,remove_stopwords = False):
    """Turn one raw review (or sentence) into a list of lowercase words.

    The text is stripped of HTML markup, every character outside a-z/A-Z is
    replaced by a space, the result is lowercased and split on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML tags.
    remove_stopwords : bool, optional
        When True, words present in NLTK's English stopword list are dropped.

    Returns
    -------
    list of str
        The cleaned words, in their original order.
    """
    ## Removing the HTML
    clean_review = BeautifulSoup(review).get_text()
    ## Removing Non Letters
    clean_review = re.sub('[^a-zA-Z]'," ",clean_review)
    ## Converting to lowercase and splitting on whitespace
    words = clean_review.lower().split()
    ## Removing stopwords (if required)
    if remove_stopwords:
        ## Load the stopword list once per call and keep it in a set:
        ## re-reading the corpus and scanning a list for every single word
        ## made this step needlessly slow on long reviews.
        english_stopwords = set(stopwords.words("english"))
        words = [word for word in words if word not in english_stopwords]
    return words

In [5]:
import nltk.data
## Load NLTK's English Punkt sentence tokenizer (stored as a pickle resource);
## it is used below to split a review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def convert_to_sentlist(corpus,remove_stopwords=False):
    """Split ``corpus`` into sentences and clean each one into a word list.

    Parameters
    ----------
    corpus : str
        Text of one review; leading/trailing whitespace is stripped first.
    remove_stopwords : bool, optional
        Forwarded unchanged to ``convert_to_wordlist``.

    Returns
    -------
    list of list of str
        One word list per sentence found by the module-level ``tokenizer``.
    """
    raw_sentences = tokenizer.tokenize(corpus.strip())
    return [
        convert_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
    ]

In [6]:
import warnings
## Silence every warning category for the rest of the session.
## NOTE(review): this blanket filter also hides numeric RuntimeWarnings and
## deprecation notices; consider narrowing it to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [7]:
## Build the Word2Vec training corpus: one word list per sentence, collected
## across every unlabeled review.
sentences = []
print('Parsing and Converting reviews to required format..')
for i, review in enumerate(unlabeled_train["review"]):
    ## Only byte strings need decoding; text that is already decoded is passed
    ## through untouched instead of failing on a second decode.
    if isinstance(review, bytes):
        review = review.decode('utf-8')
    sentences.extend(convert_to_sentlist(review))
    if i % 1000 == 0:
        print('{} reviews parsed...'.format(i))

Parsing and Converting reviews to required format..
0 reviews parsed...
1000 reviews parsed...
2000 reviews parsed...
3000 reviews parsed...
4000 reviews parsed...
5000 reviews parsed...
6000 reviews parsed...
7000 reviews parsed...
8000 reviews parsed...
9000 reviews parsed...
10000 reviews parsed...
11000 reviews parsed...
12000 reviews parsed...
13000 reviews parsed...
14000 reviews parsed...
15000 reviews parsed...
16000 reviews parsed...
17000 reviews parsed...
18000 reviews parsed...
19000 reviews parsed...
20000 reviews parsed...
21000 reviews parsed...
22000 reviews parsed...
23000 reviews parsed...
24000 reviews parsed...
25000 reviews parsed...
26000 reviews parsed...
27000 reviews parsed...
28000 reviews parsed...
29000 reviews parsed...
30000 reviews parsed...
31000 reviews parsed...
32000 reviews parsed...
33000 reviews parsed...
34000 reviews parsed...
35000 reviews parsed...
36000 reviews parsed...
37000 reviews parsed...
38000 reviews parsed...
39000 reviews parsed...
4

In [8]:
import logging
## Emit log records at INFO level and above, formatted as "time : level : message".
## Presumably enabled so the training progress logged by gensim is visible -- confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

In [9]:
## Hyper-parameters handed to Word2Vec when the model is trained
num_features = 300      ## passed as `size`: length of every word vector
min_word_count = 40     ## passed as `min_count`: minimum word frequency
num_workers = 4         ## passed as `workers`: number of training threads
context = 10            ## passed as `window`: context window size
downsampling = 1e-3     ## passed as `sample`: downsampling setting for frequent words

In [10]:
from gensim.models import word2vec

In [12]:
## Fit the Word2Vec model on the parsed sentences using the settings defined above
print('Training Model....')
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

Training Model....


In [15]:
## Persist the trained model under the relative file name "Word2Vec" so it can be
## reloaded later without retraining.
model_name = "Word2Vec"
model.save(model_name)

In [18]:
## Sanity check: list the vocabulary words closest to "man" as (word, similarity) pairs
model.most_similar("man")

[(u'woman', 0.6530873775482178),
 (u'lady', 0.6034382581710815),
 (u'lad', 0.5575274229049683),
 (u'person', 0.5470916628837585),
 (u'men', 0.5238733291625977),
 (u'guy', 0.5228787064552307),
 (u'doctor', 0.5218369960784912),
 (u'boy', 0.5167871713638306),
 (u'chap', 0.5134159326553345),
 (u'soldier', 0.5076248645782471)]

In [21]:
## Sanity check: list the vocabulary words closest to "queen" as (word, similarity) pairs
model.most_similar("queen")

[(u'princess', 0.6886675357818604),
 (u'belle', 0.6561340093612671),
 (u'bride', 0.6472404599189758),
 (u'maid', 0.6402115225791931),
 (u'rose', 0.6379253268241882),
 (u'eva', 0.6294335126876831),
 (u'catherine', 0.6195963621139526),
 (u'goddess', 0.6193024516105652),
 (u'regina', 0.6125284433364868),
 (u'maria', 0.6090396642684937)]

In [25]:
## syn0 is the matrix of learned word vectors: one row for every word in the vocabulary
print(type(model.syn0))
print(model.syn0.shape)

<type 'numpy.ndarray'>
(13056, 300)


In [27]:
## Look up the vector learned for the word "man" and show its dimensions
print(model["man"].shape)

(300,)


In [34]:
def makeFeatureVec(model,words,num_features=300):
    """Average the vectors of the words in ``words`` that the model knows.

    Parameters
    ----------
    model : object
        Must expose ``index2word`` (iterable of vocabulary words) and support
        ``model[word]`` lookup returning a vector of length ``num_features``.
    words : iterable of str
        Words of one document; words missing from the vocabulary are skipped.
    num_features : int, optional
        Length of the word vectors, and of the returned vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_features,)`` holding the mean word vector. When
        none of the words is in the vocabulary the zero vector is returned,
        rather than the NaN vector a division by zero would produce.
    """
    index2word_set = set(model.index2word)
    featureVec = np.zeros((num_features,))
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            featureVec = np.add(featureVec,model[word])
    ## Nothing to average: keep the all-zero vector so downstream estimators
    ## never receive NaN features.
    if n_words == 0:
        return featureVec
    return np.divide(featureVec,n_words)

In [36]:
## Smoke test: featurize the first unlabeled review (stopwords removed) and
## check the shape of the resulting vector.
words = convert_to_wordlist(unlabeled_train['review'][0],remove_stopwords=True)
makeFeatureVec(model,words).shape

(300,)

In [40]:
## Turn every labeled review into one fixed-length feature row by averaging
## the word vectors of its words (stopwords removed).
training_data = np.zeros((len(labeled_train["review"]),num_features),dtype="float32")
for row, review in enumerate(labeled_train["review"]):
    words = convert_to_wordlist(review,remove_stopwords=True)
    ## Pass num_features explicitly: the array above is sized by it, so relying
    ## on makeFeatureVec's hard-coded default would break as soon as the
    ## configured vector size is changed.
    training_data[row] = makeFeatureVec(model,words,num_features)
    parsed = row + 1
    if parsed%1000 == 0:
        print('{} reviews parsed....'.format(parsed))

1000 reviews parsed....
2000 reviews parsed....
3000 reviews parsed....
4000 reviews parsed....
5000 reviews parsed....
6000 reviews parsed....
7000 reviews parsed....
8000 reviews parsed....
9000 reviews parsed....
10000 reviews parsed....
11000 reviews parsed....
12000 reviews parsed....
13000 reviews parsed....
14000 reviews parsed....
15000 reviews parsed....
16000 reviews parsed....
17000 reviews parsed....
18000 reviews parsed....
19000 reviews parsed....
20000 reviews parsed....
21000 reviews parsed....
22000 reviews parsed....
23000 reviews parsed....
24000 reviews parsed....
25000 reviews parsed....


In [42]:
## Cache the averaged-vector feature matrix on disk as comma-separated values
pd.DataFrame(training_data).to_csv('trainingdata.csv',sep=',')

In [43]:
## Applying Random Forest Classifier on the dataset
from sklearn.ensemble import RandomForestClassifier
## 100-tree forest; random_state is fixed so that the cross-validation score
## below is reproducible from run to run.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 0)

In [47]:
## cross_val_score lives in sklearn.model_selection in current scikit-learn;
## the sklearn.cross_validation module is deprecated and later removed, so it
## is kept only as a fallback for old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
## 5-fold cross-validation of the forest on the averaged-vector features
score = cross_val_score(rfc,training_data,labeled_train["sentiment"],cv=5)

In [50]:
## Report the mean of the five fold scores as a percentage
print('5 fold CV score: {}'.format(score.mean()*100))

5 fold CV score: 82.304


### Using KMeans Clustering to find clusters of words

In [51]:
from sklearn.cluster import KMeans

In [52]:
## The clusters group *words*, so size them from the vocabulary (rows of
## model.syn0), not from the number of training reviews: one cluster per five
## words on average. Floor division keeps the count an integer.
num_clusters = model.syn0.shape[0] // 5

In [54]:
## Cluster the word vectors; random_state is fixed so the cluster assignment
## of each word is reproducible from run to run.
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
## clusters[k] is the cluster id assigned to row k of model.syn0
clusters = kmeans.fit_predict(model.syn0)

In [56]:
## Save the cluster id of every word-vector row to clusters.csv.
## NOTE(review): the Series is indexed by row position only, so the file does not
## contain the words themselves -- consider indexing by model.index2word.
pd.Series(clusters).to_csv('clusters.csv')