In [13]:
import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np  # Make sure that numpy is imported
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier

In [14]:
import sys
sys.path.append('/home/sarahwie/Documents/DeepLearningMovies_datasets/')
from KaggleWord2VecUtility import KaggleWord2VecUtility
DATADIR='/home/sarahwie/Documents/DeepLearningMovies_datasets/'

### This function replaces all the Kaggle word parsing/cleaning (?) :

In [17]:
# Read data from files
train = pd.read_csv( os.path.join(DATADIR, 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( os.path.join(DATADIR, "unlabeledTrainData.tsv"), header=0,  delimiter="\t", quoting=3 )

In [18]:
# Verify the number of reviews that were read (100,000 in total)
print "Read %d labeled train reviews " \
     "and %d unlabeled reviews\n" % (train["review"].size, unlabeled_train["review"].size )

Read 25000 labeled train reviews and 50000 unlabeled reviews



In [19]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [20]:
train.ix[0,]

id                                                    "5814_8"
sentiment                                                    1
review       "With all this stuff going down at the moment ...
Name: 0, dtype: object

### Don't know where this came from. Gets an even split train/test set from our labelled training set

OH because Kaggle's test set has no labels so we need our own labelled test set.

In [21]:
fracTrain = 0.5
nSamples = train.shape[0]
order = np.random.permutation(nSamples) # come up with a random ordering
splitIndex = int(np.round(nSamples*fracTrain))
train1 = train.ix[order[:splitIndex],:]
test1 = train.ix[order[splitIndex:],:]

In [22]:
print train1.shape
print test1.shape

(12500, 3)
(12500, 3)


# Word2Vec inversion

Here is some stuff to split the labelled training set into positive and negative reviews. NOTE: takes a long time to run.

Gets a list of positive review indexes and negative review indexes. Takes those respective reviews and parses them into sentences and appends to a list of positive and negative sentences, respectively. Also does the same for the unlabelled training sentences.

In [23]:
# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences_pos = []  # Initialize an empty list of sentences
sentences_neg = []  # Initialize an empty list of sentences
sentences_unlabelled = []  # Initialize an empty list of sentences

inxs_pos = np.where(train1["sentiment"] == 1)[0].tolist()
inxs_neg = np.where(train1["sentiment"] == 0)[0].tolist()

print "Parsing sentences from training set"
for inx in inxs_pos:
    review = train1["review"].iloc[inx]
    sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

for inx in inxs_neg:
    review = train1["review"].iloc[inx]
    sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)    
    
print "Parsing sentences from unlabeled set"
for review in unlabeled_train["review"]:
    sentences_unlabelled += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

Parsing sentences from training set
Parsing sentences from unlabeled set


Now do the same for all sentences without splitting to build a vocabulary:

In [24]:
# ****** Split the labeled and unlabeled training sets into clean sentences
#
sentences = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for review in train1["review"]:
    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

print "Parsing sentences from unlabeled set"
for review in unlabeled_train["review"]:
    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

Parsing sentences from training set
Parsing sentences from unlabeled set


In [26]:
print sentences_pos[0:1]

[[u'the', u'man', u'in', u'the', u'white', u'suit', u'is', u'one', u'of', u'those', u'delightful', u'comedies', u'that', u'ealing', u'studies', u'made', u'so', u'well', u'in', u'the', u's', u'and', u's']]


Some stuff from Taddy's code (much simpler than Kaggle method). 

In [27]:
from gensim.models import Word2Vec
import multiprocessing

## create a w2v learner 
basemodel = Word2Vec(
    workers=multiprocessing.cpu_count(), # use your cores
    iter=3, # iter = sweeps of SGD through the data; more is better
    hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
    )
print basemodel

Word2Vec(vocab=0, size=100, alpha=0.025)


In [28]:
basemodel.build_vocab(sentences) 

Train 2 models off of the base model (one pos & one neg).

In [32]:
from copy import deepcopy
models = [deepcopy(basemodel) for i in range(2)]
models[0].train(sentences_pos, total_examples=len(sentences_pos) )

3296704

In [33]:
models[1].train(sentences_neg, total_examples=len(sentences_neg) )

3272151

In [34]:
print models

[<gensim.models.word2vec.Word2Vec object at 0x7f73dcd504d0>, <gensim.models.word2vec.Word2Vec object at 0x7f73dc77dad0>]


### Inversion of the distributed representations

At this point, we have 5 different word2vec language representations. Each 'model' has been trained conditional (i.e., limited to) text from a specific star rating. We will apply Bayes rule to go from p(text|stars) to p(stars|text).
For any new sentence we can obtain its likelihood (lhd; actually, the composite likelihood approximation; see the paper) using the score function in the word2vec class. We get the likelihood for each sentence in the first test review, then convert to a probability over star ratings. Every sentence in the review is evaluated separately and the final star rating of the review is an average vote of all the sentences. This is all in the following handy wrapper.

In [24]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.

***HERE is where I need to modify how the class probabilities are calculated. 
"""

import pandas as pd # for quick summing within doc

def docprob(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    #***** so HERE is where I need to add the thing.
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob


def docprobmax(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    #***** so HERE is where I need to add the thing.
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").max()
    return prob

def docprobmedian(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    #***** so HERE is where I need to add the thing.
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").median()
    return prob

def docprobmin(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    #***** so HERE is where I need to add the thing.
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").min()
    return prob

#Doesn't work
def docprobmode(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    #***** so HERE is where I need to add the thing.
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = pd.mode(prob.groupby("doc"))
    return prob

In [71]:
probs = docprob(sentences_pos[0:1],models)

In [72]:
probs

Unnamed: 0_level_0,0,1
doc,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.448681,0.551319


In [67]:
#first is pos, second is neg
models

[<gensim.models.word2vec.Word2Vec at 0x7f362f60bf10>,
 <gensim.models.word2vec.Word2Vec at 0x7f36b212c710>]

### Test set example
As an example, we apply the inversion on the full test set.

In [145]:
# read in the test set as a list of a list of words
docs = []
for review in test1["review"]:
    docs.append(KaggleWord2VecUtility.review_to_sentences(review, tokenizer))

In [93]:
print docs[0]

In [26]:
# get the probs (note we give docprob our test set plus the models)
probs = docprob(docs,models)
probsmedian = docprobmedian(docs, models)
#probsmode = docprobmode(docs[0:2], models)

In [121]:
probsmedian

Unnamed: 0_level_0,0,1
doc,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.423639,0.576361
1,0.979323,0.020677


In [122]:
probs

Unnamed: 0_level_0,0,1
doc,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.471955,0.528045
1,0.748437,0.251563


### Note: this code below needs to be modified in order to plot.*

In [None]:
%matplotlib inline

probpos = pd.DataFrame({"out-of-sample prob positive":probs[[3,4]].sum(axis=1), 
                        "true stars":[r['y'] for r in revtest]})
probpos.boxplot("out-of-sample prob positive",by="true stars", figsize=(12,5))

A numpy array of ones for the number of docs that we're predicting:

In [27]:
predictions = np.ones((probs.shape[0]))
predsmedian = np.ones((probsmedian.shape[0]))

In [29]:
print np.shape(predictions)
print np.shape(predsmedian)

(12500,)
(12500,)


 Have to change this line as well for our min/max: must get largest

In [40]:
predictions[np.where(probs.iloc[:,1] > 0.5)] = 0 # The second column is actually the negative model

In [41]:
predsmedian[np.where(probsmedian.iloc[:,1] > 0.5)] = 0

In [34]:
predictions

array([ 0.,  0.,  0., ...,  1.,  0.,  1.])

In [35]:
predsmedian

array([ 0.,  0.,  0., ...,  1.,  0.,  1.])

In [36]:
print predictions.shape
print predsmedian.shape
print test1["sentiment"].shape

(12500,)
(12500,)
(12500,)


In [43]:
print np.size(np.where(predictions == test1["sentiment"]))*1./np.size(predictions)
print np.size(np.where(predsmedian == test1["sentiment"]))*1./np.size(predsmedian)

0.86968
0.8532


Now repeat measuring performance for min and max methods:

In [52]:
probsmax = docprobmax(docs, models)
probsmin = docprobmin(docs, models)

In [53]:
predsmin = np.ones((probsmin.shape[0]))
predsmax = np.ones((probsmax.shape[0]))

In [54]:
#want the largest number for both
predsmin[np.where(probsmin.iloc[:,1] > probsmin.iloc[:,0])] = 0
predsmax[np.where(probsmax.iloc[:,1] > probsmax.iloc[:,0])] = 0

In [56]:
print predsmax.shape
print predsmin.shape
print test1["sentiment"].shape

(12500,)
(12500,)
(12500,)


Note: identical. Mean method still tops.

In [57]:
print np.size(np.where(predsmin == test1["sentiment"]))*1./np.size(predsmin)
print np.size(np.where(predsmax == test1["sentiment"]))*1./np.size(predsmax)

0.85336
0.85336


### MaxDiff/MinDiff method:

In [211]:
#Define function

def docprobmaxdiff(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    #add a diff column
    prob["diff"] = abs(prob[0]-prob[1])
    #now we get the rows with the max diff for each doc
    #prob = prob.groupby("doc").apply(lambda x: x[x["diff"] == x["diff"].idxmax()])
    prob = prob.iloc[prob.groupby(["doc"])["diff"].idxmax()]
    return prob

Testing on subset:

In [210]:
probs = docprobmaxdiff(docs[0:2],models)

           0         1  doc      diff
0   0.597940  0.402060    0  0.195880
1   0.174792  0.825208    0  0.650415
2   0.111549  0.888451    0  0.776901
3   0.090950  0.909050    0  0.818099
4   0.004859  0.995141    0  0.990282
5   0.158994  0.841006    0  0.682012
6   0.594746  0.405254    0  0.189492
7   0.451735  0.548265    0  0.096531
8   0.263326  0.736674    0  0.473349
9   0.013569  0.986431    0  0.972862
10  0.109457  0.890543    0  0.781086
11  0.999513  0.000487    0  0.999027
12  0.002629  0.997371    1  0.994742
13  0.004280  0.995720    1  0.991440
14  0.246624  0.753376    1  0.506752
15  0.257214  0.742786    1  0.485573
16  0.136431  0.863569    1  0.727138
17  0.193509  0.806491    1  0.612982
18  0.000010  0.999990    1  0.999980
           0         1  doc      diff
11  0.999513  0.000487    0  0.999027
18  0.000010  0.999990    1  0.999980


In [208]:
probs = docprobmaxdiff(docs,models)

12500


In [174]:
print probs

               0         1  doc      diff
doc                                      
0   11  0.999513  0.000487    0  0.999027
1   18  0.000010  0.999990    1  0.999980


In [132]:
preds = np.ones((probs.shape[0]))

In [135]:
preds[np.where(probs.iloc[:,1] > probs.iloc[:,0])] = 0

In [138]:
print preds

[ 1.  0.  0.]


The real deal:

In [212]:
probsmaxdiff = docprobmaxdiff(docs, models)

In [213]:
predsmaxdiff = np.ones((probsmaxdiff.shape[0]))

In [214]:
#Need to change this line
predsmaxdiff[np.where(probsmaxdiff.iloc[:,1] > probsmaxdiff.iloc[:,0])] = 0

In [215]:
print predsmaxdiff.shape
print test1["sentiment"].shape

(12500,)
(12500,)


In [216]:
print np.size(np.where(predsmaxdiff == test1["sentiment"]))*1./np.size(predsmaxdiff)

0.85336
