In [2]:
import pickle
import numpy as np
import pandas as pd

In [3]:
test1 = pickle.load(open('test1.p','rb'))

In [4]:
docs = pickle.load(open('docs.p','rb'))

In [5]:
models = pickle.load(open('models.p','rb'))

### Method:

In [24]:
#Define function

def docprobscoresentences(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob["score"] = 1
    inxs = np.where(prob.iloc[:,0] > 0.5)[0]
    prob.loc[inxs, 'score'] = 0
    prob = prob.groupby("doc")["score"].mean()
    return prob

Testing on subset:

In [12]:
probs = docprobscoresentences(docs[0:2],models)

In [13]:
probs

doc
0    0.50
1    0.75
Name: score, dtype: float64

The real deal:

In [26]:
probs = docprobscoresentences(docs, models)

In [27]:
print probs[0:20]

doc
0     0.500000
1     0.750000
2     0.333333
3     0.111111
4     1.000000
5     0.571429
6     0.500000
7     0.600000
8     0.111111
9     1.000000
10    0.727273
11    0.750000
12    0.750000
13    0.714286
14    0.600000
15    0.818182
16    0.250000
17    1.000000
18    0.333333
19    0.458333
Name: score, dtype: float64


In [47]:
preds = np.random.randint(0,2,size=probs.shape[0])
print preds[0:20]

[1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1]


In [45]:
preds[np.where(probs < 0.5)] = 0
preds[np.where(probs > 0.5)] = 1
#if exactly 0.5, generate a 0 or 1 randomly and assign

In [46]:
print preds[0:20]

[1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 0]


In [48]:
print preds.shape
print test1["sentiment"].shape

(12500,)
(12500,)


In [50]:
print np.size(np.where(preds == test1["sentiment"]))*1./np.size(preds)

0.49728


In [51]:
pickle.dump(preds, open('predsscoressentences.p','wb'))