## Perform on subset of movie reviews (to compare to abstracts). With cross-validation

In [2]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
import pickle
import os
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
import multiprocessing
from copy import deepcopy
import datetime
from sklearn.metrics import roc_auc_score

In [3]:
import sys
sys.path.append('/home/sarahwie/Documents/pubmed-nlp-research/DeepLearningMovies_datasets/')
from KaggleWord2VecUtility import KaggleWord2VecUtility
DATADIR='/home/sarahwie/Documents/pubmed-nlp-research/DeepLearningMovies_datasets/'

In [4]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
# Read data from files
train = pd.read_csv( os.path.join(DATADIR, 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( os.path.join(DATADIR, "unlabeledTrainData.tsv"), header=0,  delimiter="\t", quoting=3 )

### Get training and testing sets using 5-fold stratified cross-validation

In [14]:
#subset dataset to about the same size as the journal sample we were using
df = train.sample(422)

In [10]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""

import pandas as pd # for quick summing within doc

def docprob(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

In [15]:
a = datetime.datetime.now().replace(microsecond=0)

#5-fold stratified cross validation
from sklearn.cross_validation import StratifiedKFold
import numpy as np

#because no validation set, 4/5 of values go to train and 1/5 to test
#is this too high?***
#even though we shuffle, not as randomly distributed as the former method was
y = df['sentiment'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

#get all unlabelled sentences (doesn't change through each loop of CV)
sentences_unlabelled = []  # Initialize an empty list of sentences
print "Parsing sentences from unlabeled set"
for review in unlabeled_train["review"]:
    sentences_unlabelled += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

i = 1
avg = []
for train_index, test_index in skf:
    print("ROUND", i)
    i = i + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]
    
    # ****** Split the training set into clean sentences
    #
    sentences_pos = []  # Initialize an empty list of sentences
    sentences_neg = []  # Initialize an empty list of sentences

    #here change to include all journal name labels of positive and negative
    inxs_pos = np.where(train1['sentiment'] == 1)[0].tolist()
    inxs_neg = np.where(train1['sentiment'] == 0)[0].tolist()

    print "Parsing sentences from training set"
    for inx in inxs_pos:
        review = train1["review"].iloc[inx]
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    for inx in inxs_neg:
        review = train1["review"].iloc[inx]
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer) 
        
    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print "Parsing sentences from training set"
    for review in train1["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print "Parsing sentences from unlabeled set"
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    
    print "Building and training w2v models"
    ## create a w2v learner 
    basemodel = Word2Vec(
        workers=multiprocessing.cpu_count(), # use your cores
        iter=3, # iter = sweeps of SGD through the data; more is better
        hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
        )
    basemodel.build_vocab(sentences) 
    
    #train models
    models = [deepcopy(basemodel) for y in range(2)]
    models[0].train(sentences_neg, total_examples=len(sentences_neg) )
    models[1].train(sentences_pos, total_examples=len(sentences_pos) )
    
    print "Parsing test sentences"
    # read in the test set as a list of a list of words
    docs = []
    for review in test1["review"]:
        docs.append(KaggleWord2VecUtility.review_to_sentences(review, tokenizer))
    
    print "scoring test set"
    # get the probs (note we give docprob our test set plus the models)
    probs = docprob(docs,models).astype(object)
    
    predictions = np.ones((probs.shape[0]))
    
    predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model
    print predictions
    
    score = roc_auc_score(test1["sentiment"], predictions)
    #score = np.size(np.where(predictions == test1["sentiment"]))*1./np.size(predictions)
    print score
    #append to average
    avg.append(score)


print("average of 5 rotations:", sum(avg)/float(len(avg)))
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

Parsing sentences from unlabeled set
('ROUND', 1)
Parsing sentences from training set
Parsing sentences from training set
Parsing sentences from unlabeled set
Building and training w2v models
Parsing test sentences
scoring test set
[ 0.  0.  0.  0.  1.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.
  0.  1.  0.  1.  0.  1.  0.  1.  0.  0.  1.  1.  0.  1.  0.  0.  1.  0.
  1.  0.  0.  0.  0.  1.  1.  0.  1.  0.  1.  0.  1.  1.  1.  0.  1.  1.
  0.  1.  0.  0.  1.  1.  1.  0.  1.  0.  0.  0.  1.  0.  1.  1.  0.  0.
  1.  0.  1.  1.  0.  1.  0.  1.  1.  1.  1.  0.  1.]
0.657982261641
('ROUND', 2)
Parsing sentences from training set
Parsing sentences from training set
Parsing sentences from unlabeled set
Building and training w2v models
Parsing test sentences
scoring test set
[ 0.  1.  1.  0.  1.  1.  1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  1.  0.
  1.  0.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  0.  0.  1.  0.  1.  1.
  0.  1.  1.  1.  0.  1.  1.  0.  1.  0.  1.  0.  1.  0.  1.  1.  