In [4]:
import pickle
import os
import sys
import datetime
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
import multiprocessing
from copy import deepcopy
from sklearn.metrics import roc_auc_score    
from sklearn.cross_validation import StratifiedKFold

In [5]:
# Seed NumPy's global random number generator so NumPy-based randomness
# repeats from run to run.
np.random.seed(10)

In [7]:
import os.path
# Add ~/code/eol_hsrl_python to the module search path.
sys.path.append(os.path.expanduser('~/code/eol_hsrl_python'))
os.environ['PYTHONHASHSEED']='10'
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so assigning it here should only reach child processes started afterwards,
# not the hashing of this already-running kernel -- confirm, and export the
# variable before launching Jupyter if hash determinism is required.

In [8]:
# Load NLTK's English punkt tokenizer from its pickled resource. The
# resulting global `tokenizer` is passed to
# KaggleWord2VecUtility.review_to_sentences() by the cells below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

### Read in the data files

In [9]:
# Both files are tab-separated with a header row; quoting=3 (csv.QUOTE_NONE)
# makes pandas keep the double quotes found in the text as ordinary characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
df = pd.read_csv('labeledTrainData.tsv', **tsv_options)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", **tsv_options)

In [10]:
# Show the first two rows of the labeled data.
df.head(2)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."


In [11]:
# Full text of the review stored under index label 0.
df.loc[0, 'review']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [12]:
# Row count of the frame, and number of entries in the review column.
print(len(df))
print(df["review"].size)

25000
25000


In [13]:
# Relabel the three columns (shown above as id, sentiment, review) with the
# names the modelling cells below expect.
# NOTE: once this has run, df['review'] no longer exists, so the earlier cells
# that read that column can only be re-run after reloading the data.
df.columns = ['id', 'boolean_label', 'abstract']

In [14]:
# Show the first two rows again to confirm the new column names.
df.head(2)

Unnamed: 0,id,boolean_label,abstract
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."


### Check label distribution -- the two classes are evenly balanced (12,500 positive, 12,500 negative)

In [15]:
#we are concerned about rating
print len(df.loc[df['boolean_label'] == 1])
print len(df.loc[df['boolean_label'] == 0])

12500
12500


### Apply Word2Vec Inversion

In [16]:
def docprob(docs, mods):
    """Return per-document class probabilities from per-class word2vec models.

    Everything is done in memory.

    Parameters
    ----------
    docs : list
        A list of documents, each of which is a list of sentences (each
        sentence in whatever form ``model.score`` accepts).
    mods : list
        The candidate word2vec models, one per potential class. Each must
        provide ``score(sentences, total_sentences)`` returning one
        log likelihood per sentence.

    Returns
    -------
    pandas.DataFrame
        One row per document, in the order of ``docs`` (index 0..len(docs)-1),
        and one column per model, in the order of ``mods``. Each row is the
        mean over the document's sentences of the per-sentence class
        probabilities. A document without any sentence carries no evidence
        and gets the uniform probability ``1 / len(mods)`` in every column,
        so the result always lines up with ``docs``.

    Raises
    ------
    ValueError
        If ``mods`` is empty.
    """
    if not mods:
        raise ValueError("docprob needs at least one model")

    n_docs = len(docs)
    uniform = 1.0 / len(mods)

    # score() takes a flat list of sentences here; could also be a generator
    sentlist = [s for d in docs for s in d]
    if not sentlist:
        # Nothing to score at all: every document gets the uniform row.
        return pd.DataFrame(uniform,
                            index=pd.Index(range(n_docs), name="doc"),
                            columns=range(len(mods)))

    # log likelihood of each sentence under each model: shape (models, sentences)
    llhd = np.array([m.score(sentlist, len(sentlist)) for m in mods])
    # Exponentiate to get likelihoods. Subtracting each sentence's maximum
    # over the models (axis=0) first keeps exp() from under/overflowing and
    # cancels out in the normalisation below.
    lhd = np.exp(llhd - llhd.max(axis=0))
    # Normalise across models so that each sentence's class probabilities sum
    # to 1, then transpose to one row per sentence / one column per model.
    prob = pd.DataFrame((lhd / lhd.sum(axis=0)).transpose())
    # Tag every sentence with the position of the document it came from and
    # average the sentence probabilities within each document.
    prob["doc"] = [i for i, d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    # groupby() only yields documents that had at least one sentence; put the
    # missing ones back so that row i always describes docs[i].
    return prob.reindex(range(n_docs), fill_value=uniform)

In [17]:
def buildModels(train1, test1, docs, basemodel):
    """Train one word2vec model per class and score the held-out documents.

    Relies on the notebook-level names ``tokenizer``, ``docprob``,
    ``KaggleWord2VecUtility``, ``deepcopy`` and ``roc_auc_score``.

    Parameters
    ----------
    train1 : pandas.DataFrame
        Training rows with a ``boolean_label`` column (1 = positive,
        0 = negative) and an ``abstract`` column holding the raw text.
    test1 : pandas.DataFrame
        Held-out rows; only ``boolean_label`` is read, as the ground truth.
    docs : list
        The held-out documents, already split into sentences, in the same
        order as the rows of ``test1``.
    basemodel : Word2Vec
        A model whose vocabulary has already been built. It is deep-copied
        once per class, so the object passed in is not trained.

    Returns
    -------
    float
        ROC AUC of the positive-class probability against
        ``test1["boolean_label"]``.
    """
    labels = train1['boolean_label'].values
    abstracts = train1['abstract']

    def _sentences_for(label):
        # All training sentences whose review carries the given label.
        collected = []
        for review in abstracts[labels == label]:
            collected += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
        return collected

    sentences_neg = _sentences_for(0)
    sentences_pos = _sentences_for(1)

    # models[0] learns the negative reviews, models[1] the positive ones, so
    # the columns returned by docprob() are (negative, positive).
    models = [deepcopy(basemodel) for _ in range(2)]
    models[0].train(sentences_neg, total_examples=len(sentences_neg))
    models[1].train(sentences_pos, total_examples=len(sentences_pos))

    probs = docprob(docs, models)
    # Keep one row per test document even if a document yielded no sentences;
    # such a document carries no evidence, so it gets the neutral 0.5.
    probs = probs.reindex(range(len(docs))).fillna(0.5)

    # ROC AUC measures how well the scores rank positives above negatives, so
    # it must be given the continuous positive-class probability. Thresholding
    # to 0/1 first throws the ranking away and understates the score.
    positive_scores = probs.iloc[:, 1].values.astype(float)

    return roc_auc_score(test1["boolean_label"], positive_scores)

In [18]:
# NOTE: this cell uses KaggleWord2VecUtility, which is defined in a cell
# further down the notebook; that cell has to be run first, otherwise this one
# stops with "NameError: name 'KaggleWord2VecUtility' is not defined".

# Number of cross-validation folds to actually evaluate. Each fold trains
# several word2vec models, so only the first one is run by default; set this
# to 5 to get a real cross-validated average.
MAX_FOLDS = 1

start_time = datetime.datetime.now().replace(microsecond=0)

y = df['boolean_label'].values

# Stratified 5-fold split with a fixed random_state so the folds are repeatable.
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

# The unlabeled reviews are the same for every fold, so parse them only once.
print("Parsing sentences from unlabeled set")
unlabeled_sentences = []
for review in unlabeled_train["review"]:
    unlabeled_sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

avg = []
for fold, (train_index, test_index) in enumerate(skf, 1):
    if fold > MAX_FOLDS:
        break

    print("ROUND %d" % fold)
    # use the fold's positional indexes to subset the dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    # Vocabulary corpus: this fold's training sentences followed by the
    # unlabeled ones.
    sentences = []
    for review in train1['abstract']:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    sentences += unlabeled_sentences

    print("Building and training w2v models")
    basemodel = Word2Vec(
        sentences=None,
        size=100,  # default
        window=5,  # default
        workers=1,  # single worker thread for a fully deterministically-reproducible run
        iter=1,  # iter = sweeps of SGD through the data; more is better
        hs=1, negative=0  # we only have scoring for the hierarchical softmax setup
    )
    basemodel.build_vocab(sentences)

    print("Parsing test sentences")
    # the test set as a list of documents, each a list of sentences (word lists)
    docs = [
        KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
        for review in test1["abstract"]
    ]

    print("scoring test set")
    avg.append(buildModels(train1, test1, docs, basemodel))

print("average over %d fold(s): %s" % (len(avg), sum(avg) / float(len(avg))))

end_time = datetime.datetime.now().replace(microsecond=0)
print(end_time - start_time)

<type 'numpy.ndarray'>
('ROUND', 1)


NameError: name 'KaggleWord2VecUtility' is not defined

In [24]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """Utility class for processing raw HTML text into segments for further learning.

    All helpers are static methods, so the class is used as a namespace and
    never needs to be instantiated.
    """

    # Anything that is not an ASCII letter is replaced by a space.
    _NON_LETTERS = re.compile("[^a-zA-Z]")
    # English stop-word set, loaded on first use instead of on every call.
    _stops = None

    @staticmethod
    def _english_stops():
        # Build the stop-word set once and reuse it afterwards.
        if KaggleWord2VecUtility._stops is None:
            KaggleWord2VecUtility._stops = set(stopwords.words("english"))
        return KaggleWord2VecUtility._stops

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        """Convert a document to a list of lower-case words.

        review           -- raw text, possibly containing HTML markup
        remove_stopwords -- drop English stop words when true (default: False)

        Returns a list of words.
        """
        # 1. Remove HTML. The parser is named explicitly so the result does
        #    not depend on which optional parsers happen to be installed.
        review_text = BeautifulSoup(review, "html.parser").get_text()
        #
        # 2. Remove non-letters
        review_text = KaggleWord2VecUtility._NON_LETTERS.sub(" ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = KaggleWord2VecUtility._english_stops()
            words = [w for w in words if w not in stops]
        #
        # 5. Return a list of words
        return words

    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        """Convert a raw review to a single string of space-separated words.

        Same cleaning as review_to_wordlist(); the words are joined back into
        one string.
        """
        return " ".join(
            KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords))

    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        """Split a review into parsed sentences.

        review           -- raw text, either UTF-8 encoded bytes or unicode text
        tokenizer        -- object whose tokenize(text) returns the sentences
        remove_stopwords -- forwarded to review_to_wordlist()

        Returns a list of sentences, where each sentence is a list of words.
        """
        # Only byte strings need decoding; text that is already unicode is
        # used as it is (calling .decode() on it would fail).
        if isinstance(review, bytes):
            review = review.decode('utf8')
        #
        # 1. Use the tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.strip())
        #
        # 2. Turn every non-empty sentence into its list of words
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(KaggleWord2VecUtility.review_to_wordlist(
                    raw_sentence, remove_stopwords))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists)
        return sentences