In [40]:
import sys
import datetime
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
from copy import deepcopy
from sklearn.metrics import roc_auc_score    
from sklearn.cross_validation import StratifiedKFold
import random

In [41]:
# Seed BOTH random number generators used by this notebook. np.random.seed only
# affects NumPy's global generator; the sub-model subsampling further down calls
# the standard-library random.sample, which needs its own seed to be repeatable.
np.random.seed(10)
random.seed(10)

In [42]:
import os.path

# Make the local eol_hsrl_python checkout importable from this kernel.
hsrl_checkout = os.path.expanduser('~/code/eol_hsrl_python')
sys.path.append(hsrl_checkout)

# NOTE: Python reads PYTHONHASHSEED once, at interpreter start-up. Assigning it
# here therefore does not change hash randomization in the already-running
# kernel; the value is only inherited by child processes started afterwards.
# For a deterministic run, export it before launching the notebook server.
os.environ['PYTHONHASHSEED'] = '10'

In [43]:
# Load the punkt tokenizer
# `tokenizer` is a notebook-wide global: every call to
# KaggleWord2VecUtility.review_to_sentences below receives it and uses its
# tokenize() method to split a review into sentences.
# NOTE(review): this presumably requires the NLTK "punkt" data package to be
# installed locally - confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

### Read in the data files

In [7]:
# Load the labeled and the unlabeled review sets.
# Both files are tab-separated with a header row; quoting=3 (the value of
# csv.QUOTE_NONE) keeps double quotes inside the text as literal characters.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
df = pd.read_csv('labeledTrainData.tsv', **tsv_options)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", **tsv_options)

In [10]:
# Peek at the first ten labeled rows.
df.head(10)

Unnamed: 0,id,boolean_label,abstract
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


In [9]:
# Rename the labeled frame's columns in place to the names the rest of the
# notebook uses ('boolean_label' for the class, 'abstract' for the text).
# Only `df` is renamed: `unlabeled_train` is still read through its "review"
# column further down.
# NOTE(review): assumes the file has exactly these three columns in this order.
# NOTE(review): the preview cell shown above (In [10]) already displays the
# renamed headers although this cell is In [9] - the cells were executed out of
# their displayed order, so run this one right after loading the data.
df.columns = ['id', 'boolean_label', 'abstract']

In [11]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* mods: the candidate word2vec models (one per potential class)

It returns a DataFrame of per-document class probabilities (one column per model).  Everything is done in-memory.
"""

def docprob(docs, mods):
    """Return the probability of each document under each candidate model.

    Parameters
    ----------
    docs : list
        Documents, each of which is a list of sentences.
    mods : list
        Candidate word2vec models, one per class. Each must provide
        ``score(sentences, total_sentences)`` returning one log-likelihood
        per sentence.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``docs`` (index named "doc", in input order) and
        one column per model, holding the mean over the document's sentences
        of the per-sentence model probabilities. A document without any
        sentence gets a row of NaN, so row k always belongs to ``docs[k]``.

    Everything is done in-memory.
    """
    # score() takes a list of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence under each w2v representation
    # (shape: models x sentences)
    llhd = np.array([m.score(sentlist, len(sentlist)) for m in mods])
    # exponentiate to get likelihoods; subtracting each sentence's maximum
    # across the models first (axis=0) avoids numeric overflow
    lhd = np.exp(llhd - llhd.max(axis=0))
    # normalize across models so that, for every sentence, the probabilities
    # sum to 1; the transpose puts sentences in rows and models in columns
    prob = pd.DataFrame((lhd / lhd.sum(axis=0)).transpose())
    # tag every sentence row with the position of the document it came from
    prob["doc"] = [i for i, d in enumerate(docs) for s in d]
    # average the sentence probabilities to get the document probability
    prob = prob.groupby("doc").mean()
    # groupby only yields documents that contributed at least one sentence;
    # reindex so sentence-less documents are kept (as NaN) instead of silently
    # shifting every later row relative to the caller's labels
    return prob.reindex(pd.Index(range(len(docs)), name="doc"))

### Get a training/testing split of the entire dataset with a CV loop. Randomly sample a subset of 200 training reviews and train a pair of Word2Vec models (negative and positive) on it; repeat 100 times. Append all trained models to a meta list for storage.

In [16]:
# Row counts: labeled set first, then the unlabeled set.
for frame in (df, unlabeled_train):
    print(len(frame))

25000
50000


In [12]:
# Dry run of the cross-validation split: nothing is trained, the loop only
# reports how many rows land in the training and test part of each fold.
y = df['boolean_label'].values

# a fixed random_state keeps the shuffled, stratified folds repeatable
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

cv_score = []

i = 1
for train_index, test_index in skf:
    print("ROUND", i)
    i += 1
    # the fold indices are positional, hence iloc
    train1 = df.iloc[train_index]
    test1 = df.iloc[test_index]
    print(len(train1))
    print(len(test1))

('ROUND', 1)
20000
5000
('ROUND', 2)
20000
5000
('ROUND', 3)
20000
5000
('ROUND', 4)
20000
5000
('ROUND', 5)
20000
5000


In [39]:
# Size of the ensemble and of each random training subsample.
N_SUBMODELS = 100
SUBSAMPLE_SIZE = 200


def _reviews_to_sentences(reviews):
    """Parse every review in `reviews` and return one flat list of sentences.

    Sentences are produced by KaggleWord2VecUtility.review_to_sentences with
    the notebook-level `tokenizer`; order follows the order of `reviews`.
    """
    parsed = []
    for review in reviews:
        parsed += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    return parsed


# wall-clock timing of the whole cell (a = start, b = end)
a = datetime.datetime.now().replace(microsecond=0)

y = df['boolean_label'].values

#set random state
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

cv_score = []

i = 1
for train_index, test_index in skf:

    # Only the first fold is evaluated: i is bumped to 2 below and never
    # changes again, so every later fold is skipped.
    if i != 1:
        continue

    print("ROUND", i)
    i = i + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    #PARSE ALL TRAINING SENTENCES (this fold's labeled rows, then the unlabeled set)
    sentences = _reviews_to_sentences(train1['abstract'])

    print("Parsing sentences from unlabeled set")
    sentences += _reviews_to_sentences(unlabeled_train["review"])

    ## BUILD BASEMODEL ON ALL SENTS
    # The base model only gets its vocabulary here; it is never trained itself.
    basemodel = Word2Vec(
        sentences=None,
        size=100, #default
        window=5, #default
        workers=1,   # single worker thread for a fully deterministically-reproducible run
        iter=1, # iter = sweeps of SGD through the data; more is better
        hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
    )

    basemodel.build_vocab(sentences)

    #BUILD TEST SET: one list of sentences per test review, in test1 order
    print("Parsing test sentences")
    docs = [KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            for review in test1["abstract"]]

    # per-sub-model results, all indexed by sub-model number
    # NOTE(review): modsMeta keeps two deep copies of the base model per
    # iteration; this may need a lot of memory - confirm it is required.
    modsMeta = []
    metaPreds = []
    metaProbs = []

    print("Building and training w2v models")
    for j in range(N_SUBMODELS):

        # Randomly drawn subset of the training fold. random.sample returns
        # distinct positions, i.e. this samples WITHOUT replacement.
        indices = np.asarray(random.sample(xrange(train1.shape[0]), SUBSAMPLE_SIZE))
        train_sub = train1.iloc[indices]

        # Split the subsample's sentences by class label.
        is_pos = (train_sub['boolean_label'] == 1).values
        is_neg = (train_sub['boolean_label'] == 0).values
        sentences_pos = _reviews_to_sentences(train_sub['abstract'][is_pos])
        sentences_neg = _reviews_to_sentences(train_sub['abstract'][is_neg])

        #train models: index 0 = negative class, index 1 = positive class.
        # The loop variable must not be called y: in Python 2 a list
        # comprehension variable leaks and would overwrite the label array.
        models = [deepcopy(basemodel) for copy_no in range(2)]
        models[0].train(sentences_neg, total_examples=len(sentences_neg) )
        models[1].train(sentences_pos, total_examples=len(sentences_pos) )

        probs = docprob(docs,models).astype(object)
        metaProbs.append(probs)

        # default to 1 (positive) and flip to 0 where the negative model wins
        predictions = np.ones((probs.shape[0]))
        predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model

        #append 0/1 predictions to array to store
        metaPreds.append(predictions)

        #append to modsMeta array to store model
        modsMeta.append(models)

    #NEXT: EVALUATE TEST SET ON THE ENSEMBLE AND ASSIGN HIGHEST VOTED VALUE
    print("Evaluating ensemble models on test set")
    print(len(metaPreds))

    # fraction of sub-models that voted 1 for each test record
    vote_fraction = np.mean(metaPreds, axis=0)
    # majority vote; an exact tie (0.5) counts as 1
    ensemblePreds = (vote_fraction >= 0.5).astype(int)
    print(ensemblePreds)

    # NOTE(review): the AUC is computed from hard 0/1 votes, not from scores.
    score = roc_auc_score(test1["boolean_label"], ensemblePreds)
    print(score)

    #cv_score.append(score)

#print("-----------------------------------------------")
#print("The averaged roc_auc over 5 folds is:")
#print np.sum(cv_score, dtype=np.float64)/len(cv_score)

b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing sentences from unlabeled set
Parsing test sentences
Building and training w2v models
Evaluating ensemble models on test set
100
[1 1 1 ..., 1 1 1]
0.5476
0:20:35


In [38]:
# Summarize printed arrays that have more than 10 elements. This changes
# NumPy's print options for the whole kernel, not just for this cell.
#change to 'nan' if want the entire array
# NOTE(review): newer NumPy releases may reject threshold=nan; sys.maxsize is
# the documented way to always print the full array - confirm for the NumPy
# version in use.
np.set_printoptions(threshold=10)
ensemblePreds

array([1, 1, 1, ..., 1, 1, 1])

### Repeat of the above, but this time using the sum of the probability scores

In [20]:
#TO COPY

In [18]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        """Convert a document to a list of lower-case words.

        review           -- raw text, possibly containing HTML markup
        remove_stopwords -- when true, drop NLTK's English stop words
                            (false by default)

        Returns a list of words; an empty list when no letters remain.
        """
        # 1. Remove HTML. The parser is named explicitly so the result does
        #    not depend on which optional parsers happen to be installed (and
        #    so bs4 does not warn about guessing one on every call).
        review_text = BeautifulSoup(review, "html.parser").get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        #
        # 5. Return a list of words
        return words

    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        """Convert a raw review to a single string of space-separated words.

        The input is a single string (a raw movie review), and the output is
        a single string (a preprocessed movie review). The cleaning steps are
        exactly those of review_to_wordlist, which this delegates to.
        """
        words = KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords)
        # Join the words back into one string separated by space
        return " ".join(words)

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        """Split a review into parsed sentences.

        review           -- raw review, either UTF-8 encoded bytes or text
        tokenizer        -- object whose tokenize(text) returns the sentences
        remove_stopwords -- forwarded to review_to_wordlist

        Returns a list of sentences, where each sentence is a list of words.
        """
        # Decode only byte strings: calling .decode on text that is already
        # unicode fails (implicit ASCII round-trip on Python 2, no such
        # method on Python 3).
        if isinstance(review, bytes):
            review = review.decode('utf8')
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if raw_sentence:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append(
                    KaggleWord2VecUtility.review_to_wordlist(raw_sentence, remove_stopwords))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists)
        return sentences