In [1]:
import pickle
import os
import sys
import datetime
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
import multiprocessing
from copy import deepcopy
import sklearn.metrics as metrics
from sklearn.cross_validation import StratifiedKFold

In [2]:
import nltk
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

### Read in datafile

In [3]:
# Read data from files
df = pd.read_csv('Reviews.csv')

In [4]:
df['Text'].loc[0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [5]:
print len(df)
print df["Text"].size

568454
568454


In [6]:
df.columns = ['Id', 'ProductId', 'UserId', "ProfileName", 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'label', 'Time', 'title', 'abstract']

In [7]:
df[0:2]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,label,Time,title,abstract
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


### Check label distribution-- almost all reviews have a score of 5...

In [8]:
#we are concerned about rating
print len(df.loc[df['label'] == 5])
print len(df.loc[df['label'] == 4])
print len(df.loc[df['label'] == 3])
print len(df.loc[df['label'] == 2])
print len(df.loc[df['label'] == 1])
print len(df.loc[df['label'] == 0])

363122
80655
42640
29769
52268
0


In [9]:
#randomly subset those with label 5 to be more similar to actual amount (*** and most importantly to reduce size of dataset)
df_5 = df.loc[df['label'] == 5]
df_5 = df_5.sample(n=50000, random_state=10)

Identical to the dataset we used in the 'building_20x_train_test_sets' to get indices! (because of the random state) This is great. Means that the indices are stratified properly.

In [10]:
print(df_5["Id"])

459791    459792
87685      87686
114816    114817
69163      69164
403978    403979
489490    489491
16699      16700
167783    167784
3031        3032
123399    123400
18678      18679
308376    308377
477631    477632
208595    208596
318447    318448
566580    566581
139461    139462
368180    368181
46761      46762
51156      51157
425272    425273
94097      94098
378813    378814
117134    117135
255187    255188
175663    175664
378800    378801
468850    468851
283333    283334
218783    218784
           ...  
360349    360350
256097    256098
100767    100768
166940    166941
210771    210772
170471    170472
176188    176189
527239    527240
471804    471805
529558    529559
275867    275868
473884    473885
421308    421309
382277    382278
551521    551522
289265    289266
298675    298676
261497    261498
187481    187482
71664      71665
509082    509083
223115    223116
538357    538358
477887    477888
376105    376106
318347    318348
105912    105913
524759    5247

In [11]:
print len(df_5)

50000


In [12]:
print len(df.loc[df['label'] != 5])
df_allElse = df.loc[df['label'] != 5]

205332


In [13]:
df = pd.concat([df_allElse, df_5])
print len(df)

255332


In [14]:
print len(df.loc[df['label'] == 5])
print len(df.loc[df['label'] == 4])
print len(df.loc[df['label'] == 3])
print len(df.loc[df['label'] == 2])
print len(df.loc[df['label'] == 1])
print len(df.loc[df['label'] == 0])

50000
80655
42640
29769
52268
0


### Read in indices from pickle file

In [15]:
trainIndices = pickle.load(open('trainIndicesAFFR.p','rb'))
testIndices = pickle.load(open('testIndicesAFFR.p','rb'))

In [16]:
print(len(trainIndices[0]))
print(len(testIndices))

5
25


### Apply Word2Vec Inversion

In [17]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""

def docprob(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

In [18]:
def buildModels(train1, test1, docs, basemodel, label):
    
    score = []
    
    #add boolean labels
    train1['boolean_label'] = 1
    for row in train1.iterrows():
        if row[1]['label'] != label:
            train1.set_value(row[0], 'boolean_label', 0)
            
    test1['boolean_label'] = 1
    for row in test1.iterrows():
        if row[1]['label'] != label:
            test1.set_value(row[0], 'boolean_label', 0)
    
    # ****** Split the training set into clean sentences
    #
    sentences_pos = []  # Initialize an empty list of sentences
    sentences_neg = []  # Initialize an empty list of sentences

    #here change to include all journal name labels of positive and negative
    inxs_pos = np.where(train1['boolean_label'] == 1)[0].tolist()
    inxs_neg = np.where(train1['boolean_label'] == 0)[0].tolist()

    for inx in inxs_pos:
        review = train1["abstract"].iloc[inx]
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    for inx in inxs_neg:
        review = train1["abstract"].iloc[inx]
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
         
    #train models
    models = [deepcopy(basemodel) for y in range(2)]
    models[0].train(sentences_neg, total_examples=len(sentences_neg) )
    models[1].train(sentences_pos, total_examples=len(sentences_pos) )
    
    probs = docprob(docs,models).astype(object)
    
    predictions = np.ones((probs.shape[0]))

    predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model
    
    score.append(metrics.roc_auc_score(test1["boolean_label"], predictions))
    
    score.append(metrics.accuracy_score(test1["boolean_label"], predictions))
    
    score.append(metrics.f1_score(test1["boolean_label"], predictions))
    
    score.append(metrics.precision_score(test1["boolean_label"], predictions))
    
    score.append(metrics.recall_score(test1["boolean_label"], predictions))
    
    print(len(score))
    
    return score

In [20]:
a = datetime.datetime.now().replace(microsecond=0)

#CROSS VALIDATION
#r=0-24, our meta-folds

ROCs = []
accuracies = []
F1_scores = []
precisions = []
recalls = []

ROCs_1 = []
accuracies_1 = []
F1_scores_1 = []
precisions_1 = []
recalls_1 = []

ROCs_2 = []
accuracies_2 = []
F1_scores_2 = []
precisions_2 = []
recalls_2 = []

ROCs_3 = []
accuracies_3 = []
F1_scores_3 = []
precisions_3 = []
recalls_3 = []

ROCs_4 = []
accuracies_4 = []
F1_scores_4 = []
precisions_4 = []
recalls_4 = []

ROCs_5 = []
accuracies_5 = []
F1_scores_5 = []
precisions_5 = []
recalls_5 = []

for r in range(len(trainIndices)):
    
    print("METAROUND", r+1)

    m = 1
    
    avgROC_all = []
    avgACC_all = []
    avgF1_all = []
    avgPREC_all = []
    avgREC_all = []

    avgROC_1 = []
    avgACC_1 = []
    avgF1_1 = []
    avgPREC_1 = []
    avgREC_1 = []
    
    avgROC_2 = []
    avgACC_2 = []
    avgF1_2 = []
    avgPREC_2 = []
    avgREC_2 = []
    
    avgROC_3 = []
    avgACC_3 = []
    avgF1_3 = []
    avgPREC_3 = []
    avgREC_3 = []
    
    avgROC_4 = []
    avgACC_4 = []
    avgF1_4 = []
    avgPREC_4 = []
    avgREC_4 = []
    
    avgROC_5 = []
    avgACC_5 = []
    avgF1_5 = []
    avgPREC_5 = []
    avgREC_5 = []
    
    #i=0-4, our 5-folds
    for i in range(len(trainIndices[r])):

        print("ROUND", m)
        m = m + 1
        #use the indexes to subset the df pandas dataframe
        train1, test1 = df.iloc[trainIndices[r][i]], df.iloc[testIndices[r][i]]

        # ****** Split the labeled and unlabeled training sets into clean sentences
        #
        sentences = []  # Initialize an empty list of sentences

        for j in range(len(train1)):
            review = train1['abstract'].iloc[j]
            sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

        print "Building and training w2v models"
        ## create a w2v learner 
        basemodel = Word2Vec(
            sentences=None,
            size=100, #default
            window=5, #default
            workers=multiprocessing.cpu_count(),   # use your cores
            iter=3, # iter = sweeps of SGD through the data; more is better
            hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
        )
        basemodel.build_vocab(sentences) 

        print "Parsing test sentences"
        # read in the test set as a list of a list of words

        docs = []
        for k in range(len(test1)):
            review = test1["abstract"].iloc[k]
            sents = KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            docs.append(sents)

        print "scoring test set"
        tup1 = buildModels(train1, test1, docs, basemodel, 1)
        
        avgROC_1.append(tup1[0])
        avgACC_1.append(tup1[1])
        avgF1_1.append(tup1[2])
        avgPREC_1.append(tup1[3])
        avgREC_1.append(tup1[4])
        
        avgROC_all.append(tup1[0])
        avgACC_all.append(tup1[1])
        avgF1_all.append(tup1[2])
        avgPREC_all.append(tup1[3])
        avgREC_all.append(tup1[4])

        tup2 = buildModels(train1, test1, docs, basemodel, 2)
        
        avgROC_2.append(tup2[0])
        avgACC_2.append(tup2[1])
        avgF1_2.append(tup2[2])
        avgPREC_2.append(tup2[3])
        avgREC_2.append(tup2[4])
        
        avgROC_all.append(tup2[0])
        avgACC_all.append(tup2[1])
        avgF1_all.append(tup2[2])
        avgPREC_all.append(tup2[3])
        avgREC_all.append(tup2[4])

        tup3 = buildModels(train1, test1, docs, basemodel, 3)

        avgROC_3.append(tup3[0])
        avgACC_3.append(tup3[1])
        avgF1_3.append(tup3[2])
        avgPREC_3.append(tup3[3])
        avgREC_3.append(tup3[4])
        
        avgROC_all.append(tup3[0])
        avgACC_all.append(tup3[1])
        avgF1_all.append(tup3[2])
        avgPREC_all.append(tup3[3])
        avgREC_all.append(tup3[4])

        tup4 = buildModels(train1, test1, docs, basemodel, 4)
        
        avgROC_4.append(tup4[0])
        avgACC_4.append(tup4[1])
        avgF1_4.append(tup4[2])
        avgPREC_4.append(tup4[3])
        avgREC_4.append(tup4[4])
        
        avgROC_all.append(tup4[0])
        avgACC_all.append(tup4[1])
        avgF1_all.append(tup4[2])
        avgPREC_all.append(tup4[3])
        avgREC_all.append(tup4[4])

        tup5 = buildModels(train1, test1, docs, basemodel, 5)

        avgROC_5.append(tup5[0])
        avgACC_5.append(tup5[1])
        avgF1_5.append(tup5[2])
        avgPREC_5.append(tup5[3])
        avgREC_5.append(tup5[4])
        
        avgROC_all.append(tup5[0])
        avgACC_all.append(tup5[1])
        avgF1_all.append(tup5[2])
        avgPREC_all.append(tup5[3])
        avgREC_all.append(tup5[4])


#     print("average of Score 1 model:", sum(avg1)/float(len(avg1)))
#     print("average of Score 2 model:", sum(avg2)/float(len(avg2)))
#     print("average of Score 3 model:", sum(avg3)/float(len(avg3)))
#     print("average of Score 4 model:", sum(avg4)/float(len(avg4)))
#     print("average of Score 5 model:", sum(avg5)/float(len(avg5)))

    print("average ROC across all models:", sum(avgROC_all)/float(len(avgROC_all)))
    print("average ACC across all models:", sum(avgACC_all)/float(len(avgACC_all)))
    print("average F1 across all models:", sum(avgF1_all)/float(len(avgF1_all)))
    print("average PREC across all models:", sum(avgPREC_all)/float(len(avgPREC_all)))
    print("average REC across all models:", sum(avgREC_all)/float(len(avgREC_all)))
    
    ROCs.append(sum(avgROC_all)/float(len(avgROC_all)))
    accuracies.append(sum(avgACC_all)/float(len(avgACC_all)))
    F1_scores.append(sum(avgF1_all)/float(len(avgF1_all)))
    precisions.append(sum(avgPREC_all)/float(len(avgPREC_all)))
    recalls.append(sum(avgREC_all)/float(len(avgREC_all)))
                     
    ROCs_1.append(sum(avgROC_1)/float(len(avgROC_1)))
    accuracies_1.append(sum(avgACC_1)/float(len(avgACC_1)))
    F1_scores_1.append(sum(avgF1_1)/float(len(avgF1_1)))
    precisions_1.append(sum(avgPREC_1)/float(len(avgPREC_1)))
    recalls_1.append(sum(avgREC_1)/float(len(avgREC_1)))
                                      
    ROCs_2.append(sum(avgROC_2)/float(len(avgROC_2)))
    accuracies_2.append(sum(avgACC_2)/float(len(avgACC_2)))
    F1_scores_2.append(sum(avgF1_2)/float(len(avgF1_2)))
    precisions_2.append(sum(avgPREC_2)/float(len(avgPREC_2)))
    recalls_2.append(sum(avgREC_2)/float(len(avgREC_2)))
                     
    ROCs_3.append(sum(avgROC_3)/float(len(avgROC_3)))
    accuracies_3.append(sum(avgACC_3)/float(len(avgACC_3)))
    F1_scores_3.append(sum(avgF1_3)/float(len(avgF1_3)))
    precisions_3.append(sum(avgPREC_3)/float(len(avgPREC_3)))
    recalls_3.append(sum(avgREC_3)/float(len(avgREC_3)))
                     
    ROCs_4.append(sum(avgROC_4)/float(len(avgROC_4)))
    accuracies_4.append(sum(avgACC_4)/float(len(avgACC_4)))
    F1_scores_4.append(sum(avgF1_4)/float(len(avgF1_4)))
    precisions_4.append(sum(avgPREC_4)/float(len(avgPREC_4)))
    recalls_4.append(sum(avgREC_4)/float(len(avgREC_4)))
                     
    ROCs_5.append(sum(avgROC_5)/float(len(avgROC_5)))
    accuracies_5.append(sum(avgACC_5)/float(len(avgACC_5)))
    F1_scores_5.append(sum(avgF1_5)/float(len(avgF1_5)))
    precisions_5.append(sum(avgPREC_5)/float(len(avgPREC_5)))
    recalls_5.append(sum(avgREC_5)/float(len(avgREC_5)))
    
    print("--------------------------------------------------")

print("------------------------------------------------------")
print("FINAL RESULTS:")
print("ROCs:")
print(ROCs)
print("accuracies:")
print(accuracies)
print("F1-scores:")
print(F1_scores)
print("precisions:")
print(precisions)
print("recalls:")
print(recalls)

print("FINAL RESULTS FOR CLASS 1:")
print("ROCs:")
print(ROCs_1)
print("accuracies:")
print(accuracies_1)
print("F1-scores:")
print(F1_scores_1)
print("precisions:")
print(precisions_1)
print("recalls:")
print(recalls_1)

print("FINAL RESULTS FOR CLASS 2:")
print("ROCs:")
print(ROCs_2)
print("accuracies:")
print(accuracies_2)
print("F1-scores:")
print(F1_scores_2)
print("precisions:")
print(precisions_2)
print("recalls:")
print(recalls_2)

print("FINAL RESULTS FOR CLASS 3:")
print("ROCs:")
print(ROCs_3)
print("accuracies:")
print(accuracies_3)
print("F1-scores:")
print(F1_scores_3)
print("precisions:")
print(precisions_3)
print("recalls:")
print(recalls_3)

print("FINAL RESULTS FOR CLASS 4:")
print("ROCs:")
print(ROCs_4)
print("accuracies:")
print(accuracies_4)
print("F1-scores:")
print(F1_scores_4)
print("precisions:")
print(precisions_4)
print("recalls:")
print(recalls_4)

print("FINAL RESULTS FOR CLASS 5:")
print("ROCs:")
print(ROCs_5)
print("accuracies:")
print(accuracies_5)
print("F1-scores:")
print(F1_scores_5)
print("precisions:")
print(precisions_5)
print("recalls:")
print(recalls_5)
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('METAROUND', 1)
('ROUND', 1)




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Building and training w2v models
Parsing test sentences
scoring test set


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


5
5
5
5
5
('ROUND', 2)
Building and training w2v models
Parsing test sentences
scoring test set
5
5
5
5
5
('ROUND', 3)
Building and training w2v models
Parsing test sentences
scoring test set
5
5
5
5
5
('ROUND', 4)
Building and training w2v models
Parsing test sentences
scoring test set
5
5
5
5
5
('ROUND', 5)
Building and training w2v models
Parsing test sentences
scoring test set
5
5
5
5
5
('average ROC across all models:', 0.73049814334535978)
('average ACC across all models:', 0.83774066478898834)
('average F1 across all models:', 0.5708536235784355)
('average PREC across all models:', 0.57707209030671713)
('average REC across all models:', 0.56711197502079314)
--------------------------------------------------
('METAROUND', 2)
('ROUND', 1)
Building and training w2v models
Parsing test sentences
scoring test set
5
5
5
5
5
('ROUND', 2)
Building and training w2v models
Parsing test sentences
scoring test set
5
5
5
5
5
('ROUND', 3)
Building and training w2v models
Parsing test sentence

### Getting confidence intervals...I think this is the way to do it? Get feedback

In [21]:
import scipy.stats as st

print(" Accuracy")
print(st.t.interval(0.95, len(accuracies)-1, loc=np.mean(accuracies), scale=st.sem(accuracies)))
print("ROC")
print(st.t.interval(0.95, len(ROCs)-1, loc=np.mean(ROCs), scale=st.sem(ROCs)))
print("F1 score")
print(st.t.interval(0.95, len(F1_scores)-1, loc=np.mean(F1_scores), scale=st.sem(F1_scores)))
print("Precision")
print(st.t.interval(0.95, len(precisions)-1, loc=np.mean(precisions), scale=st.sem(precisions)))
print("Recall")
print(st.t.interval(0.95, len(recalls)-1, loc=np.mean(recalls), scale=st.sem(recalls)))

 Accuracy
(0.83672414799015027, 0.83708124924899641)
ROC
(0.73087622946727948, 0.73147195848685009)
F1 score
(0.57083998677905889, 0.57144670147916765)
Precision
(0.57399353515929774, 0.57506294180410844)
Recall
(0.56893575578603683, 0.57080132330880939)


In [19]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences