In [1]:
import pickle
import os
import sys
import datetime
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
import multiprocessing
from copy import deepcopy
import sklearn.metrics as metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [3]:
# Read data from files
df = pd.read_csv('Reviews.csv')

In [4]:
df.columns = ['Id', 'ProductId', 'UserId', "ProfileName", 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'label', 'Time', 'title', 'abstract']

In [5]:
df[0:2]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,label,Time,title,abstract
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [6]:
df_5 = df.loc[df['label'] == 5]
df_5 = df_5.sample(n=50000, random_state=10)

df_allElse = df.loc[df['label'] != 5]

df = pd.concat([df_allElse, df_5])

### Read in indices from pickle file

In [7]:
trainIndices = pickle.load(open('trainIndicesAFFR.p','rb'))
testIndices = pickle.load(open('testIndicesAFFR.p','rb'))

### Apply Word2Vec Inversion to get sentence scores

In [8]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""

def docprobsentences(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    #prob = prob.groupby("doc").mean()
    return prob

In [9]:
def getFeatures(probssentences_train, train1):
    features = np.zeros((np.unique(probssentences_train["doc"]).shape[0],11))
    label = []
    for doc in np.unique(probssentences_train["doc"]):
        inx = np.where(probssentences_train["doc"] == doc)[0]
        subset = probssentences_train[0].iloc[inx]
        score = train1['boolean_label'].iloc[doc]
        label.append(score)

        inxFirst = inx[0]
        inxLast = inx[-1]

        features[doc,0] = np.mean(subset) #mean of scores
        features[doc,1] = np.min(subset) #min score
        features[doc,2] = np.max(subset) #max score
        features[doc,3] = np.std(subset) #stdev of scores
        features[doc,4] = probssentences_train[0].iloc[inxFirst] #score of first sentence
        features[doc,5] = probssentences_train[0].iloc[inxLast] #score of last sentences
        features[doc,6] = len(subset) #number of sentences
        features[doc,7] = np.mean(                               #sigmoid (with k = 10)
                            1./(1+np.exp(-10*
                               subset.astype(float)-0.5 ))) 
        features[doc,8] = np.mean(                               #sigmoid (with k = 20)
                            1./(1+np.exp(-20*
                               subset.astype(float)-0.5 )))
        features[doc,9] = np.argmin(subset)-inxFirst #position of min score
        features[doc,10] = np.argmax(subset)-inxFirst #position of max score
    
    return (features, label)

In [10]:
def buildModels(train1, test1, docs, docsTrain, basemodel, label):
    
    scoreDT = []
    scoreRF = []
    
    #add boolean labels
    train1['boolean_label'] = 1
    for row in train1.iterrows():
        if row[1]['label'] != label:
            train1.set_value(row[0], 'boolean_label', 0)
            
    test1['boolean_label'] = 1
    for row in test1.iterrows():
        if row[1]['label'] != label:
            test1.set_value(row[0], 'boolean_label', 0)
    
    # ****** Split the labeled training set into clean sentences
    #
    sentences_pos = []  # Initialize an empty list of sentences
    sentences_neg = []  # Initialize an empty list of sentences

    inxs_pos = np.where(train1["boolean_label"] == 1)[0].tolist()
    inxs_neg = np.where(train1["boolean_label"] == 0)[0].tolist()

    print "Parsing sentences from training set"
    for inx in inxs_pos:
        review = train1["abstract"].iloc[inx]
        sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    for inx in inxs_neg:
        review = train1["abstract"].iloc[inx]
        sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)    

    models = [deepcopy(basemodel) for y in range(2)]
    models[0].train(sentences_neg, total_examples=len(sentences_neg) )
    models[1].train(sentences_pos, total_examples=len(sentences_pos) )

    print "scoring test set"
    # get the probs (note we give docprob our test set plus the models)
    probssentencesArticle = docprobsentences(docs,models).astype(object)

    probssentences_train = docprobsentences(docsTrain,models).astype(object)
        
    #step 1- get training features
    obj = getFeatures(probssentences_train, train1)
    features = obj[0]
    label = obj[1]

    #step 2- build decision tree
    clf = tree.DecisionTreeClassifier(min_samples_leaf=1000)
    #criterion = 'entropy
    clf = clf.fit(features, label)

    print "decision tree performance on self"
    resultTrain = clf.predict(features)

    print metrics.roc_auc_score(label, resultTrain)

    #step 3- build random forest

    forest = RandomForestClassifier()
    forest = forest.fit( features, label )

    print "random forest performance on self"
    resultTrainRF = forest.predict(features)
    print metrics.roc_auc_score(label, resultTrainRF)

    #step 4- get test features
    test = getFeatures(probssentencesArticle, test1)
    featuresT = test[0]
    labelsT = test[1]

    #step 5- get test set performance on decision tree 
    result = clf.predict(featuresT)
    
    scoreDT.append(metrics.roc_auc_score(labelsT,result))
    
    scoreDT.append(metrics.accuracy_score(labelsT,result))
    
    scoreDT.append(metrics.f1_score(labelsT,result))
    
    scoreDT.append(metrics.precision_score(labelsT,result))
    
    scoreDT.append(metrics.recall_score(labelsT,result))

    #step 6- get test set performance on random forest
    clf2 = forest.predict(featuresT)
    
    scoreRF.append(metrics.roc_auc_score(labelsT,clf2))
    
    scoreRF.append(metrics.accuracy_score(labelsT,clf2))
    
    scoreRF.append(metrics.f1_score(labelsT,clf2))
    
    scoreRF.append(metrics.precision_score(labelsT,clf2))
    
    scoreRF.append(metrics.recall_score(labelsT,clf2))
    
    return scoreDT, scoreRF, models, probssentencesArticle, probssentences_train, clf, forest

In [13]:
a = datetime.datetime.now().replace(microsecond=0)

#CROSS VALIDATION
#r=0-24, our meta-folds

ROCsDT = []
accuraciesDT = []
F1_scoresDT = []
precisionsDT = []
recallsDT = []

ROCs_1DT = []
accuracies_1DT = []
F1_scores_1DT = []
precisions_1DT = []
recalls_1DT = []

ROCs_2DT = []
accuracies_2DT = []
F1_scores_2DT = []
precisions_2DT = []
recalls_2DT = []

ROCs_3DT = []
accuracies_3DT = []
F1_scores_3DT = []
precisions_3DT = []
recalls_3DT = []

ROCs_4DT = []
accuracies_4DT = []
F1_scores_4DT = []
precisions_4DT = []
recalls_4DT = []

ROCs_5DT = []
accuracies_5DT = []
F1_scores_5DT = []
precisions_5DT = []
recalls_5DT = []

ROCsRF = []
accuraciesRF = []
F1_scoresRF = []
precisionsRF = []
recallsRF = []

ROCs_1RF = []
accuracies_1RF = []
F1_scores_1RF = []
precisions_1RF = []
recalls_1RF = []

ROCs_2RF = []
accuracies_2RF = []
F1_scores_2RF = []
precisions_2RF = []
recalls_2RF = []

ROCs_3RF = []
accuracies_3RF = []
F1_scores_3RF = []
precisions_3RF = []
recalls_3RF = []

ROCs_4RF = []
accuracies_4RF = []
F1_scores_4RF = []
precisions_4RF = []
recalls_4RF = []

ROCs_5RF = []
accuracies_5RF = []
F1_scores_5RF = []
precisions_5RF = []
recalls_5RF = []

# avgDT_All = []
# avgDT_1 = []
# avgDT_2 = []
# avgDT_3 = []
# avgDT_4 = []
# avgDT_5 = []
# avgRF_All = []
# avgRF_1 = []
# avgRF_2 = []
# avgRF_3 = []
# avgRF_4 = []
# avgRF_5 = []

for r in range(len(trainIndices)):
    
    print("METAROUND", r+1)

    m = 1
    
    avgROC_allDT = []
    avgACC_allDT = []
    avgF1_allDT = []
    avgPREC_allDT = []
    avgREC_allDT = []

    avgROC_1DT = []
    avgACC_1DT = []
    avgF1_1DT = []
    avgPREC_1DT = []
    avgREC_1DT = []
    
    avgROC_2DT = []
    avgACC_2DT = []
    avgF1_2DT = []
    avgPREC_2DT = []
    avgREC_2DT = []
    
    avgROC_3DT = []
    avgACC_3DT = []
    avgF1_3DT = []
    avgPREC_3DT = []
    avgREC_3DT = []
    
    avgROC_4DT = []
    avgACC_4DT = []
    avgF1_4DT = []
    avgPREC_4DT = []
    avgREC_4DT = []
    
    avgROC_5DT = []
    avgACC_5DT = []
    avgF1_5DT = []
    avgPREC_5DT = []
    avgREC_5DT = []
    
    avgROC_allRF = []
    avgACC_allRF = []
    avgF1_allRF = []
    avgPREC_allRF = []
    avgREC_allRF = []

    avgROC_1RF = []
    avgACC_1RF = []
    avgF1_1RF = []
    avgPREC_1RF = []
    avgREC_1RF = []
    
    avgROC_2RF = []
    avgACC_2RF = []
    avgF1_2RF = []
    avgPREC_2RF = []
    avgREC_2RF = []
    
    avgROC_3RF = []
    avgACC_3RF = []
    avgF1_3RF = []
    avgPREC_3RF = []
    avgREC_3RF = []
    
    avgROC_4RF = []
    avgACC_4RF = []
    avgF1_4RF = []
    avgPREC_4RF = []
    avgREC_4RF = []
    
    avgROC_5RF = []
    avgACC_5RF = []
    avgF1_5RF = []
    avgPREC_5RF = []
    avgREC_5RF = []
    
    #i=0-4, our 5-folds
    for i in range(len(trainIndices[r])):

        print("ROUND", m)
        m = m + 1
        #use the indexes to subset the df pandas dataframe
        train1, test1 = df.iloc[trainIndices[r][i]], df.iloc[testIndices[r][i]]
    
        # ****** Split the labeled and unlabeled training sets into clean sentences
        #
        sentences = []  # Initialize an empty list of sentences
        docsTrain = []
        for review in train1["abstract"]:
            result = KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            sentences += result
            docsTrain.append(result)

        print "Building and training w2v models"
        ## create a w2v learner 
        basemodel = Word2Vec(
            workers=multiprocessing.cpu_count(), # use your cores
            iter=3, # iter = sweeps of SGD through the data; more is better
            hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
            )
        basemodel.build_vocab(sentences) 

        print "Parsing sentences from test set"
        # read in the test set as a list of a list of words
        docs = []
        for review in test1["abstract"]:
            docs.append(KaggleWord2VecUtility.review_to_sentences(review, tokenizer))

        print "Scoring test set"
        tup1 = buildModels(train1, test1, docs, docsTrain, basemodel, 1)
        score_DT = tup1[0]
        score_RF = tup1[1]
        
        avgROC_allDT.append(score_DT[0])
        avgACC_allDT.append(score_DT[1])
        avgF1_allDT.append(score_DT[2])
        avgPREC_allDT.append(score_DT[3])
        avgREC_allDT.append(score_DT[4])

        avgROC_1DT.append(score_DT[0])
        avgACC_1DT.append(score_DT[1])
        avgF1_1DT.append(score_DT[2])
        avgPREC_1DT.append(score_DT[3])
        avgREC_1DT.append(score_DT[4])

        avgROC_allRF.append(score_RF[0])
        avgACC_allRF.append(score_RF[1])
        avgF1_allRF.append(score_RF[2])
        avgPREC_allRF.append(score_RF[3])
        avgREC_allRF.append(score_RF[4])

        avgROC_1RF.append(score_RF[0])
        avgACC_1RF.append(score_RF[1])
        avgF1_1RF.append(score_RF[2])
        avgPREC_1RF.append(score_RF[3])
        avgREC_1RF.append(score_RF[4])

        tup2 = buildModels(train1, test1, docs, docsTrain, basemodel, 2)

        score_DT = tup2[0]
        score_RF = tup2[1]
        
        avgROC_allDT.append(score_DT[0])
        avgACC_allDT.append(score_DT[1])
        avgF1_allDT.append(score_DT[2])
        avgPREC_allDT.append(score_DT[3])
        avgREC_allDT.append(score_DT[4])

        avgROC_2DT.append(score_DT[0])
        avgACC_2DT.append(score_DT[1])
        avgF1_2DT.append(score_DT[2])
        avgPREC_2DT.append(score_DT[3])
        avgREC_2DT.append(score_DT[4])

        avgROC_allRF.append(score_RF[0])
        avgACC_allRF.append(score_RF[1])
        avgF1_allRF.append(score_RF[2])
        avgPREC_allRF.append(score_RF[3])
        avgREC_allRF.append(score_RF[4])

        avgROC_2RF.append(score_RF[0])
        avgACC_2RF.append(score_RF[1])
        avgF1_2RF.append(score_RF[2])
        avgPREC_2RF.append(score_RF[3])
        avgREC_2RF.append(score_RF[4])

        tup3 = buildModels(train1, test1, docs, docsTrain, basemodel, 3)

        score_DT = tup3[0]
        score_RF = tup3[1]
        
        avgROC_allDT.append(score_DT[0])
        avgACC_allDT.append(score_DT[1])
        avgF1_allDT.append(score_DT[2])
        avgPREC_allDT.append(score_DT[3])
        avgREC_allDT.append(score_DT[4])

        avgROC_3DT.append(score_DT[0])
        avgACC_3DT.append(score_DT[1])
        avgF1_3DT.append(score_DT[2])
        avgPREC_3DT.append(score_DT[3])
        avgREC_3DT.append(score_DT[4])

        avgROC_allRF.append(score_RF[0])
        avgACC_allRF.append(score_RF[1])
        avgF1_allRF.append(score_RF[2])
        avgPREC_allRF.append(score_RF[3])
        avgREC_allRF.append(score_RF[4])

        avgROC_3RF.append(score_RF[0])
        avgACC_3RF.append(score_RF[1])
        avgF1_3RF.append(score_RF[2])
        avgPREC_3RF.append(score_RF[3])
        avgREC_3RF.append(score_RF[4])
        
        tup4 = buildModels(train1, test1, docs, docsTrain, basemodel, 4)

        score_DT = tup4[0]
        score_RF = tup4[1]
        
        avgROC_allDT.append(score_DT[0])
        avgACC_allDT.append(score_DT[1])
        avgF1_allDT.append(score_DT[2])
        avgPREC_allDT.append(score_DT[3])
        avgREC_allDT.append(score_DT[4])

        avgROC_4DT.append(score_DT[0])
        avgACC_4DT.append(score_DT[1])
        avgF1_4DT.append(score_DT[2])
        avgPREC_4DT.append(score_DT[3])
        avgREC_4DT.append(score_DT[4])

        avgROC_allRF.append(score_RF[0])
        avgACC_allRF.append(score_RF[1])
        avgF1_allRF.append(score_RF[2])
        avgPREC_allRF.append(score_RF[3])
        avgREC_allRF.append(score_RF[4])

        avgROC_4RF.append(score_RF[0])
        avgACC_4RF.append(score_RF[1])
        avgF1_4RF.append(score_RF[2])
        avgPREC_4RF.append(score_RF[3])
        avgREC_4RF.append(score_RF[4])

        tup5 = buildModels(train1, test1, docs, docsTrain, basemodel, 5)

        score_DT = tup5[0]
        score_RF = tup5[1]
        
        avgROC_allDT.append(score_DT[0])
        avgACC_allDT.append(score_DT[1])
        avgF1_allDT.append(score_DT[2])
        avgPREC_allDT.append(score_DT[3])
        avgREC_allDT.append(score_DT[4])

        avgROC_5DT.append(score_DT[0])
        avgACC_5DT.append(score_DT[1])
        avgF1_5DT.append(score_DT[2])
        avgPREC_5DT.append(score_DT[3])
        avgREC_5DT.append(score_DT[4])

        avgROC_allRF.append(score_RF[0])
        avgACC_allRF.append(score_RF[1])
        avgF1_allRF.append(score_RF[2])
        avgPREC_allRF.append(score_RF[3])
        avgREC_allRF.append(score_RF[4])

        avgROC_5RF.append(score_RF[0])
        avgACC_5RF.append(score_RF[1])
        avgF1_5RF.append(score_RF[2])
        avgPREC_5RF.append(score_RF[3])
        avgREC_5RF.append(score_RF[4])

    print("average DT ROC:", sum(avgROC_allDT)/float(len(avgROC_allDT)))
    print("average DT ACC:", sum(avgACC_allDT)/float(len(avgACC_allDT)))
    print("average DT F1:", sum(avgF1_allDT)/float(len(avgF1_allDT)))
    print("average DT PREC:", sum(avgPREC_allDT)/float(len(avgPREC_allDT)))
    print("average DT REC:", sum(avgREC_allDT)/float(len(avgREC_allDT)))
    
    print("average RF ROC:", sum(avgROC_allRF)/float(len(avgROC_allRF)))
    print("average RF ACC:", sum(avgACC_allRF)/float(len(avgACC_allRF)))
    print("average RF F1:", sum(avgF1_allRF)/float(len(avgF1_allRF)))
    print("average RF PREC:", sum(avgPREC_allRF)/float(len(avgPREC_allRF)))
    print("average RF REC:", sum(avgREC_allRF)/float(len(avgREC_allRF)))
    
    ROCsDT.append(sum(avgROC_allDT)/float(len(avgROC_allDT)))
    accuraciesDT.append(sum(avgACC_allDT)/float(len(avgACC_allDT)))
    F1_scoresDT.append(sum(avgF1_allDT)/float(len(avgF1_allDT)))
    precisionsDT.append(sum(avgPREC_allDT)/float(len(avgPREC_allDT)))
    recallsDT.append(sum(avgREC_allDT)/float(len(avgREC_allDT)))
        
    ROCsRF.append(sum(avgROC_allRF)/float(len(avgROC_allRF)))
    accuraciesRF.append(sum(avgACC_allRF)/float(len(avgACC_allRF)))
    F1_scoresRF.append(sum(avgF1_allRF)/float(len(avgF1_allRF)))
    precisionsRF.append(sum(avgPREC_allRF)/float(len(avgPREC_allRF)))
    recallsRF.append(sum(avgREC_allRF)/float(len(avgREC_allRF)))
                     
    ROCs_1DT.append(sum(avgROC_1DT)/float(len(avgROC_1DT)))
    accuracies_1DT.append(sum(avgACC_1DT)/float(len(avgACC_1DT)))
    F1_scores_1DT.append(sum(avgF1_1DT)/float(len(avgF1_1DT)))
    precisions_1DT.append(sum(avgPREC_1DT)/float(len(avgPREC_1DT)))
    recalls_1DT.append(sum(avgREC_1DT)/float(len(avgREC_1DT)))
                                      
    ROCs_2DT.append(sum(avgROC_2DT)/float(len(avgROC_2DT)))
    accuracies_2DT.append(sum(avgACC_2DT)/float(len(avgACC_2DT)))
    F1_scores_2DT.append(sum(avgF1_2DT)/float(len(avgF1_2DT)))
    precisions_2DT.append(sum(avgPREC_2DT)/float(len(avgPREC_2DT)))
    recalls_2DT.append(sum(avgREC_2DT)/float(len(avgREC_2DT)))
                     
    ROCs_3DT.append(sum(avgROC_3DT)/float(len(avgROC_3DT)))
    accuracies_3DT.append(sum(avgACC_3DT)/float(len(avgACC_3DT)))
    F1_scores_3DT.append(sum(avgF1_3DT)/float(len(avgF1_3DT)))
    precisions_3DT.append(sum(avgPREC_3DT)/float(len(avgPREC_3DT)))
    recalls_3DT.append(sum(avgREC_3DT)/float(len(avgREC_3DT)))
                     
    ROCs_4DT.append(sum(avgROC_4DT)/float(len(avgROC_4DT)))
    accuracies_4DT.append(sum(avgACC_4DT)/float(len(avgACC_4DT)))
    F1_scores_4DT.append(sum(avgF1_4DT)/float(len(avgF1_4DT)))
    precisions_4DT.append(sum(avgPREC_4DT)/float(len(avgPREC_4DT)))
    recalls_4DT.append(sum(avgREC_4DT)/float(len(avgREC_4DT)))
                     
    ROCs_5DT.append(sum(avgROC_5DT)/float(len(avgROC_5DT)))
    accuracies_5DT.append(sum(avgACC_5DT)/float(len(avgACC_5DT)))
    F1_scores_5DT.append(sum(avgF1_5DT)/float(len(avgF1_5DT)))
    precisions_5DT.append(sum(avgPREC_5DT)/float(len(avgPREC_5DT)))
    recalls_5DT.append(sum(avgREC_5DT)/float(len(avgREC_5DT)))
    
    ROCs_1RF.append(sum(avgROC_1RF)/float(len(avgROC_1RF)))
    accuracies_1RF.append(sum(avgACC_1RF)/float(len(avgACC_1RF)))
    F1_scores_1RF.append(sum(avgF1_1RF)/float(len(avgF1_1RF)))
    precisions_1RF.append(sum(avgPREC_1RF)/float(len(avgPREC_1RF)))
    recalls_1RF.append(sum(avgREC_1RF)/float(len(avgREC_1RF)))
                                      
    ROCs_2RF.append(sum(avgROC_2RF)/float(len(avgROC_2RF)))
    accuracies_2RF.append(sum(avgACC_2RF)/float(len(avgACC_2RF)))
    F1_scores_2RF.append(sum(avgF1_2RF)/float(len(avgF1_2RF)))
    precisions_2RF.append(sum(avgPREC_2RF)/float(len(avgPREC_2RF)))
    recalls_2RF.append(sum(avgREC_2RF)/float(len(avgREC_2RF)))
                     
    ROCs_3RF.append(sum(avgROC_3RF)/float(len(avgROC_3RF)))
    accuracies_3RF.append(sum(avgACC_3RF)/float(len(avgACC_3RF)))
    F1_scores_3RF.append(sum(avgF1_3RF)/float(len(avgF1_3RF)))
    precisions_3RF.append(sum(avgPREC_3RF)/float(len(avgPREC_3RF)))
    recalls_3RF.append(sum(avgREC_3RF)/float(len(avgREC_3RF)))
                     
    ROCs_4RF.append(sum(avgROC_4RF)/float(len(avgROC_4RF)))
    accuracies_4RF.append(sum(avgACC_4RF)/float(len(avgACC_4RF)))
    F1_scores_4RF.append(sum(avgF1_4RF)/float(len(avgF1_4RF)))
    precisions_4RF.append(sum(avgPREC_4RF)/float(len(avgPREC_4RF)))
    recalls_4RF.append(sum(avgREC_4RF)/float(len(avgREC_4RF)))
                     
    ROCs_5RF.append(sum(avgROC_5RF)/float(len(avgROC_5RF)))
    accuracies_5RF.append(sum(avgACC_5RF)/float(len(avgACC_5RF)))
    F1_scores_5RF.append(sum(avgF1_5RF)/float(len(avgF1_5RF)))
    precisions_5RF.append(sum(avgPREC_5RF)/float(len(avgPREC_5RF)))
    recalls_5RF.append(sum(avgREC_5RF)/float(len(avgREC_5RF)))
    
    print("--------------------------------------------------")

print("------------------------------------------------------")
print("FINAL RESULTS DECISION TREE (all):")
print("ROCs:")
print(ROCsDT)
print("accuracies:")
print(accuraciesDT)
print("F1-scores:")
print(F1_scoresDT)
print("precisions:")
print(precisionsDT)
print("recalls:")
print(recallsDT)

print("FINAL RESULTS RANDOM FOREST (all):")
print("ROCs:")
print(ROCsRF)
print("accuracies:")
print(accuraciesRF)
print("F1-scores:")
print(F1_scoresRF)
print("precisions:")
print(precisionsRF)
print("recalls:")
print(recalls)

# print("FINAL RESULTS FOR CLASS 1:")
# print("ROCs:")
# print(ROCs_1)
# print("accuracies:")
# print(accuracies_1)
# print("F1-scores:")
# print(F1_scores_1)
# print("precisions:")
# print(precisions_1)
# print("recalls:")
# print(recalls_1)

# print("FINAL RESULTS FOR CLASS 2:")
# print("ROCs:")
# print(ROCs_2)
# print("accuracies:")
# print(accuracies_2)
# print("F1-scores:")
# print(F1_scores_2)
# print("precisions:")
# print(precisions_2)
# print("recalls:")
# print(recalls_2)

# print("FINAL RESULTS FOR CLASS 3:")
# print("ROCs:")
# print(ROCs_3)
# print("accuracies:")
# print(accuracies_3)
# print("F1-scores:")
# print(F1_scores_3)
# print("precisions:")
# print(precisions_3)
# print("recalls:")
# print(recalls_3)

# print("FINAL RESULTS FOR CLASS 4:")
# print("ROCs:")
# print(ROCs_4)
# print("accuracies:")
# print(accuracies_4)
# print("F1-scores:")
# print(F1_scores_4)
# print("precisions:")
# print(precisions_4)
# print("recalls:")
# print(recalls_4)

# print("FINAL RESULTS FOR CLASS 5:")
# print("ROCs:")
# print(ROCs_5)
# print("accuracies:")
# print(accuracies_5)
# print("F1-scores:")
# print(F1_scores_5)
# print("precisions:")
# print(precisions_5)
# print("recalls:")
# print(recalls_5)
    
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('METAROUND', 1)
('ROUND', 1)




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Building and training w2v models
Parsing sentences from test set
Scoring test set


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Parsing sentences from training set
scoring test set
decision tree performance on self
0.870099713989
random forest performance on self
0.98463476206
Parsing sentences from training set
scoring test set
decision tree performance on self
0.683860653031
random forest performance on self
0.960538784725
Parsing sentences from training set
scoring test set
decision tree performance on self
0.742178570781
random forest performance on self
0.966767198797
Parsing sentences from training set
scoring test set
decision tree performance on self
0.814821160324
random forest performance on self
0.981294972457
Parsing sentences from training set
scoring test set
decision tree performance on self
0.829191950507
random forest performance on self
0.977176534487
('ROUND', 2)
Building and training w2v models
Parsing sentences from test set
Scoring test set
Parsing sentences from training set
scoring test set
decision tree performance on self
0.868019101101
random forest performance on self
0.984892985559


KeyboardInterrupt: 

In [None]:
import scipy.stats as st

print(" Accuracy")
print(st.t.interval(0.95, len(accuracies)-1, loc=np.mean(accuracies), scale=st.sem(accuracies)))
print("ROC")
print(st.t.interval(0.95, len(ROCs)-1, loc=np.mean(ROCs), scale=st.sem(ROCs)))
print("F1 score")
print(st.t.interval(0.95, len(F1_scores)-1, loc=np.mean(F1_scores), scale=st.sem(F1_scores)))
print("Precision")
print(st.t.interval(0.95, len(precisions)-1, loc=np.mean(precisions), scale=st.sem(precisions)))
print("Recall")
print(st.t.interval(0.95, len(recalls)-1, loc=np.mean(recalls), scale=st.sem(recalls)))

In [12]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences