In [8]:
import pickle
import sys
import os
import datetime

import sklearn
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

In [9]:
# Read data from files
df = pd.read_csv('Reviews.csv')

In [10]:
df.columns = ['Id', 'ProductId', 'UserId', "ProfileName", 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'label', 'Time', 'title', 'abstract']

In [11]:
df[0:2]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,label,Time,title,abstract
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [12]:
#randomly subset those with label 5 to be more similar to actual amount (*** and most importantly to reduce size of dataset)
df_5 = df.loc[df['label'] == 5]
df_5 = df_5.sample(n=50000, random_state=10)
print len(df_5)

In [13]:
print len(df.loc[df['label'] != 5])
df_allElse = df.loc[df['label'] != 5]

205332


In [17]:
df = pd.concat([df_allElse, df_5])
print len(df)

255332


### 5-fold cross validation to create bags of words and random forests

In [18]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['label'].values
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

#if we want to set max features or not for the BOWs
maxF = 5000

j = 1
avg = []
avg1 = []
avg2 = []
avg3 = []
avg4 = []
avg5 = []
for train_index, test_index in skf:

    print("ROUND", j)
    j = j + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    print "Cleaning and parsing the training set abstracts...\n"
    # Initialize an empty list to hold the clean reviews
    clean_train_reviews = []

    # Loop over each review; create an index i that goes from 0 to the length
    # of the pandas df column
    for i in range(train1["abstract"].size):
        # Call our function for each one, and add the result to the list of clean reviews
        words = KaggleWord2VecUtility.review_to_words( train1["abstract"].iloc[i])
        clean_train_reviews.append(words)

    print len(clean_train_reviews)

    #with tfidf word weighting
    #uses L2 norm by default
    print "Creating the bag of words...\n"

    # Initialize the "TfidfVectorizer" object, which is scikit-learn's
    # bag of words tool.  
    vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                            #already removed stopwords when preprocessing reviews into words
                             stop_words = None,
                            max_features = maxF)

    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of 
    # strings.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    # Numpy arrays are easy to work with, so convert the result to an 
    # array
    train_data_features = train_data_features.toarray()

    #Moving on to test set...
    # Create an empty list and append the clean reviews one by one
    clean_test_reviews = [] 

    print "Cleaning and parsing the test set abstracts...\n"
    for i in range(test1["abstract"].size):    
        words = KaggleWord2VecUtility.review_to_words( test1["abstract"].iloc[i])
        clean_test_reviews.append(words)

    print "Adding tfidf weights and converting to Bag of Words..."
    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = vectorizer.transform(clean_test_reviews)
    test_data_features = test_data_features.toarray()

    #------------------------------------------------------------------------------------------------------------    
    print "Training the random forest..."

    #binarize labels
    x = label_binarize(train1['label'], classes=[1,2,3,4,5])
    n_classes = x.shape[1]

    x_test = label_binarize(test1['label'], classes=[1,2,3,4,5])

    # Initialize a Random Forest classifier with 100 trees
    forest = OneVsRestClassifier(RandomForestClassifier(n_estimators = 100, max_depth=1000, n_jobs=-1))

    # Fit the forest to the training set, using the bag of words as 
    # features and the sentiment labels as the response variable
    #
    # This may take a few minutes to run
    forest = forest.fit( train_data_features, x )

    print "Using the random forest to make sentiment label predictions..."
    result = forest.predict(test_data_features)

    print "Scoring the test set"

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(x_test[:, i], result[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    print roc_auc

    avg1.append(roc_auc[0])
    avg2.append(roc_auc[1])
    avg3.append(roc_auc[2])
    avg4.append(roc_auc[3])
    avg5.append(roc_auc[4])

    avg.append(roc_auc[0])
    avg.append(roc_auc[1])
    avg.append(roc_auc[2])
    avg.append(roc_auc[3])
    avg.append(roc_auc[4])
    
print("average of Score 1 model:", sum(avg1)/float(len(avg1)))
print("average of Score 2 model:", sum(avg2)/float(len(avg2)))
print("average of Score 3 model:", sum(avg3)/float(len(avg3)))
print("average of Score 4 model:", sum(avg4)/float(len(avg4)))
print("average of Score 5 model:", sum(avg5)/float(len(avg5)))
print ""
print("average across all models:", sum(avg)/float(len(avg)))

b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Cleaning and parsing the training set abstracts...

204265
Creating the bag of words...

Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
{0: 0.7642716318468824, 1: 0.69189420071649144, 2: 0.69394892572259159, 3: 0.71699524681116911, 4: 0.65852990965982416}
('ROUND', 2)
Cleaning and parsing the training set abstracts...

204265
Creating the bag of words...

Cleaning and parsing the test set abstracts...

Adding tfidf weights and converting to Bag of Words...
Training the random forest...
Using the random forest to make sentiment label predictions...
Scoring the test set
{0: 0.76087296903290735, 1: 0.69351193328206606, 2: 0.69987059551621256, 3: 0.71802271254118355, 4: 0.65404809701219946}
('ROUND', 3)
Cleaning and parsing the training set abstracts...

204265
Creating the bag of words...

Cleaning and parsin

In [16]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

from nltk import PorterStemmer

class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False, stemmer=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
            
        if stemmer:
            words = [PorterStemmer().stem_word(w) for w in words]
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences