In [103]:
#import pickle
#import os
import sys
import datetime
import pandas as pd
import numpy as np
import nltk.data
from gensim.models import Word2Vec
#import multiprocessing
from copy import deepcopy
from sklearn.metrics import roc_auc_score    
from sklearn.cross_validation import StratifiedKFold
#import array
import random

In [4]:
np.random.seed(10)

In [7]:
import os.path
sys.path.append(os.path.expanduser('~/code/eol_hsrl_python'))
os.environ['PYTHONHASHSEED']='10'
#is this working?? no idea

In [10]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

### Read in datafile

In [15]:
# Read data from files
df = pd.read_csv( 'labeledTrainData.tsv', header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0,  delimiter="\t", quoting=3 )

In [19]:
df[0:10]

Unnamed: 0,id,boolean_label,abstract
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


In [18]:
df.columns = ['id', 'boolean_label', 'abstract']

In [65]:
"""
docprob takes two lists
* docs: a list of documents, each of which is a list of sentences
* models: the candidate word2vec models (each potential class)

it returns the array of class probabilities.  Everything is done in-memory.
"""

def docprob(docs, mods):
    # score() takes a list [s] of sentences here; could also be a sentence generator
    sentlist = [s for d in docs for s in d]
    # the log likelihood of each sentence in this review under each w2v representation
    llhd = np.array( [ m.score(sentlist, len(sentlist)) for m in mods ] )
    # now exponentiate to get likelihoods, 
    lhd = np.exp(llhd - llhd.max(axis=0)) # subtract row max to avoid numeric overload
    # normalize across models (stars) to get sentence-star probabilities
    #all this transposing business does is make it so the total probability of a word 
    #   equals 1 between the 2 arrays (positive prob and negative).
    #and the pandas data frame just puts everything into rows/columns format for easy viz
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    # and finally average the sentence probabilities to get the review probability
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

### get training and testing split on entire dataset with CV loop. randomly sample a subset of size 200 and build a Word2Vec model on it, repeat 100 times. Append all trained models to a meta array to store.

In [131]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['boolean_label'].values

#set random state
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

cv_score = []

i = 1
for train_index, test_index in skf:

    print("ROUND", i)
    i = i + 1
    #use the indexes to subset the df pandas dataframe
    train1, test1 = df.iloc[train_index], df.iloc[test_index]

    #BUILD TEST SET
    print "Parsing test sentences" #into positive and negative groups so know true label

    docs = []
    for k in range(len(test1)):
        review = test1["abstract"].iloc[k]
        sents = KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
        docs.append(sents)

    #ADD: create models array to append all models to
    modsMeta = []
    metaPreds = []
    metaProbs = []

    print "Building and training w2v models"
    #SET UP LOOP TO BUILD 100 SUB-MODELS
    for j in range(100):

        #get randomly drawn subset from training set-
        #NOTE: sampling *with* replacement
        #size of 200

        indices = random.sample(xrange(train1.shape[0]),200)
        indices = np.asarray(indices)

        #use the indexes to subset the df pandas dataframe and get the associated rows
        train_sub = train1.iloc[indices]
        #print train_sub.shape

        # Word2Vec Algorithm- only building basemodel vocab and training on randomly drawn subset

        sentences = []  # Initialize an empty list of sentences
        for l in range(len(train_sub)):
            review = train_sub['abstract'].iloc[l]
            sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

        #print "Parsing sentences from unlabeled set"
        #for review in unlabeled_train["review"]:
        #    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

        ## create a w2v learner 
        basemodel = Word2Vec(
            sentences=None,
            size=100, #default
            window=5, #default
            workers=1,   # don't use cores- use single worker thread for fully deterministically-reproducible run
            iter=1, # iter = sweeps of SGD through the data; more is better
            hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
        )

        basemodel.build_vocab(sentences) 

        # ****** Split the training set into clean sentences
        #
        sentences_pos = []  # Initialize an empty list of sentences
        sentences_neg = []  # Initialize an empty list of sentences

        #here change to include all journal name labels of positive and negative
        inxs_pos = np.where(train_sub['boolean_label'] == 1)[0].tolist()
        inxs_neg = np.where(train_sub['boolean_label'] == 0)[0].tolist()

        for inx in inxs_pos:
            review = train_sub["abstract"].iloc[inx]
            sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

        for inx in inxs_neg:
            review = train_sub["abstract"].iloc[inx]
            sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

        #train models
        models = [deepcopy(basemodel) for y in range(2)]
        models[0].train(sentences_neg, total_examples=len(sentences_neg) )
        models[1].train(sentences_pos, total_examples=len(sentences_pos) )

        probs = docprob(docs,models).astype(object)
        metaProbs.append(probs)

        predictions = np.ones((probs.shape[0]))

        predictions[np.where(probs.iloc[:,0] > 0.5)] = 0 # The first column is the negative model

        #append 0/1 predictions to array to store
        metaPreds.append(predictions)

        #append to modsMeta array to store model
        modsMeta.append(models)

    #NEXT: EVALUATE TEST SET ON THE ENSEMBLE AND ASSIGN HIGHEST VOTED VALUE
    print "Evaluating ensemble models on test set"
    ensemblePreds = []

    print len(metaPreds)

    #for each test record,
    for m in range(len(metaPreds[0])):
        sum = 0
        #get ensemble prediction
        for arr in metaPreds:
            #sum the scores received from each model for that review
            sum = sum + arr[m]
        #take the average to see if more models gave a score of 0 or 1
        avg = sum/len(metaPreds)
        #append overall 'winning' prediction
        if avg >= 0.5:
            ensemblePreds.append(1)
        else: 
            ensemblePreds.append(0)

    ensemblePreds = np.asarray(ensemblePreds)
    print ensemblePreds

    score = roc_auc_score(test1["boolean_label"], ensemblePreds)
    print score

    cv_score.append(score)
    
print("-----------------------------------------------")
print("The averaged roc_auc over 5 folds is:")
print np.sum(cv_score, dtype=np.float64)/len(cv_score)
                        
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing test sentences
Building and training w2v models
Evaluating ensemble models on test set
100
[1 1 0 ..., 0 1 0]
0.7928
('ROUND', 2)
Parsing test sentences
Building and training w2v models
Evaluating ensemble models on test set
100
[0 0 0 ..., 1 1 0]
0.7904
('ROUND', 3)
Parsing test sentences
Building and training w2v models
Evaluating ensemble models on test set
100
[1 1 0 ..., 0 0 0]
0.8058
('ROUND', 4)
Parsing test sentences
Building and training w2v models
Evaluating ensemble models on test set
100
[1 0 1 ..., 1 0 1]
0.8102
('ROUND', 5)
Parsing test sentences
Building and training w2v models
Evaluating ensemble models on test set
100
[0 0 0 ..., 1 0 1]
0.8094


TypeError: 'numpy.float64' object is not callable

In [140]:
print("-----------------------------------------------")
print("The averaged roc_auc over 5 folds is:")
print np.sum(cv_score, dtype=np.float64)/len(cv_score)

-----------------------------------------------
The averaged roc_auc over 5 folds is:
0.80172


In [141]:
train1.shape

(20000, 3)

In [142]:
test1.shape

(5000, 3)

In [143]:
len(modsMeta)

100

In [144]:
#models stored at different memory locations- should check that this is proper
modsMeta[0:10]

[[<gensim.models.word2vec.Word2Vec at 0x7f18015f8950>,
  <gensim.models.word2vec.Word2Vec at 0x7f180221c510>],
 [<gensim.models.word2vec.Word2Vec at 0x7f181c4a59d0>,
  <gensim.models.word2vec.Word2Vec at 0x7f180d72d7d0>],
 [<gensim.models.word2vec.Word2Vec at 0x7f181bc23890>,
  <gensim.models.word2vec.Word2Vec at 0x7f180cfc61d0>],
 [<gensim.models.word2vec.Word2Vec at 0x7f18157f1050>,
  <gensim.models.word2vec.Word2Vec at 0x7f17ef58ad10>],
 [<gensim.models.word2vec.Word2Vec at 0x7f1814586bd0>,
  <gensim.models.word2vec.Word2Vec at 0x7f17fbf98110>],
 [<gensim.models.word2vec.Word2Vec at 0x7f180200de10>,
  <gensim.models.word2vec.Word2Vec at 0x7f17ef76ad90>],
 [<gensim.models.word2vec.Word2Vec at 0x7f17f80eacd0>,
  <gensim.models.word2vec.Word2Vec at 0x7f180fe293d0>],
 [<gensim.models.word2vec.Word2Vec at 0x7f180d6c1710>,
  <gensim.models.word2vec.Word2Vec at 0x7f17ef0bc2d0>],
 [<gensim.models.word2vec.Word2Vec at 0x7f181bd96890>,
  <gensim.models.word2vec.Word2Vec at 0x7f18019062d0>],
 

### repeat of above but this time using the probability scores sum

In [109]:
a = datetime.datetime.now().replace(microsecond=0)

y = df['boolean_label'].values

#set random state
skf = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=10)

cv_score = []

i = 1
for train_index, test_index in skf:
    
    if i == 1:

        print("ROUND", i)
        i = i + 1
        #use the indexes to subset the df pandas dataframe
        train1, test1 = df.iloc[train_index], df.iloc[test_index]

        #BUILD TEST SET
        print "Parsing test sentences" #into positive and negative groups so know true label

        docs = []
        for k in range(len(test1)):
            review = test1["abstract"].iloc[k]
            sents = KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
            docs.append(sents)

        #ADD: create models array to append all models to
        modsMeta = []
        metaPreds = []
        metaProbsPos = []
        metaProbsNeg = []

        print "Building and training w2v models"
        #SET UP LOOP TO BUILD 100 SUB-MODELS
        for j in range(100):

            #get randomly drawn subset from training set-
            #NOTE: sampling *with* replacement
            #size of 200

            indices = random.sample(xrange(train1.shape[0]),200)
            indices = np.asarray(indices)

            #use the indexes to subset the df pandas dataframe and get the associated rows
            train_sub = train1.iloc[indices]
            #print train_sub.shape

            # Word2Vec Algorithm- only building basemodel vocab and training on randomly drawn subset

            sentences = []  # Initialize an empty list of sentences
            for l in range(len(train_sub)):
                review = train_sub['abstract'].iloc[l]
                sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

            #print "Parsing sentences from unlabeled set"
            #for review in unlabeled_train["review"]:
            #    sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

            ## create a w2v learner 
            basemodel = Word2Vec(
                sentences=None,
                size=100, #default
                window=5, #default
                workers=1,   # don't use cores- use single worker thread for fully deterministically-reproducible run
                iter=1, # iter = sweeps of SGD through the data; more is better
                hs=1, negative=0 # we only have scoring for the hierarchical softmax setup
            )

            basemodel.build_vocab(sentences) 

            # ****** Split the training set into clean sentences
            #
            sentences_pos = []  # Initialize an empty list of sentences
            sentences_neg = []  # Initialize an empty list of sentences

            #here change to include all journal name labels of positive and negative
            inxs_pos = np.where(train_sub['boolean_label'] == 1)[0].tolist()
            inxs_neg = np.where(train_sub['boolean_label'] == 0)[0].tolist()

            for inx in inxs_pos:
                review = train_sub["abstract"].iloc[inx]
                sentences_pos += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

            for inx in inxs_neg:
                review = train_sub["abstract"].iloc[inx]
                sentences_neg += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

            #train models
            models = [deepcopy(basemodel) for y in range(2)]
            models[0].train(sentences_neg, total_examples=len(sentences_neg) )
            models[1].train(sentences_pos, total_examples=len(sentences_pos) )

            probs = docprob(docs,models).astype(object)

            probsPos = probs.iloc[:,0].values

            probsNeg = probs.iloc[:,1].values

            #append 0/1 predictions to array to store
            metaProbsPos.append(probsPos)
            metaProbsNeg.append(probsNeg)

            #append to modsMeta array to store model
            modsMeta.append(models)

        #NEXT: EVALUATE TEST SET ON THE ENSEMBLE AND ASSIGN HIGHEST VOTED VALUE
#         print "Evaluating ensemble models on test set"
#         ensemblePreds = []

#         print len(metaPreds)

#         #for each test record,
#         for m in range(len(metaPreds[0])):
#             sum = 0
#             #get ensemble prediction
#             for arr in metaPreds:
#                 #sum the scores received from each model for that review
#                 sum = sum + arr[m]
#             #take the average to see if more models gave a score of 0 or 1
#             avg = sum/len(metaPreds)
#             #append overall 'winning' prediction
#             if avg >= 0.5:
#                 ensemblePreds.append(1)
#             else: 
#                 ensemblePreds.append(0)

#         ensemblePreds = np.asarray(ensemblePreds)
#         print ensemblePreds

#         score = roc_auc_score(test1["boolean_label"], ensemblePreds)
#         print score

        #cv_score.append(score)
    
#print("-----------------------------------------------")
#print("The averaged roc_auc over 5 folds is:")
#print np.sum(cv_score, dtype=np.float64)/len(cv_score)
                        
b = datetime.datetime.now().replace(microsecond=0)
print(b-a)

('ROUND', 1)
Parsing test sentences
Building and training w2v models
0:00:31


In [132]:
print probs

             0         1
doc                     
0     0.497906  0.502094
1     0.498477  0.501523
2     0.505409  0.494591
3     0.506429  0.493571
4     0.505806  0.494194
5     0.502609  0.497391
6     0.495817  0.504183
7     0.493731  0.506269
8     0.508329  0.491671
9     0.495422  0.504578
10     0.50761   0.49239
11    0.514158  0.485842
12    0.491369  0.508631
13    0.502536  0.497464
14     0.50127   0.49873
15    0.506424  0.493576
16    0.498985  0.501015
17    0.507005  0.492995
18    0.500511  0.499489
19    0.504316  0.495684
20     0.49867   0.50133
21    0.502826  0.497174
22     0.50237   0.49763
23    0.504566  0.495434
24    0.499047  0.500953
25    0.500693  0.499307
26    0.498172  0.501828
27         0.5       0.5
28    0.503046  0.496954
29    0.514925  0.485075
...        ...       ...
4970  0.511155  0.488845
4971  0.498353  0.501647
4972  0.506089  0.493911
4973  0.499111  0.500889
4974  0.493281  0.506719
4975  0.499493  0.500507
4976  0.503043  0.496957


In [130]:
print probsPos
print probsNeg

print "Evaluating ensemble models on test set"
ensemblePreds = []

print len(metaPreds)

#for each test record,
for m in range(len(metaPreds[0])):
    sum = 0
    #get ensemble prediction
    for arr in metaPreds:
        #sum the scores received from each model for that review
        sum = sum + arr[m]
    #take the average to see if more models gave a score of 0 or 1
    avg = sum/len(metaPreds)
    #append overall 'winning' prediction
    if avg >= 0.5:
        ensemblePreds.append(1)
    else: 
        ensemblePreds.append(0)

ensemblePreds = np.asarray(ensemblePreds)
print ensemblePreds

[0.4960238039493561 0.4999992549419403 0.49933910369873047 ..., 0
 0.4936591684818268 0]
[0.5039762258529663 0.5000007152557373 0.5006608963012695 ...,
 0.49954429268836975 0.5063408613204956 0.4926792085170746]


### get to this (and all other code below) later once all models are built

In [10]:
print "Parsing test sentences" #into positive and negative groups so know true label

sentences_pos_test = []  # Initialize an empty list of sentences
sentences_neg_test = []  # Initialize an empty list of sentences
    
inxs_pos_test = np.where(test1['boolean_label'] == 1)[0].tolist()
inxs_neg_test = np.where(test1['boolean_label'] == 0)[0].tolist()


for inx in inxs_pos_test[0:10]:
    review = test1["abstract"].iloc[inx]
    sentences_pos_test += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    #print inx

for inx in inxs_neg_test[0:10]:
    review = test1["abstract"].iloc[inx]
    sentences_neg_test += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
    #print inx

Parsing test sentences


In [11]:
#save matrices in new variables to load each time, as well as the negative and positive trained models
neg1 = models[0].syn0
neg2 = models[0].syn1
modNeg = models[0]
modPos = models[1]
pos1 = models[1].syn0
pos2 = models[1].syn1

In [12]:
print sentences_pos_test[0]

[u'i', u'am', u'amazed', u'that', u'movies', u'like', u'this', u'can', u'still', u'be', u'made']


In [13]:
#retrain model using a test sentence- 
modPos.train(sentences_pos_test[0], total_examples=1 )
modNeg.train(sentences_pos_test[0], total_examples=1 )

31

### Now look at weight matrices again....how much have they changed?

In [17]:
#look at dims
print neg1.shape
print neg2.shape
print pos1.shape
print pos2.shape

(39795, 50)
(39795, 50)
(39795, 50)
(39795, 50)


### TODO: test that the original weights are actually being stored and not also updated. Then, try for every word in the sentence by recoding the below script.

In [19]:
#all = []
#for sent in sentences_pos_test:
#make copies of models
#    copymodPos = modPos
#    copymodNeg = modNeg
a=[]
#retrain model using a test sentence
#    copymodPos.train(sent, total_examples=1 )
#    copymodNeg.train(sent, total_examples=1 )
#for each word....replace the 0's 

#NOTE: Changed FROM COPYMODNEG TO MODNEG AND SAME FOR MODPOS
for i in range(models[0].syn0.shape[0]):
    a.append(np.subtract(neg1[i], modNeg.syn0[i]))
    a.append(np.subtract(neg2[i], modNeg.syn1[i]))
    a.append(np.subtract(pos1[i], modPos.syn0[i]))
    a.append(np.subtract(pos2[i], modPos.syn1[i]))

    if np.any(np.not_equal(neg1[i],modNeg.syn0[i])):
        print "1"
        print i
        print neg1[i]
        print sent
    elif np.any(np.not_equal(neg2[i],modNeg.syn1[i])):
        print "2"
        print i
        print neg2[i]
        print sent
    elif np.any(np.not_equal(pos1[i],modPos.syn0[i])):
        print "3"
        print i
        print pos1[i]
        print sent
    elif np.any(np.not_equal(pos2[i],modPos.syn1[i])):
        print "4"
        print i
        print pos2[i]
        print sent

#then also append the score to the list*** TODO

#all.append(a)


In [None]:
#repeat for sentences in sentences_neg_test

In [14]:
print modPos["this"]
print models[1]["this"]
#still identical

[  6.72277287e-02   1.26543835e-01   9.93497670e-03  -5.40592745e-02
  -2.65558302e-01  -9.68429521e-02   8.76064673e-02  -1.33147120e-01
   4.74467762e-02   1.02694966e-01   2.79902313e-02  -5.03830649e-02
  -1.29035428e-01   8.29034764e-03   4.33472246e-02   2.05678880e-01
   1.59809381e-01   3.38250399e-02  -2.10265428e-01  -1.30878299e-01
  -1.95322976e-01  -2.52045747e-02   1.68052465e-01  -4.24853526e-02
   1.19723156e-01   4.54059616e-02   4.93345559e-02  -6.80889115e-02
  -2.11434722e-01  -1.75505623e-01  -1.63846120e-01   8.19133222e-02
   8.09772909e-02   6.10496514e-02   3.02647091e-02   5.76601438e-02
   6.89781224e-03   2.18035445e-01   1.95809379e-01  -1.67943567e-01
   5.93783334e-05   7.96362162e-02  -3.42507352e-04  -8.06212574e-02
   1.97106436e-01  -4.97829355e-02  -9.23664309e-03   1.54408872e-01
  -9.93900597e-02  -1.28648549e-01]
[  6.72277287e-02   1.26543835e-01   9.93497670e-03  -5.40592745e-02
  -2.65558302e-01  -9.68429521e-02   8.76064673e-02  -1.33147120e-0

In [45]:
print modNeg["this"]
print models[0]["this"]
#still identical

[  1.12128623e-01  -5.90351527e-04   1.90263346e-01  -2.04363316e-01
  -1.25054032e-01   1.41420200e-01   2.07533948e-02  -5.16388237e-01
  -2.71282345e-01  -2.74658352e-01   4.18581665e-01   6.85895383e-01
   1.61994889e-01   1.41382843e-01   2.19165564e-01   6.37463620e-03
   1.48713633e-01   4.50969517e-01   2.83410162e-01  -2.03582317e-01
  -1.41550124e-01  -3.66136700e-01   1.05822659e+00  -7.81226233e-02
   3.40196371e-01  -4.13311690e-01  -1.34569913e-01  -5.01110256e-02
   5.74484766e-01   4.05226871e-02   1.15871266e-01  -9.73553479e-01
  -1.10163584e-01  -3.38168532e-01   9.53039765e-01  -1.01441734e-01
   4.13049966e-01   4.90189455e-02  -4.38492864e-01  -5.91962934e-02
  -9.26226377e-01   4.83608961e-01  -7.15108156e-01   3.62209111e-01
   6.68655038e-01   2.67568588e-01   3.60578388e-01   7.26784527e-01
  -2.97102273e-01   4.04141605e-01  -4.31873351e-01  -1.42890409e-01
   3.50398183e-01   3.67146939e-01   7.04841539e-02  -1.48633216e-02
  -1.14153242e+00  -3.92584890e-01

In [47]:
#mini testing
mods = []
ab = [1, 2]
cd = [3, 4]
mods.append(ab)
mods.append(cd)
print(mods)

[[1, 2], [3, 4]]


In [39]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords


class KaggleWord2VecUtility(object):
    """KaggleWord2VecUtility is a utility class for processing raw HTML text into segments for further learning"""

    @staticmethod
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
    
    @staticmethod
    def review_to_words( review, remove_stopwords=False ):
        # Function to convert a raw review to a string of words
        # The input is a single string (a raw movie review), and 
        # the output is a single string (a preprocessed movie review)
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text() 
        #
        # 2. Remove non-letters        
        review_text = re.sub("[^a-zA-Z]", " ", review_text) 
        #
        # 3. Convert to lower case, split into individual words
        words = review_text.lower().split()                             
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]   
        #
        # 6. Join the words back into one string separated by space, 
        # and return the result.
        return( " ".join( words ))   

    # Define a function to split a review into parsed sentences
    @staticmethod
    def review_to_sentences( review, tokenizer, remove_stopwords=False ):
        # Function to split a review into parsed sentences. Returns a
        # list of sentences, where each sentence is a list of words
        #
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
                  remove_stopwords ))
        #
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences