In this notebook we split the full_data.db dataset into training, validation and test sets.
Here it is important to take class imbalance into consideration: we have a different number of reviews for
every rating.
Strategy:
* For each rating type ('delivery_time','service','quality') get the number of reviews for each star rating (1 star, 2 stars, etc.).


In [1]:
# Load the initial data. The file is read with pandas' pickle loader, so despite
# its '.db' extension it has to be a pickle (presumably of a DataFrame -- it is
# indexed with .iloc further down).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
import pandas as pd
import numpy as np
df=pd.read_pickle('full_data.db')

In [2]:
# Functions needed to create a clean version of the tokens present in a string (e.g. a review)
# NWORDS is the word table used by the spell checker below: it is queried with
# `in` and `.get`, so it is presumably a dict-like mapping word -> count (confirm).
# NOTE(review): dill.load can execute arbitrary code; only load trusted files.
import dill
with open('TrainedWordsForSpellCheck.pkl', 'rb') as f:
    NWORDS=dill.load(f)

# spell check functions
import re, collections
# letters tried by edits1() when generating replacements and insertions
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def words(text):
    """Return, in order, every maximal run of the letters a-z found in the lower-cased text."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]

def train(features):
    """Build a smoothed frequency table from an iterable of tokens.

    Every token seen n times is stored with the value n + 1, and any token
    that was never seen reads as 1 (the defaultdict's default), so no word
    ever has a zero count.
    """
    smoothed = collections.defaultdict(lambda: 1)
    for token in features:
        # start from the same baseline of 1 that unseen tokens get
        smoothed[token] = smoothed.get(token, 1) + 1
    return smoothed
def edits1(word):
    """Return the set of strings that are one edit away from *word*.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a letter of `alphabet`,
    or an insertion of a letter of `alphabet` at any position.
    """
    # every way of cutting the word in two, including the empty head/tail
    splits = []
    for cut in range(len(word) + 1):
        splits.append((word[:cut], word[cut:]))

    deletes = []
    transposes = []
    replaces = []
    inserts = []
    for head, tail in splits:
        if tail:
            deletes.append(head + tail[1:])
        if len(tail) > 1:
            transposes.append(head + tail[1] + tail[0] + tail[2:])
        for letter in alphabet:
            if tail:
                replaces.append(head + letter + tail[1:])
            inserts.append(head + letter + tail)
    return set(deletes + transposes + replaces + inserts)
def known_edits2(word):
    """Return the strings two edits away from *word* that are present in NWORDS."""
    found = set()
    for first_edit in edits1(word):
        for second_edit in edits1(first_edit):
            if second_edit in NWORDS:
                found.add(second_edit)
    return found

def known(words):
    """Return the subset of *words* (as a set) that is present in NWORDS."""
    recognised = set()
    for candidate in words:
        if candidate in NWORDS:
            recognised.add(candidate)
    return recognised

def correct(word):
    """Return the best spelling candidate for *word*.

    Candidates are taken from the first non-empty pool among: the word
    itself if it is in NWORDS, known words one edit away, known words two
    edits away, and finally the unchanged word. The candidate with the
    largest NWORDS.get value wins.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        # nothing recognised: fall back to the word as typed
        candidates = [word]
    return max(candidates, key=NWORDS.get)

def correct_top(word, n):
    """Return (best candidate, list of the top *n* candidates) for *word*.

    The candidate pool is chosen as in correct(): the word itself if known,
    else known words one edit away, else known words two edits away, else
    the unchanged word. Candidates are ranked by descending NWORDS.get value.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    ranked = list(candidates)
    ranked.sort(key=NWORDS.get, reverse=True)
    return ranked[0], ranked[:n]
########################################
# Build a tokenizer that picks out runs of word characters (the regex \w+)
# as tokens and drops everything else (whitespace, punctuation, ...).
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# module-level tokenizer shared by cleanList()
tokenizer = RegexpTokenizer(r'\w+')

# lowercase + tokenize + spellcheck + stopwords
def cleanList(textData):
    """Return the cleaned token list of one review.

    The text is lower-cased, split with the module-level `tokenizer`,
    every token is passed through correct() (spell check), and tokens that
    are English stop words are removed. Order and duplicates of the
    remaining tokens are kept.

    textData -- the raw review text (a string)
    returns  -- list of spell-corrected, non-stop-word tokens
    """
    # Fetch the stop words once per call and keep them in a set: the previous
    # version re-read the list for every token and searched it linearly.
    stopWords = set(stopwords.words('english'))
    # Correct each token by position; looking tokens up with list.index()
    # was quadratic and targeted the first occurrence instead of the current one.
    corrected = [correct(w) for w in tokenizer.tokenize(textData.lower())]
    return [w for w in corrected if w not in stopWords]


In [3]:
def getReducedDataSet(df,colName,nRows):
    """Create a class-balanced, reduced dataframe for training/validation/testing.

    input:
    df       --- the full dataset; the rating columns are picked by position
                 (3 = 'delivery_time', 4 = 'quality', 5 = 'service') and the
                 review text is taken from position 6 ('commentary')
    colName  --- the name of the rating column
                 ('delivery_time', 'quality' or 'service')
    nRows    --- the number of rows per star ([1-6]) in the output dataframe

    output:
    a dataframe with the two columns (colName, review text) holding nRows
    randomly chosen rows for star 1, then nRows for star 2, ... up to star 6.
    Rows rated 0 never appear because only stars 1-6 are selected.

    raises:
    ValueError --- colName is not one of the three supported rating columns
    IndexError --- some star has fewer than nRows reviews, so a balanced
                   sample of that size cannot be built

    The rows are shuffled on every call, so calling the function repeatedly
    'bootstraps' the initial dataframe into many different
    training/validation/testing sets (training sets are typically larger).
    """
    columnPositions = {"delivery_time": 3, "quality": 4, "service": 5}
    if colName not in columnPositions:
        # previously this fell through and failed later on an unbound local
        raise ValueError("unknown rating column %r; expected one of %s"
                         % (colName, sorted(columnPositions)))

    shuffled = df.iloc[np.random.permutation(len(df))]  # randomize rows
    TMP = shuffled.iloc[:, [columnPositions[colName], 6]]  # relevant columns

    # No explicit sort is needed: selecting each star in turn and concatenating
    # in star order already groups the output by ascending rating, while the
    # shuffle above decides which rows of each star are kept.
    frames = []
    for star in range(1, 7):
        rowsForStar = TMP.loc[TMP[colName] == star]
        if len(rowsForStar) < nRows:
            raise IndexError("only %d rows with %s == %d, but nRows=%d were requested"
                             % (len(rowsForStar), colName, star, nRows))
        frames.append(rowsForStar.iloc[list(range(nRows)), :])
    return pd.concat(frames)

# Establish the rating type and the size of an arbitrary training set.
# (The conversion of each review into a binary vector over our own token list --
# position 'i' is 1 if token 'i' is present, otherwise 0 -- is done in a later cell.)
# nRows is the number of reviews taken per star, so the set has 6*nRows rows.
nRows=5
trainingDF = getReducedDataSet(df,'quality',nRows)
# Y holds the target labels: the 'quality' star of every selected review
Y = trainingDF['quality'].tolist()




In [4]:
# Cleaning reviews and creating a list of token lists 'cleanReviews'. cleanReviews[0] will contain a list
# with the clean tokens from the first review in the dataframe 'trainingDF'
# NOTE(review): this relies on Python 2, where map() returns a list; on Python 3
# it would be a one-shot iterator and cleanReviews[0] would not work.
import time
start_time = time.time()
TMP           = trainingDF['commentary'].tolist()
# encode every review to UTF-8 before cleaning (presumably they are unicode strings -- confirm)
TMP1          = [x.encode('UTF8') for x in TMP]
cleanReviews  = map(cleanList,TMP1)
print("--- %s seconds ... cleaning and encoding...---" % (time.time() - start_time))

--- 0.743999958038 seconds ... cleaning and encoding...---


In [5]:
#OPTIONAL: saving/loading clean reviews before binarization
#import dill
#with open('cleanReviewsForBinarization01.pkl', 'wb') as f1:
#    dill.dump(cleanReviews, f1)
#with open('cleanReviewsForBinarization01.pkl', 'rb') as f:
#    cleanReviews=dill.load(f)

In [6]:
# Binarization functions. Here we avoid using 'black box' binarization to have full control over
# the Feature Vector we obtained by feature engineering. This step could probably be optimized later.
# general var names
# input:
# - (FV)  - 'Feature Vector'        (list of tokens with sentiment)
# - (TFR) - 'Tokens From Review'    (list of tokenized words from a review)
# output:
# - (BRs) - 'Binarized Reviews'     len(BRs[i]) = len(FV); if FV[j] exists in TFR then BRs[i][j]=1, else BRs[i][j]=0

# this is a test 'FV', the real one has to be obtained using feature engineering
##FV = ['caca', 'pepe', 'like', 'place', 'sauce', 'eat', 'curry', 'get', 'money', 'offer', 'food', 'spacial', 'late', 'dry', 'late']
import dill
# NOTE(review): dill.load can execute arbitrary code; only load trusted files.
with open('jeTokensWithVaderSentiments01.pkl', 'rb') as f2:
    tokens = dill.load(f2)
# keep only the first field of every entry, as a str (presumably the token text,
# with the sentiment in the remaining fields -- confirm against the pickle)
FV = [str(tokens[i][0]) for i in range(len(tokens))]

def binarizeTokenList(Val):
    """Return 1 if the token *Val* occurs in the current review, else 0.

    The current review is read from the module-level list TFR, which
    binarizeReviews() sets before calling this function.
    """
    return 1 if TFR.count(Val) > 0 else 0


def binarizeReviews(cleanReviews,FV):
    """Turn tokenized reviews into a binary presence matrix over FV.

    cleanReviews -- iterable of token lists, one per review
    FV           -- list of feature tokens
    returns      -- numpy array with one row per review and one column per
                    FV entry; element [i][j] is 1 if FV[j] occurs in review
                    i and 0 otherwise (an empty input gives an empty array)

    Side effect: the module-level TFR is left pointing at the last review
    processed, as before.
    """
    global TFR
    BRs = []
    for review in cleanReviews:
        TFR = review
        # Build the row eagerly. A bare map() is lazy on Python 3, so the row
        # would be evaluated only after TFR had moved on to another review.
        BRs.append([binarizeTokenList(token) for token in FV])
    return np.asarray(BRs)
# Binarize the cleaned training reviews and time the step
start_time = time.time()    
binaryTrainingSet = binarizeReviews(cleanReviews,FV)
print("--- %s seconds ... binarizing...---" % (time.time() - start_time))
# sanity check: there must be exactly one label per binarized review
print len(Y)
print len(binaryTrainingSet)

--- 0.018000125885 seconds ... binarizing...---
30
30


In [7]:
# OPTIONAL: saving binarized data
#import dill
#with open('binaryTrainingSetX02.pkl', 'wb') as f1:
#    dill.dump(binaryTrainingSet, f1)
#with open('binaryTrainingSetY02.pkl', 'wb') as f2:
#    dill.dump(Y, f2)

#import cPickle as pickle 
#pickle.dump( Y, open( "binaryTrainingSetY02.pkl", "wb" ) )
#pickle.dump( binaryTrainingSet, open( "binaryTrainingSetX02.pkl", "wb" ) )


In [8]:
# Training the BernoulliNB classifier on the binarized reviews (binaryTrainingSet)
# with the star ratings (Y) as labels
from sklearn.naive_bayes import BernoulliNB

start_time = time.time()
clfNB = BernoulliNB()
clfNB.fit(binaryTrainingSet, Y)
print("--- %s seconds ... training BernoulliNB ...---" % (time.time() - start_time))

# save the trained classifier to disk (dill was imported in an earlier cell)
with open('BernoulliNB_classifierTMP.pkl', 'wb') as f2:
    dill.dump(clfNB, f2)
# disabled manual check: predict two hand-made 15-element binary vectors
# (presumably sized for the 15-token test FV -- confirm before enabling)
#vv=np.asarray([[0,1,0,0,0,0,0,1,0,0,0,0,0,0,0],[0,1,0,0,0,0,0,1,0,0,0,0,0,0,0]])
#print(clfNB.predict(vv))
print type(clfNB)

--- 0.0320000648499 seconds ... training BernoulliNB ...---
<class 'sklearn.naive_bayes.BernoulliNB'>


In [9]:
#OPTIONAL: loading a saved classifier
#import dill
#with open('BernoulliNB_classifierTMP.pkl', 'rb') as f:
#    bernouNB=dill.load(f)
#print type(bernouNB)    

In [10]:
# OPTIONAL: using other machine learning algorithms
# At this point, training another algorithm, like the support vector machine (SVM) in the example below,
# is a relatively simple task, but other classifiers have their own complications and the theory behind them
# should be taken into consideration.
####################################
# Example: Training a SVM classifier on the same binarized reviews and labels
from sklearn import svm
start_time = time.time()
# 'ovr' selects the one-vs-rest shape for the decision function
clfSVM = svm.SVC(decision_function_shape='ovr')
clfSVM.fit(binaryTrainingSet, Y)
print("--- %s seconds ... training SVM...---" % (time.time() - start_time))
print type(clfSVM)


--- 0.00699996948242 seconds ... training SVM...---
<class 'sklearn.svm.classes.SVC'>


The saved Bernoulli Naive Bayes classifier can now be used for predictions independently of this pipeline. A longer Python script (bernouValidation.py) demonstrates this.