# Starbucks Capstone Challenge: Train and evaluate classifier

## Contents
1. Load prepared data. 
2. Build pipeline.
3. Define metric function for cross-validation (expected profit per user).
4. Tune classifier and record classifier performance.
5. References.

In [1]:
import pandas as pd
import pickle 
from datetime import datetime

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import multilabel_confusion_matrix, make_scorer

### 1. Load prepared data. 

In [2]:
# Read the pickled, pre-processed data set from the processed-data folder.
# NOTE: unpickling can execute arbitrary code, so only load files that
# were produced by this project's own data-preparation step.
prepared_data_path = '../data/processed/prepared_data.pkl'
prep_data = pd.read_pickle(prepared_data_path)

In [3]:
# Separate the two offer-response label columns (y) from the remaining
# columns, which serve as the features (X).
label_columns = ['bogo', 'discount']
y = prep_data[label_columns]
X = prep_data.drop(columns=label_columns)

### 2. Build pipeline

In [4]:
# Preprocessing step: one-hot encode the categorical features.

# Categorical variables to be one-hot encoded
categorical_columns = ['income_group', 'age_group', 'gender']

# Instantiate the encoder; categories not seen during fit are ignored at
# transform time instead of raising an error.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

# Apply the encoder to the categorical columns only.
preprocessing = ColumnTransformer(
    transformers=[('cat', categorical_encoder, categorical_columns)],
)

In [5]:
# Instantiate the classifier: a Bernoulli naive Bayes estimator wrapped in
# MultiOutputClassifier so that each label column gets its own fitted model.
clf = MultiOutputClassifier(estimator=BernoulliNB())

In [6]:
# Chain the preprocessing step and the classifier into a single estimator.
# The step names are the prefixes used to address hyperparameters during
# tuning (e.g. 'classifier__...').
pipeline_steps = [
    ('preprocess', preprocessing),
    ('classifier', clf),
]
model = Pipeline(steps=pipeline_steps)

### 3.  Define metric function

In [7]:
def cal_exp_prof(y_test, y_pred, b_tp=10, b_tn=0, b_fp=-1, b_fn=-10):
    """Return the expected profit per user, summed over the offer labels.

    Custom scorer for evaluating the classifier. Pre-built scorers such as
    accuracy were not thought to be appropriate because the cost of
    wrongly predicting a user's responsiveness and the benefit of
    correctly predicting it are assumed to be unequal. The expected
    profit follows the methodology outlined in Data Science for Business
    by F. Provost and T. Fawcett.

    For each offer (label column) the expected profit per user is

        p(pos) * (p(TP|pos) * b_tp + p(FN|pos) * b_fn)
        + p(neg) * (p(TN|neg) * b_tn + p(FP|neg) * b_fp)

    which is algebraically equal to

        (TP * b_tp + FN * b_fn + TN * b_tn + FP * b_fp) / n_users

    The second form is the one evaluated here. It never divides by the
    number of positive or negative users, so a cross-validation fold in
    which an offer has no positive (or no negative) users is still
    scored instead of failing.

    Args:
        y_test: Actual responsiveness labels, one row per user and one
            0/1 column per offer (BOGO and discount in this notebook).
        y_pred: Predicted responsiveness labels, same layout as y_test.
        b_tp: Benefit of a true positive.
        b_tn: Benefit of a true negative.
        b_fp: Cost (negative benefit) of a false positive.
        b_fn: Cost (negative benefit) of a false negative.

    Returns:
        The expected profit per user for each offer, added together
        across the offers.
    """
    # Compute the confusion matrices once: one 2x2 matrix per offer column,
    # laid out as [[tn, fp], [fn, tp]].
    confusion_matrices = multilabel_confusion_matrix(y_test, y_pred)

    # Total expected profit per user, accumulated offer by offer
    E_prof = 0.0
    for offer_matrix in confusion_matrices:
        tn, fp, fn, tp = offer_matrix.ravel()
        n_users = tn + fp + fn + tp
        offer_profit = tp * b_tp + fn * b_fn + tn * b_tn + fp * b_fp
        E_prof += offer_profit / n_users

    return E_prof

### 4. Tune classifier and record classifier performance

In [8]:
# Hyperparameter grid for tuning. Each key addresses a parameter of the
# estimator wrapped inside the pipeline's 'classifier' step.
# NOTE(review): the features are one-hot encoded 0/1 values, so binarize
# thresholds of 1 and above look like they would map every feature to 0;
# confirm these candidate values are intended.
parameters = {
    'classifier__estimator__alpha': [1, 2, 3],
    'classifier__estimator__fit_prior': (True, False),
    'classifier__estimator__binarize': [0, 1, 2, 4],
}

In [9]:
# Wrap the expected-profit function as a scorer object usable by
# GridSearchCV and cross-validation; a higher expected profit is better.
class_score = make_scorer(cal_exp_prof, greater_is_better=True)

In [10]:
# Set up a grid search over the pipeline's hyperparameters, scoring each
# candidate with the custom expected-profit scorer under 5-fold
# cross-validation.
mod_tun = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=class_score,
    cv=5,
)

# Run the search on the data; the cross-validation splits provide the
# train and test partitions. The trailing semicolon keeps the notebook
# from echoing the fitted estimator.
mod_tun.fit(X, y);

best_params = mod_tun.best_params_
best_score = mod_tun.best_score_
print('Best parameters are: {}'.format(best_params))
print('Highest expected profit per user: {}'.format(best_score))

Best parameters are: {'classifier__estimator__alpha': 1, 'classifier__estimator__binarize': 0, 'classifier__estimator__fit_prior': True}
Highest expected profit per user: 6.749981633623442


In [11]:
# Append a timestamped record of this run (the search object, its best
# parameters and its best score) to the experiment log.
log_entry_template = '\n \n {}: \n {} \n {} Expected profit per user: {}'
with open('../experimentlog.txt', 'a') as log_file:
    log_entry = log_entry_template.format(
        datetime.now(),
        mod_tun,
        mod_tun.best_params_,
        mod_tun.best_score_,
    )
    log_file.write(log_entry)

### 5. References
1. Provost, F. & Fawcett, T. (2013) Data Science for Business. 1st edition. United States of America, O’Reilly Media, Inc.