### Classifier notebook:
 - Development of multioutput classification model that uses expected profit per user as metric. 
 - This notebook will be cleaned up and separated into multiple notebooks so that it will be easier to understand.

In [16]:
# Importing libraries, CLEAN THIS 
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline
from sklearn.metrics import multilabel_confusion_matrix, make_scorer

from imblearn.over_sampling import RandomOverSampler 
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler


sns.set_theme(style='darkgrid')
# read in the json files
portfolio = pd.read_json('../data/raw/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../data/raw/profile.json', orient='records', lines=True)
transcript = pd.read_json('../data/raw/transcript.json', orient='records', lines=True)

### Preparing dataset to be used to train classifier

In [2]:
# Get offer ids from 'value' column, convert to float, and store in new column
offer_ids = dict()
indx = list(transcript[transcript['event']!='transaction'].index)

for ind in indx: 
    offer_id = list(transcript.iloc[ind]['value'].values())[0]
    offer_ids.update({ind:offer_id})
    
# Make dataframe from dictionary of index, offer_id strings     
offer_id_df = pd.DataFrame.from_dict(offer_ids, orient='index', columns=['offer_ids'])

# Concat transcript_mod and offer_id_df dataframes
transcript_mod = pd.concat([transcript, offer_id_df], axis=1, ignore_index=False)

# rename column 'id' as offer_ids to remain consistent with transcript df
portfolio = portfolio.rename(columns={'id':'offer_ids'})

# merge transcript and portfolio dataframes
transcript_portfolio = transcript_mod.merge(portfolio[['offer_ids', 'offer_type']], on='offer_ids', how='left')

offers = ['bogo', 'discount']
# filter transcript_portfolio to get transcripts corresponding to BOGO offers 
offer = transcript_portfolio[transcript_portfolio['offer_type'].isin(offers)]

In [3]:
offer.drop(['value','time', 'offer_ids'], axis=1, inplace=True)

offer_per_person = offer.groupby(['person','offer_type'])['event'].value_counts().unstack()

offer_per_person['offer completed'].fillna(0, inplace=True)

offer_per_person['completed_per_view'] = offer_per_person['offer completed']/ offer_per_person['offer viewed']

offer_per_person['completed_per_view'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Computing responsiveness label for each user

In [4]:
def setting_response(cpv):
    if cpv <= 0.5: 
        return 0 
    else: 
        return 1
    
offer_per_person['responds'] = offer_per_person['completed_per_view'].apply(lambda x:setting_response(x))

In [5]:
offer_df = offer_per_person.unstack()
offer_df = offer_df['responds']
offer_df.dropna(inplace=True)
offer_df = offer_df.reset_index()

### Feature engineering

In [6]:
# rename column 'id' to 'person' to remain consistent with transcript
profile = profile.rename(columns={'id':'person'})
# Remove people was ages > 99
profile = profile[profile['age'] <= 99]

# Remove people with income > 1000000
profile = profile[profile['income'] < 1000000.0]

# Binarise age based on criteria old=1, young=0
def age_group(age):
    if age < 25:
        return '< 25 years'
    
    if age < 35:
        return '25 - 35 years'
    
    if age < 45:
        return '36 - 45 years'
    
    if age < 66: 
        return '46 - 66 years'
    
    else: 
        return '67+ years'

profile['age_group'] = profile['age'].apply(lambda x:age_group(x))

# Binarise income based on criteria rich=1, poor=0
def income_group(income):
    if income < 50001: 
        return 'low income'
    if income < 70001:
        return 'med - low income'
    if income < 90001:
        return 'high - med income'
    else:
        return 'high income'

profile['income_group'] = profile['income'].apply(lambda x:income_group(x))

# select columns needed for training model
profile_subset = profile[['person','income_group','age_group', 'gender']]

# reset index to allow ease concatanation with transformed ohe data
profile_subset = profile_subset.reset_index()

profile_subset.drop('index', axis=1, inplace=True)

In [7]:
profile['income'].isna().sum()

0

In [8]:

offer_per_person_demo = offer_df.merge(profile_subset, left_on='person',right_on='person', how='left').set_index('person')

In [9]:
X = offer_per_person_demo.drop(['bogo', 'discount'], axis=1)
y = offer_per_person_demo[['bogo', 'discount']]

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13009 entries, 0009655768c64bdeb2e877511632db8f to ffff82501cea40309d5fdd7edcca4a07
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   income_group  11349 non-null  object
 1   age_group     11349 non-null  object
 2   gender        11349 non-null  object
dtypes: object(3)
memory usage: 406.5+ KB


### Implementing classifier

In [10]:
# intantiate OHE
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
    
# List categorical variables in to be OHE
categorical_columns = ['income_group','age_group','gender']

preprocessing = ColumnTransformer(
        [
            ('cat', categorical_encoder, categorical_columns),
        ]
    )

In [11]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import multilabel_confusion_matrix


clf = MultiOutputClassifier(RandomForestClassifier())

pipeline = Pipeline(
    [
        ('preprocess', preprocessing),
        ('classifier', clf),
    ]
    
)

### Evaluating classifier using cross validation

In [12]:
def exp_prof_per_cust(y_test, y_pred):
    
    '''
    This function returns the expected profit per user, 
    which is used as a the evaluation metric of the classifier.
    '''
    
    # Defining benefits associated with true positive, true negatives
    b_tp = 10
    b_tn = 0

    # Defining costs (or negative benefits) associated with false postives, false negatives
    b_fp = -1
    b_fn = -10 

    # Calculating probabilities needed to compute expected profit 

    p_bogo = y_test['bogo'].value_counts().sort_index(ascending=True)[1]
    n_bogo = y_test['bogo'].value_counts().sort_index(ascending=True)[0]

    p_disc = y_test['discount'].value_counts().sort_index(ascending=True)[1]
    n_disc = y_test['discount'].value_counts().sort_index(ascending=True)[0]

    prob_p_bogo = p_bogo/(p_bogo + n_bogo)
    prob_n_bogo = n_bogo/(p_bogo + n_bogo) 

    prob_p_disc = p_disc/(p_disc + n_disc)
    prob_n_disc = n_disc/(p_disc + n_disc)

    # Extracting values from confusion matrix 

    tn_bogo, fp_bogo, fn_bogo, tp_bogo = multilabel_confusion_matrix(y_test, y_pred)[0].ravel()
    tn_disc, fp_disc, fn_disc, tp_disc = multilabel_confusion_matrix(y_test, y_pred)[1].ravel()
     
    p_tp_bogo = tp_bogo/p_bogo
    p_tn_bogo = tn_bogo/n_bogo 
    
    p_tp_disc = tp_disc/p_disc
    p_tn_disc = tn_disc/n_disc 

    p_fp_bogo = fp_bogo/n_bogo
    p_fn_bogo = fn_bogo/p_bogo

    p_fp_disc = fp_disc/n_disc
    p_fn_disc = fn_disc/p_disc
    

     # Expected profit
    E_prof_bogo = (prob_p_bogo * (p_tp_bogo * b_tp + p_fn_bogo * b_fn)) + (prob_n_bogo * (p_tn_bogo * b_tn + p_fp_bogo * b_fp))
                                                    
    E_prof_disc = (prob_p_disc * (p_tp_disc * b_tp + p_fn_disc * b_fn)) + (prob_n_disc * (p_tn_disc * b_tn + p_fp_disc * b_fp))

   
    E_prof = E_prof_bogo + E_prof_disc
    #E_prof = E_prof_bogo

    
    return E_prof

In [18]:
# use crossval to eval performance of model before hypertuning
from sklearn.model_selection import cross_validate

class_score = make_scorer(exp_prof_per_cust)

cv_results = cross_validate(pipeline, X, y, cv = 5, scoring = class_score)

In [19]:
cv_results['test_score'].mean()

4.330880749761887

### Tuning classifier

In [18]:
from sklearn.metrics import multilabel_confusion_matrix, make_scorer

from sklearn.model_selection import cross_validate

from sklearn.model_selection import GridSearchCV

class_score = make_scorer(exp_prof_per_cust)


parameters = {'classifier__estimator__n_estimators': [50, 100, 150], 'classifier__estimator__criterion':('gini', 'entropy')}

rs = GridSearchCV(pipeline, param_grid=parameters, cv=3, scoring=class_score)
rs.fit(X,y)

rs.best_params_

{'classifier__estimator__criterion': 'entropy',
 'classifier__estimator__n_estimators': 50}

In [19]:
rs.cv_results_

{'mean_fit_time': array([0.54882987, 1.12302828, 1.76385474, 0.58643174, 1.04682239,
        1.55955871]),
 'std_fit_time': array([0.01515783, 0.0354294 , 0.01790023, 0.0736628 , 0.00938373,
        0.02348126]),
 'mean_score_time': array([0.05415042, 0.11164077, 0.16065399, 0.05494531, 0.09629631,
        0.13540061]),
 'std_score_time': array([0.00045534, 0.0248401 , 0.01871889, 0.00122575, 0.00311176,
        0.00237964]),
 'param_classifier__estimator__criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__estimator__n_estimators': masked_array(data=[50, 100, 150, 50, 100, 150],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__estimator__criterion': 'gini',
   'classifier__estimator__n_estimators': 50},
  