In [65]:
import pandas as pd
import numpy as np
import json

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB

from modules.binomial import BinomialExperiment #My own class for evaluating binomial split tests
import modules.cleaning as clean
import modules.exploring as ex
from modules.classify import score_multioutput

import matplotlib.pyplot as plt
import matplotlib.ticker as tck

# read in the json files
offers = pd.read_json('data/portfolio.json', orient='records', lines=True)
users = pd.read_json('data/profile.json', orient='records', lines=True)
transactions = pd.read_json('data/transcript.json', orient='records', lines=True)

# Purpose

Other notebook uses K Means clustering followed by segmentation analysis to come up with groups of Starbucks users most likely to respond to each promotional type.

This time, I want to try classifying users individually based on which offer they're most likely to respond to. As new users come it, SBUX could use the model to determine which offer to send a given user.

# Method

Clean data sets same as last time (using modules.cleaning and modules.exploring).

Dummy-encode users_clean, this time (was one-hot for clustering). Since I'm classifying, I'll dummy-encode to avoid multicollinearity.

Except, when I get transactions_clean, I'll turn it into a multi-outcome array containing a binary variable for each offer (1 if responded to first contact with offer, 0 if not).

# Strategy

First tried multioutput classification: categorical demo predictors and multi-outcome array of all offers.

Problem with that approach is first receipt response rate of individual offers is super low. Like 20% is typical. So algo gets 90% accuracy easy for the dumb reason: most of training and test sets are non-response 0. Easy to get high accuracy when the dataset is so imbalanced.

Second try will see if outcome rate to *any* offers of an *offer type* have high response rates. Instead of 9-output target array, will get a 2-output target array (discount and BOGO).

# Clean Data

In [2]:
users_clean = clean.clean_users(users)
transactions_clean = clean.clean_transactions(transactions, offers, users_clean)

In [3]:
users_clean.head()

Unnamed: 0,gender,id,income_segment,age_segment
0,Unknown,68be06ca386d4c31939f3a4f0e3dd783,Unknown,Unknown
1,F,0610b486422d4921ae7d2bf64640c50b,Over 80k,41-60
2,Unknown,38fe809add3b4fcf9315a9694bb96ff5,Unknown,Unknown
3,F,78afa995795e4d85b5d9ceeca43f5fef,Over 80k,61-80
4,Unknown,a03223e636434f42ac4c3df47e8bac43,Unknown,Unknown


In [4]:
transactions_clean.head()

Unnamed: 0,person,offer_id,offer_type,offer_response
0,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,informational,0
1,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,informational,0
2,00116118485d4dfda04fdbaba9a87b5c,f19421c1d4aa40978ebb69ca19b0e20d,bogo,0
3,0020c2b971eb4e9188eac86d93036a77,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,1
4,0020c2b971eb4e9188eac86d93036a77,5a8bc65990b245e5a138643cd4eb9837,informational,0


In [5]:
# Get dummies from users_clean
users_dummies = pd.get_dummies(users_clean, columns = ['gender','income_segment','age_segment'], drop_first = True)
users_dummies.head()

Unnamed: 0,id,gender_M,gender_Unknown,income_segment_Under 50k,income_segment_50k-80k,income_segment_Over 80k,age_segment_25-40,age_segment_41-60,age_segment_61-80,age_segment_81 and Over,age_segment_Unknown
0,68be06ca386d4c31939f3a4f0e3dd783,0,1,0,0,0,0,0,0,0,1
1,0610b486422d4921ae7d2bf64640c50b,0,0,0,0,1,0,1,0,0,0
2,38fe809add3b4fcf9315a9694bb96ff5,0,1,0,0,0,0,0,0,0,1
3,78afa995795e4d85b5d9ceeca43f5fef,0,0,0,0,1,0,0,1,0,0
4,a03223e636434f42ac4c3df47e8bac43,0,1,0,0,0,0,0,0,0,1


In [6]:
users_dummies.shape

(16788, 11)

In [7]:
# Group transactions by person and offer_type, agg by max
# Will give me response rate to ANY offer of a type per person
# Should give me a more balanced dataset (response rates closer to 50% than 20% I got checking individual offer response rates)
transactions_output = transactions_clean.groupby(['person','offer_type'], as_index = False)['offer_response'].agg('max').pivot_table(index = 'person', columns = 'offer_type', values = 'offer_response', fill_value = 0).reset_index(drop = False)
transactions_output.drop(columns = 'informational', inplace = True)
transactions_output.mean()

offer_type
bogo        0.418462
discount    0.488863
dtype: float64

**Notice:** Aggregated response rates are 41% and 48%. WAY more balanced data set than outcome rates by individual offer. Maybe algo will train better, this way.

Response rates here are number of people who responded to first receipt of *any* offer of the listed type.

In [8]:
transactions_output.head()

offer_type,person,bogo,discount
0,0009655768c64bdeb2e877511632db8f,0,0
1,00116118485d4dfda04fdbaba9a87b5c,0,0
2,0020c2b971eb4e9188eac86d93036a77,1,1
3,0020ccbbb6d84e358d3414a3ff76cffd,1,1
4,003d66b6608740288d6cc97a6903f4f0,0,1


In [9]:
transactions_output.shape

(16477, 3)

In [10]:
# Filter user data set for IDs appearing in transactions_output 
# (300 extra people in users, probably those who only completed transaction event)
users_dummies = users_dummies.loc[users_dummies.id.isin(transactions_output.person)]

In [11]:
# Sort both users_dummies and transactions_output by id/person
users_dummies = users_dummies.sort_values(by = 'id').reset_index(drop = True)
transactions_output = transactions_output.sort_values(by = 'person').reset_index(drop = True)

In [12]:
users_dummies.head()

Unnamed: 0,id,gender_M,gender_Unknown,income_segment_Under 50k,income_segment_50k-80k,income_segment_Over 80k,age_segment_25-40,age_segment_41-60,age_segment_61-80,age_segment_81 and Over,age_segment_Unknown
0,0009655768c64bdeb2e877511632db8f,1,0,0,1,0,1,0,0,0,0
1,00116118485d4dfda04fdbaba9a87b5c,0,1,0,0,0,0,0,0,0,1
2,0020c2b971eb4e9188eac86d93036a77,0,0,0,0,1,0,1,0,0,0
3,0020ccbbb6d84e358d3414a3ff76cffd,0,0,0,1,0,0,0,0,0,0
4,003d66b6608740288d6cc97a6903f4f0,0,0,0,1,0,1,0,0,0,0


In [13]:
transactions_output.head()

offer_type,person,bogo,discount
0,0009655768c64bdeb2e877511632db8f,0,0
1,00116118485d4dfda04fdbaba9a87b5c,0,0
2,0020c2b971eb4e9188eac86d93036a77,1,1
3,0020ccbbb6d84e358d3414a3ff76cffd,1,1
4,003d66b6608740288d6cc97a6903f4f0,0,1


# Train Model

In [14]:
# Generate X and y arrays
labels = transactions_output.drop(columns = 'person').columns
X = np.asarray(users_dummies.drop(columns = 'id'))
y = np.asarray(transactions_output.drop(columns = 'person'))

In [15]:
labels

Index(['bogo', 'discount'], dtype='object', name='offer_type')

In [16]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [78]:
# Set up the multioutput classifier model we will try and optimize with GridSearchCV
best_forest = RandomForestClassifier(n_estimators = 20,
                                max_depth = 50,
                                min_samples_split = 0.1,
                                min_samples_leaf = 0.1,
                                bootstrap = False,
                                max_features = 'auto')

best_bayes = MultinomialNB(alpha = 0,
                     fit_prior = True)

bayes_params = {'estimator__alpha':[0,1,2,10,20,50,100],
              'estimator__fit_prior':[True, False]}

forest_params = {'estimator__n_estimators':[10,20,30,40,50,75,100],
                'estimator__max_depth':[None, 5, 10, 50, 75, 100],
                'estimator__min_samples_split':[0.1,0.2,0.3,0.5],
                'estimator__min_samples_leaf':[0.1,0.2,0.3,0.5],
                'estimator__bootstrap':[True, False],
                'estimator__max_features':['auto','sqrt','log2']}

model_params = {'estimator': [best_bayes, best_forest]}

model = MultiOutputClassifier(estimator = best_bayes)
# Optimize for precision. We want to make sure we can take all of our trues seriously.
# Because we want to help SBUX minimize wasted coverage
# Media mix includes email (dirt cheap), but also social and other ad media that can be expensive CPM
# Better to miss some real trues than to target wastefully some fake trues by mistake
clf = GridSearchCV(estimator = model, param_grid = model_params, verbose = 2, scoring = 'accuracy')

In [79]:
# Run the model with estimator defaults
clf.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END ...................estimator=MultinomialNB(alpha=0); total time=   0.0s
[CV] END ...................estimator=MultinomialNB(alpha=0); total time=   0.0s
[CV] END ...................estimator=MultinomialNB(alpha=0); total time=   0.0s
[CV] END ...................estimator=MultinomialNB(alpha=0); total time=   0.0s
[CV] END ...................estimator=MultinomialNB(alpha=0); total time=   0.0s
[CV] END estimator=RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=0.1,
                       min_samples_split=0.1, n_estimators=20); total time=   0.0s
[CV] END estimator=RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=0.1,
                       min_samples_split=0.1, n_estimators=20); total time=   0.0s




[CV] END estimator=RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=0.1,
                       min_samples_split=0.1, n_estimators=20); total time=   0.0s
[CV] END estimator=RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=0.1,
                       min_samples_split=0.1, n_estimators=20); total time=   0.0s
[CV] END estimator=RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=0.1,
                       min_samples_split=0.1, n_estimators=20); total time=   0.0s




GridSearchCV(estimator=MultiOutputClassifier(estimator=MultinomialNB(alpha=0)),
             param_grid={'estimator': [MultinomialNB(alpha=0),
                                       RandomForestClassifier(bootstrap=False,
                                                              max_depth=50,
                                                              min_samples_leaf=0.1,
                                                              min_samples_split=0.1,
                                                              n_estimators=20)]},
             scoring='accuracy', verbose=2)

In [80]:
clf.best_estimator_

MultiOutputClassifier(estimator=MultinomialNB(alpha=0))

In [81]:
clf.best_score_

0.3916585180815881

In [82]:
clf.best_params_

{'estimator': MultinomialNB(alpha=0)}

In [83]:
# Predict y_test using X_test
y_pred = clf.predict(X_test)

In [84]:
# Get accuracy for default estimator run
(y_pred == y_test).mean()

0.602953074433657

**Verdict**: Decent accuracy considering data set balance (41/59 and 48/52). Confusion matrices also look better. True positive rate is non-zero. Nice.

Let's get precision and recall.

**Precision:** True Positives / Total Positives
- How many of my positive predictions were true?

**Recall:** True Positive / (True Pos + False Neg)
- How many of the actual positives did I predict?

**Confusion Matrix:**

TrueNeg, FalsePos

FalseNeg, TruePos

In [50]:
# Score the model
score_multioutput(y_pred, y_test, labels)

{'bogo': {'precision': 0.5626410835214447, 'recall': 0.4718409843823947},
 'discount': {'precision': 0.5703448275862069, 'recall': 0.6854537919602155}}

In [85]:
score_multioutput(y_pred, y_test, labels)

{'bogo': {'precision': 0.5410447761194029, 'recall': 0.6862281116895409},
 'discount': {'precision': 0.5603557814485387, 'recall': 0.7310401989225032}}