In [61]:
import pandas as pd
import numpy as np
import json

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier

from modules.binomial import BinomialExperiment #My own class for evaluating binomial split tests
import modules.cleaning as clean
import modules.exploring as ex

import matplotlib.pyplot as plt
import matplotlib.ticker as tck

# read in the json files
offers = pd.read_json('data/portfolio.json', orient='records', lines=True)
users = pd.read_json('data/profile.json', orient='records', lines=True)
transactions = pd.read_json('data/transcript.json', orient='records', lines=True)

# Purpose

Other notebook uses K Means clustering followed by segmentation analysis to come up with groups of Starbucks users most likely to respond to each promotional type.

This time, I want to try classifying users individually based on which offer they're most likely to respond to. As new users come it, SBUX could use the model to determine which offer to send a given user.

# Method

Clean data sets same as last time (using modules.cleaning and modules.exploring).

Dummy-encode users_clean, this time (was one-hot for clustering). Since I'm classifying, I'll dummy-encode to avoid multicollinearity.

Except, when I get transactions_clean, I'll turn it into a multi-outcome array containing a binary variable for each offer (1 if responded to first contact with offer, 0 if not).

# Clean Data

In [2]:
users_clean = clean.clean_users(users)
transactions_clean = clean.clean_transactions(transactions, offers, users_clean)

In [3]:
users_clean.head()

Unnamed: 0,gender,id,income_segment,age_segment
0,Unknown,68be06ca386d4c31939f3a4f0e3dd783,Unknown,Unknown
1,F,0610b486422d4921ae7d2bf64640c50b,Over 80k,41-60
2,Unknown,38fe809add3b4fcf9315a9694bb96ff5,Unknown,Unknown
3,F,78afa995795e4d85b5d9ceeca43f5fef,Over 80k,61-80
4,Unknown,a03223e636434f42ac4c3df47e8bac43,Unknown,Unknown


In [4]:
transactions_clean.head()

Unnamed: 0,person,offer_id,offer_type,offer_response
0,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,informational,0
1,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,informational,0
2,00116118485d4dfda04fdbaba9a87b5c,f19421c1d4aa40978ebb69ca19b0e20d,bogo,0
3,0020c2b971eb4e9188eac86d93036a77,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,1
4,0020c2b971eb4e9188eac86d93036a77,5a8bc65990b245e5a138643cd4eb9837,informational,0


In [5]:
# Get dummies from users_clean
users_dummies = pd.get_dummies(users_clean, columns = ['gender','income_segment','age_segment'], drop_first = True)
users_dummies.head()

Unnamed: 0,id,gender_M,gender_Unknown,income_segment_Under 50k,income_segment_50k-80k,income_segment_Over 80k,age_segment_25-40,age_segment_41-60,age_segment_61-80,age_segment_81 and Over,age_segment_Unknown
0,68be06ca386d4c31939f3a4f0e3dd783,0,1,0,0,0,0,0,0,0,1
1,0610b486422d4921ae7d2bf64640c50b,0,0,0,0,1,0,1,0,0,0
2,38fe809add3b4fcf9315a9694bb96ff5,0,1,0,0,0,0,0,0,0,1
3,78afa995795e4d85b5d9ceeca43f5fef,0,0,0,0,1,0,0,1,0,0
4,a03223e636434f42ac4c3df47e8bac43,0,1,0,0,0,0,0,0,0,1


In [27]:
users_dummies.shape

(16477, 11)

In [22]:
# Turn transactions into multi-output array
transactions_output = transactions_clean.pivot_table(index = 'person',columns = 'offer_id', values = 'offer_response', fill_value = 0).reset_index(drop = False)
# See overall outcome rates for each promotion
# All low. Clear need for intelligent user assignment to promotions
transactions_output.mean() * 100

offer_id
0b1e1539f2cc45b7b9fa7c272da2e1d7     6.839837
2298d6c36e964ae4a3e7e9706d1fb8c2    21.976088
2906b810c7d4411798c6938adc9daaa5    10.493415
3f207df678b143eea3cee63160fa8bed     0.000000
4d5c57ea9a6940dd891ad53e9dbe8da0    13.685744
5a8bc65990b245e5a138643cd4eb9837     0.000000
9b98b8c7a33c4b65b9aebfe6a799e6d9    10.578382
ae264e3637204a6fb9bb56bc8210ddfd    13.115252
f19421c1d4aa40978ebb69ca19b0e20d    17.436427
fafdcd668e3743c1bb461111dcafc2a4    23.632943
dtype: float64

In [23]:
transactions_output.head()

offer_id,person,0b1e1539f2cc45b7b9fa7c272da2e1d7,2298d6c36e964ae4a3e7e9706d1fb8c2,2906b810c7d4411798c6938adc9daaa5,3f207df678b143eea3cee63160fa8bed,4d5c57ea9a6940dd891ad53e9dbe8da0,5a8bc65990b245e5a138643cd4eb9837,9b98b8c7a33c4b65b9aebfe6a799e6d9,ae264e3637204a6fb9bb56bc8210ddfd,f19421c1d4aa40978ebb69ca19b0e20d,fafdcd668e3743c1bb461111dcafc2a4
0,0009655768c64bdeb2e877511632db8f,0,0,0,0,0,0,0,0,0,0
1,00116118485d4dfda04fdbaba9a87b5c,0,0,0,0,0,0,0,0,0,0
2,0020c2b971eb4e9188eac86d93036a77,0,0,0,0,1,0,0,0,0,1
3,0020ccbbb6d84e358d3414a3ff76cffd,0,1,0,0,0,0,1,0,1,0
4,003d66b6608740288d6cc97a6903f4f0,0,0,0,0,0,0,0,0,0,1


In [26]:
transactions_output.shape

(16477, 11)

In [25]:
# Filter user data set for IDs appearing in transactions_output 
# (300 extra people in users, probably those who only completed transaction event)
users_dummies = users_dummies.loc[users_dummies.id.isin(transactions_output.person)]

In [31]:
# Sort both users_dummies and transactions_output by id/person
users_dummies = users_dummies.sort_values(by = 'id').reset_index(drop = True)
transactions_output = transactions_output.sort_values(by = 'person').reset_index(drop = True)

In [32]:
users_dummies.head()

Unnamed: 0,id,gender_M,gender_Unknown,income_segment_Under 50k,income_segment_50k-80k,income_segment_Over 80k,age_segment_25-40,age_segment_41-60,age_segment_61-80,age_segment_81 and Over,age_segment_Unknown
0,0009655768c64bdeb2e877511632db8f,1,0,0,1,0,1,0,0,0,0
1,00116118485d4dfda04fdbaba9a87b5c,0,1,0,0,0,0,0,0,0,1
2,0020c2b971eb4e9188eac86d93036a77,0,0,0,0,1,0,1,0,0,0
3,0020ccbbb6d84e358d3414a3ff76cffd,0,0,0,1,0,0,0,0,0,0
4,003d66b6608740288d6cc97a6903f4f0,0,0,0,1,0,1,0,0,0,0


In [33]:
transactions_output.head()

offer_id,person,0b1e1539f2cc45b7b9fa7c272da2e1d7,2298d6c36e964ae4a3e7e9706d1fb8c2,2906b810c7d4411798c6938adc9daaa5,3f207df678b143eea3cee63160fa8bed,4d5c57ea9a6940dd891ad53e9dbe8da0,5a8bc65990b245e5a138643cd4eb9837,9b98b8c7a33c4b65b9aebfe6a799e6d9,ae264e3637204a6fb9bb56bc8210ddfd,f19421c1d4aa40978ebb69ca19b0e20d,fafdcd668e3743c1bb461111dcafc2a4
0,0009655768c64bdeb2e877511632db8f,0,0,0,0,0,0,0,0,0,0
1,00116118485d4dfda04fdbaba9a87b5c,0,0,0,0,0,0,0,0,0,0
2,0020c2b971eb4e9188eac86d93036a77,0,0,0,0,1,0,0,0,0,1
3,0020ccbbb6d84e358d3414a3ff76cffd,0,1,0,0,0,0,1,0,1,0
4,003d66b6608740288d6cc97a6903f4f0,0,0,0,0,0,0,0,0,0,1


# Train Model

In [53]:
# Generate X and y arrays
labels = transactions_output.drop(columns = 'person').columns
X = np.asarray(users_dummies.drop(columns = 'id'))
y = np.asarray(transactions_output.drop(columns = 'person'))

In [54]:
labels

Index(['0b1e1539f2cc45b7b9fa7c272da2e1d7', '2298d6c36e964ae4a3e7e9706d1fb8c2',
       '2906b810c7d4411798c6938adc9daaa5', '3f207df678b143eea3cee63160fa8bed',
       '4d5c57ea9a6940dd891ad53e9dbe8da0', '5a8bc65990b245e5a138643cd4eb9837',
       '9b98b8c7a33c4b65b9aebfe6a799e6d9', 'ae264e3637204a6fb9bb56bc8210ddfd',
       'f19421c1d4aa40978ebb69ca19b0e20d', 'fafdcd668e3743c1bb461111dcafc2a4'],
      dtype='object', name='offer_id')

In [49]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [84]:
y_test[:,9].sum()

1171

In [50]:
# Set up the multioutput classifier model we will try and optimize with GridSearchCV
forest = RandomForestClassifier(n_estimators = 100,
                               min_samples_split = 2,
                               min_samples_leaf = 1,
                               max_features = 'auto')

model = MultiOutputClassifier(estimator = forest)

In [64]:
# Run the model with estimator defaults
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [65]:
# Get accuracy for default estimator run
(y_pred == y_test).mean()

0.8826051779935276

**Verdict**: High accuracy, but only because response rates are low for every offer (lots of 0s). Looking at confusion matrices below show that true positive rates are 0% for all offers.

In [68]:
# Print confusion matrices for each offer id
for i in range(len(labels)):
    print('|| ' + labels[i] + ' ||')
    print(confusion_matrix(y_test[:,i], y_pred[:,i]))
    print('\n')

|| 0b1e1539f2cc45b7b9fa7c272da2e1d7 ||
[[4610    0]
 [ 334    0]]


|| 2298d6c36e964ae4a3e7e9706d1fb8c2 ||
[[3885    0]
 [1059    0]]


|| 2906b810c7d4411798c6938adc9daaa5 ||
[[4434    0]
 [ 510    0]]


|| 3f207df678b143eea3cee63160fa8bed ||
[[4944]]


|| 4d5c57ea9a6940dd891ad53e9dbe8da0 ||
[[4250    0]
 [ 694    0]]


|| 5a8bc65990b245e5a138643cd4eb9837 ||
[[4944]]


|| 9b98b8c7a33c4b65b9aebfe6a799e6d9 ||
[[4421    0]
 [ 523    0]]


|| ae264e3637204a6fb9bb56bc8210ddfd ||
[[4309    0]
 [ 635    0]]


|| f19421c1d4aa40978ebb69ca19b0e20d ||
[[4064    2]
 [ 876    2]]


|| fafdcd668e3743c1bb461111dcafc2a4 ||
[[3773    0]
 [1171    0]]


