In [50]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import preprocessing
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr
import os
%matplotlib inline

In [51]:
os.getcwd()

'C:\\Users\\Sam Cannon\\Desktop\\Python\\Data Sets'

In [52]:
os.chdir('C:\\Users\\Sam Cannon\\Desktop\\Python\\Data Sets')

In [53]:
train = pd.read_csv('AdvWorksCusts.csv')
labels = pd.read_csv('AW_BikeBuyer.csv')
test = pd.read_csv('AW_test.csv')

In [54]:
#look for missing values
train.isna().sum()

CustomerID                  0
Title                   16431
FirstName                   0
MiddleName               6985
LastName                    0
Suffix                  16517
AddressLine1                0
AddressLine2            16243
City                        0
StateProvinceName           0
CountryRegionName           0
PostalCode                  0
PhoneNumber                 0
BirthDate                   0
Education                   0
Occupation                  0
Gender                      0
MaritalStatus               0
HomeOwnerFlag               0
NumberCarsOwned             0
NumberChildrenAtHome        0
TotalChildren               0
YearlyIncome                0
dtype: int64

In [55]:
#just drop all of these columns
train.drop('Title', axis=1, inplace=True)

In [56]:
train.drop('MiddleName', axis=1, inplace=True)

In [57]:
train.drop('Suffix', axis=1, inplace=True)

In [58]:
train.drop('AddressLine2', axis=1, inplace=True)

In [59]:
#check to see if these are gone
train.isna().sum()

CustomerID              0
FirstName               0
LastName                0
AddressLine1            0
City                    0
StateProvinceName       0
CountryRegionName       0
PostalCode              0
PhoneNumber             0
BirthDate               0
Education               0
Occupation              0
Gender                  0
MaritalStatus           0
HomeOwnerFlag           0
NumberCarsOwned         0
NumberChildrenAtHome    0
TotalChildren           0
YearlyIncome            0
dtype: int64

In [60]:
#cool, so our data is "clean"

In [61]:
#recode for gender and marital status to reduce amount of dummy variable columns, these can simply be recoded into binaries
train['gender_recoded'] = train.Gender.map({'M':1, 'F':0})
train['maritalstatus_recoded'] = train.MaritalStatus.map({'M':1, 'S':0})

In [62]:
#dummy code for education, occupation and country
train_dummies = pd.get_dummies(train, columns=[ 'Education', 'Occupation', 'CountryRegionName'], 
                                        prefix=['education', 'occupation', 'country'], drop_first=True)

In [63]:
#look to see if our dummy columns made it
train_dummies.columns

Index(['CustomerID', 'FirstName', 'LastName', 'AddressLine1', 'City',
       'StateProvinceName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Gender',
       'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
       'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome',
       'gender_recoded', 'maritalstatus_recoded', 'education_Graduate Degree',
       'education_High School', 'education_Partial College',
       'education_Partial High School', 'occupation_Management',
       'occupation_Manual', 'occupation_Professional',
       'occupation_Skilled Manual', 'country_Canada', 'country_France',
       'country_Germany', 'country_United Kingdom', 'country_United States'],
      dtype='object')

In [64]:
train_dummies.CustomerID.count()

16519

In [65]:
#define our features (transform into an array)
features = np.array(train_dummies[[
        'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome','TotalChildren'
       ,'YearlyIncome', 'gender_recoded', 'maritalstatus_recoded'
       ,'education_Graduate Degree', 'education_High School', 'education_Partial College', 'education_Partial High School'
       ,'occupation_Management', 'occupation_Manual', 'occupation_Professional','occupation_Skilled Manual'
       ,'country_Canada','country_France', 'country_Germany', 'country_United Kingdom'
       ,'country_United States']])

In [69]:
#define labels (transform into an array)
labels = np.array(labels.BikeBuyer)

In [70]:
#labels are the same shape as our features, good
labels.shape

(16519,)

In [71]:
#scale our features
scale = preprocessing.StandardScaler()
scale.fit(features)
features = scale.transform(features)
features



array([[ 0.69564375, -1.31790504, -0.65479049, ..., -0.32612253,
        -0.34036314, -0.85605071],
       [-1.4375174 , -0.43988635,  1.32540431, ..., -0.32612253,
        -0.34036314, -0.85605071],
       [ 0.69564375, -0.43988635,  1.32540431, ..., -0.32612253,
        -0.34036314, -0.85605071],
       ...,
       [ 0.69564375,  0.43813233, -0.65479049, ..., -0.32612253,
        -0.34036314, -0.85605071],
       [ 0.69564375,  0.43813233, -0.65479049, ..., -0.32612253,
        -0.34036314, -0.85605071],
       [ 0.69564375,  0.43813233, -0.65479049, ..., -0.32612253,
        -0.34036314,  1.16815509]])

In [72]:
#now we are ready to begin creating our boosted decision tree Adaboost, we need to define the cross validation first, 
#inside and outside folds for best hyperparameters and model selection
nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle = True)

In [73]:
## Define the dictionary for the grid search and the model object to search on
param_grid = {"learning_rate": [0.1, 1, 10]}
## Define the AdaBoosted tree model
nr.seed(3456)
ab_clf = AdaBoostClassifier()  

## Perform the grid search over the parameters
nr.seed(4455)
ab_clf = ms.GridSearchCV(estimator = ab_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
ab_clf.fit(features, labels)
print(ab_clf.best_estimator_.learning_rate)

1


In [74]:
#running the outer cross validation of the model
nr.seed(498)
cv_estimate = ms.cross_val_score(ab_clf, features, labels, 
                                 cv = outside) # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean performance metric = 0.835
SDT of the metric       = 0.006
Outcomes by cv fold
Fold  1    0.827
Fold  2    0.842
Fold  3    0.835
Fold  4    0.838
Fold  5    0.834
Fold  6    0.833
Fold  7    0.840
Fold  8    0.834
Fold  9    0.824
Fold 10    0.842


In [None]:
#results look really promising for this model

In [75]:
## Randomly sample cases to create independent training and test data, this is so we can test the model
nr.seed(1115)
indx = range(features.shape[0])
indx = ms.train_test_split(indx, test_size = .3)
X_train = features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
X_test = features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [76]:
#define the boosting model using the optimal hyperparameters
nr.seed(1115)
ab_mod = AdaBoostClassifier(learning_rate = ab_clf.best_estimator_.learning_rate) 
ab_mod.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)

In [77]:
#we can see the hyperparameters match the optimal ones found

In [78]:
#lets get a confusion matrix for the model and score it now
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = ab_mod.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5) 

                 Confusion matrix
                 Score positive    Score negative
Actual positive      2927               399
Actual negative       678               952

Accuracy        0.78
AUC             0.83
Macro precision 0.76
Macro recall    0.73
 
           Positive      Negative
Num case     3326          1630
Precision    0.81          0.70
Recall       0.88          0.58
F1           0.84          0.64


In [79]:
#now lets import the test set and clean, dummify, and scale it so we can predict new values
test.drop('Title', axis=1, inplace=True)

In [80]:
test.drop('MiddleName', axis=1, inplace=True)

In [81]:
test.drop('Suffix', axis=1, inplace=True)

In [82]:
test.drop('AddressLine2', axis=1, inplace=True)

In [83]:
#recode for gender and marital status
test['gender_recoded'] = test.Gender.map({'M':1, 'F':0})
test['maritalstatus_recoded'] = test.MaritalStatus.map({'M':1, 'S':0})

In [84]:
#dummy code for education, occupation and country
test_dummies = pd.get_dummies(test, columns=[ 'Education', 'Occupation', 'CountryRegionName'], 
                                        prefix=['education', 'occupation', 'country'], drop_first=True)

In [85]:
#define our features
features2 = np.array(test_dummies[[
        'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome','TotalChildren'
       ,'YearlyIncome', 'gender_recoded', 'maritalstatus_recoded'
       ,'education_Graduate Degree', 'education_High School', 'education_Partial College', 'education_Partial High School'
       ,'occupation_Management', 'occupation_Manual', 'occupation_Professional','occupation_Skilled Manual'
       ,'country_Canada','country_France', 'country_Germany', 'country_United Kingdom'
       ,'country_United States']])

In [86]:
#scale our features
scale = preprocessing.StandardScaler()
scale.fit(features2)
features2 = scale.transform(features2)
features2



array([[-1.41209489,  0.35928859, -0.72107754, ..., -0.32961713,
        -0.31063037,  1.1055416 ],
       [ 0.70816771,  0.35928859,  0.53297036, ..., -0.32961713,
        -0.31063037, -0.90453403],
       [ 0.70816771,  0.35928859, -0.72107754, ..., -0.32961713,
        -0.31063037,  1.1055416 ],
       ...,
       [-1.41209489,  2.12050718,  1.15999431, ..., -0.32961713,
        -0.31063037,  1.1055416 ],
       [ 0.70816771, -0.5213207 , -0.72107754, ..., -0.32961713,
        -0.31063037,  1.1055416 ],
       [-1.41209489,  0.35928859, -0.72107754, ..., -0.32961713,
        -0.31063037, -0.90453403]])

In [87]:
boosted_predictions = ab_mod.predict(features2)

In [88]:
boosted_predictions

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,

In [None]:
#sweet! write this to a csv and lets go
boosted_predictionsdf = pd.DataFrame(data=boosted_predictions)

In [None]:
boosted_predictionsdf.count()

In [None]:
boosted_predictionsdf.reset_index()


In [None]:
cid = pd.DataFrame(data=test['CustomerID'])
cid.reset_index()

In [None]:
boosted_predictions_with_cid = pd.merge(cid, boosted_predictionsdf, left_index=True, right_index=True)

In [None]:
boosted_predictions_with_cid

In [None]:
boosted_predictions_with_cid.to_csv('boosted_predictions_with_cid.csv')