In [54]:
# Imports

import os
import math
import random
import operator as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math, itertools
import statistics
import json
import hdbscan

from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn import tree, metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics.pairwise import euclidean_distances
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score
from collections import Counter
from matplotlib.ticker import StrMethodFormatter

from scipy.spatial import ConvexHull

def load_file(data_file_path):
    data_df = pd.read_csv(data_file_path, delimiter=";")
    return data_df

In [55]:
# Pull and filter all calls <= 20.
max_calls = 32
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls)]
mkt_df_filtered = mkt_df_filtered[['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance', 'campaign', 'y']]
print(mkt_df_filtered.shape)

(45173, 10)


In [56]:
# We use the LabelEncoder from SkLearn to transform the following features.
features_to_transform = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'y']
for feature in features_to_transform:
    le = LabelEncoder()
    le.fit(mkt_df_filtered[feature])
    mkt_df_filtered[feature] = le.transform(mkt_df_filtered[feature])

# For obtaining the best hyper-parameters.
print(mkt_df_filtered.head(10))
y = mkt_df_filtered['campaign']
X = mkt_df_filtered.drop(columns=['campaign'])

# Double checking .
kf = KFold(n_splits=5, shuffle=True)
fold_data = []

for train_index, test_index in kf.split(mkt_df_filtered):
    fold_data.append((train_index, test_index))

   job  marital  education  default  housing  loan  age  balance  campaign  y
0    4        1          2        0        1     0   58     2143         1  0
1    9        2          1        0        1     0   44       29         1  0
2    2        1          1        0        1     1   33        2         1  0
3    1        1          3        0        1     0   47     1506         1  0
4   11        2          3        0        0     0   33        1         1  0
5    4        1          2        0        1     0   35      231         1  0
6    4        2          2        0        1     1   28      447         1  0
7    2        0          2        1        1     0   42        2         1  0
8    5        1          0        0        1     0   58      121         1  0
9    9        2          1        0        1     0   43      593         1  0


In [47]:
# Decisoon Tree

for tt_split in fold_data:
    
    train_index = tt_split[0]
    test_index = tt_split[1]
    
    # Pull data for train test split.
    train_df = mkt_df_filtered.iloc[train_index]
    test_df = mkt_df_filtered.iloc[test_index]
    
    # Split train and test dataframes into input and output.
    train_y = train_df['y']
    train_x = train_df.drop(columns=['y'])
    test_y = test_df['y']
    test_x = test_df.drop(columns=['y'])
    
    # max_depths = [i for i in range(1, 21)]
    # Thus far, max depths = 20 gives best results.
    
    mids = [0.5, 1, 2, 5]
    
    results_fold = []
    
    for el in mids:
    
        # Instantiate the Decision Tree Classifier.
        model = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split = 500)
        model.fit(train_x, train_y)
        
        # Obtain predictions for the test set.
        predictions = model.predict(test_x)
        
        # Calls persons based on predications. Note the number of success and calls.
        total_s = 0
        total_c = 0
        t_s = [0]
        t_c = [0]
        
#         for loc, pred in enumerate(predictions):
#             real = test_df.iloc[loc]
#             if pred > real['campaign']:
#                 total_c += int(pred)
#             else:
#                 if real['y'] == 1:
#                     total_s += 1
#                 total_c += int(pred)
#             t_s.append(total_s)
#             t_c.append(total_c)
        
#         results_fold.append(metrics.auc(t_c, t_s))
        
        acc_score = accuracy_score(test_y, predictions)*100
        print("Accuracy is: ", acc_score)
        
    print(results_fold)
    print("\n")

Accuracy is:  87.95863909272849
Accuracy is:  87.95863909272849
Accuracy is:  87.95863909272849
Accuracy is:  87.95863909272849
[]


Accuracy is:  88.3033133199911
Accuracy is:  88.3033133199911
Accuracy is:  88.3033133199911
Accuracy is:  88.3033133199911
[]


Accuracy is:  88.25753363727344
Accuracy is:  88.25753363727344
Accuracy is:  88.25753363727344
Accuracy is:  88.25753363727344
[]


Accuracy is:  87.94618036250417
Accuracy is:  87.94618036250417
Accuracy is:  87.94618036250417
Accuracy is:  87.94618036250417
[]


Accuracy is:  88.12409651951518
Accuracy is:  88.12409651951518
Accuracy is:  88.12409651951518
Accuracy is:  88.12409651951518
[]




In [None]:
# XGBoost

for tt_split in fold_data:
    
    train_index = tt_split[0]
    test_index = tt_split[1]
    
    # Pull data for train test split.
    train_df = mkt_df_filtered.iloc[train_index]
    test_df = mkt_df_filtered.iloc[test_index]
    
    # Split train and test dataframes into input and output.
    train_y = train_df['y']
    train_x = train_df.drop(columns=['y'])
    test_y = test_df['y']
    test_x = test_df.drop(columns=['y'])
    
    # max_depths = [i for i in range(1, 21)]
    # Thus far, max depths = 20 gives best results.
    
    mids = [0.5, 1, 2, 5]
    
    results_fold = []
    
    for el in mids:
    
        # Instantiate the Decision Tree Classifier.
        model = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split = 500)
        model.fit(train_x, train_y)
        
        # Obtain predictions for the test set.
        predictions = model.predict(test_x)
        
        # Calls persons based on predications. Note the number of success and calls.
        total_s = 0
        total_c = 0
        t_s = [0]
        t_c = [0]
        
#         for loc, pred in enumerate(predictions):
#             real = test_df.iloc[loc]
#             if pred > real['campaign']:
#                 total_c += int(pred)
#             else:
#                 if real['y'] == 1:
#                     total_s += 1
#                 total_c += int(pred)
#             t_s.append(total_s)
#             t_c.append(total_c)
        
#         results_fold.append(metrics.auc(t_c, t_s))
        
        acc_score = accuracy_score(test_y, predictions)*100
        print("Accuracy is: ", acc_score)
        
    print(results_fold)
    print("\n")

### Hyper-parameter tuning!

In [58]:
def my_custom_score_func(y_test, y_pred):
    total_s = 0
    total_c = 0
    successes = []
    calls = []
    loc = 0
    for df_ref in y_test.index:
        actual_value = mkt_df_filtered.loc[df_ref]
        if y_pred[loc] <= y_test.iloc[loc]:
            if actual_value['y'] == 1:
                total_s += 1
        total_c += y_pred[loc]
        successes.append(total_s)
        calls.append(total_c)
        loc += 1
    score = metrics.auc(calls, successes)
    return score

custom_score = make_scorer(my_custom_score_func, greater_is_better=True)

model = tree.DecisionTreeClassifier()

dt_opt = GridSearchCV(model,
                   scoring=custom_score,
                   cv=5,
                   param_grid={
                               "criterion": ['gini', 'entropy'],
                               "max_depth": [i for i in range(1, 25)],
                               "min_impurity_decrease": [0.00005,0.0001,0.0002,0.0005,0.001,0.0015,0.002,0.005,0.01]
                              }
    )

dt_opt.fit(X, y)

print("Best Parameters: ")
print(dt_opt.best_params_)
print(dt_opt.best_score_)

Best Parameters: 
{'criterion': 'entropy', 'max_depth': 24, 'min_impurity_decrease': 5e-05}
6567663.6


Best Parameters were: 
{'criterion': 'entropy', 'max_depth': 24, 'min_impurity_decrease': 5e-05}
with area being -> 6567663.6

N.B. This was when we utilized the full range of calls for the output variable .... 1-32.

### Finding the best age groupings! 

In [None]:
# Score function in this case would be essentially the result of the Greedy Approach.
def my_custom_score_func(y_test, y_pred):
    total_s = 0
    total_c = 0
    successes = []
    calls = []
    loc = 0
    for df_ref in y_test.index:
        actual_value = mkt_df_filtered.loc[df_ref]
        if y_pred[loc] <= y_test.iloc[loc]:
            if actual_value['y'] == 1:
                total_s += 1
        total_c += y_pred[loc]
        successes.append(total_s)
        calls.append(total_c)
        loc += 1
    score = metrics.auc(calls, successes)
    return score

custom_score = make_scorer(my_custom_score_func, greater_is_better=True)

model = tree.DecisionTreeClassifier()

dt_opt = GridSearchCV(model,
                   scoring=custom_score,
                   cv=5,
                   param_grid={
                               "criterion": ['gini', 'entropy'],
                               "max_depth": [i for i in range(1,10)]
                               "min_impurity_decrease": [0.00005,0.0001,0.0002,0.0005,0.001,0.0015,0.002,0.005,0.01]
                              }
    )

dt_opt.fit(X, y)

print("Best Parameters: ")
print(dt_opt.best_params_)
print(dt_opt.best_score_)