In [26]:
# Imports

import os
import math
import random
import operator as op
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math, itertools
import statistics
import json
import hdbscan

# Sklearn imports
from xgboost import XGBClassifier
from sklearn import tree, metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics.pairwise import euclidean_distances
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from matplotlib.ticker import StrMethodFormatter

from scipy.spatial import ConvexHull

In [36]:
class Succ_Call_Tracker():
    """Running tally of calls placed and successes obtained, with snapshots.

    ``curr_c`` and ``curr_s`` are the live totals; ``calls`` and ``succ`` are
    the recorded snapshots of those totals (each list starts with a 0 entry).
    """

    def __init__(self):
        # Live totals.
        self.curr_c = 0
        self.curr_s = 0
        # Snapshot histories, both seeded with 0.
        self.calls = [0]
        self.succ = [0]

    def add_calls(self, num_calls):
        """Increase the live call total by ``num_calls`` without snapshotting."""
        self.curr_c += num_calls

    def add_succ(self):
        """Increase the live success total by one without snapshotting."""
        self.curr_s += 1

    # The following TWO methods might not be useful ..
    def add_call_save(self, num_calls):
        """Add ``int(num_calls)`` calls, then snapshot the call total only."""
        self.add_calls(int(num_calls))
        self.calls.append(self.get_calls())

    def add_succ_save(self):
        """Add one success, then snapshot the success total only."""
        self.add_succ()
        self.succ.append(self.curr_s)

    def save(self):
        """Snapshot both live totals (calls first, then successes)."""
        for history, total in ((self.calls, self.curr_c), (self.succ, self.curr_s)):
            history.append(total)

    def get_calls(self):
        """Return the live call total."""
        return self.curr_c

    def get_auc(self):
        """Return ``metrics.auc`` of the snapshots (calls as x, successes as y)."""
        return metrics.auc(self.calls, self.succ)

    def get_succ_calls_score(self):
        """Return the tuple ``(calls snapshots, succ snapshots, auc)``."""
        area = self.get_auc()
        return (self.calls, self.succ, area)

In [37]:
# Smoke test of Succ_Call_Tracker: record 10 calls and 1 success, snapshot both
# totals, then add 45 calls (snapshotting calls) and 1 success (snapshotting
# successes).
t = Succ_Call_Tracker()
t.add_calls(10)
t.add_succ()
t.save()
t.add_call_save(45)
t.add_succ_save()
# Prints the (calls, succ, auc) tuple returned by get_succ_calls_score().
print(t.get_succ_calls_score())

([0, 10, 55], [0, 1, 2], 72.5)


In [38]:
# Helper Functions Across All Methods

def load_file(data_file_path):
    """Read the semicolon-delimited CSV at ``data_file_path`` into a DataFrame."""
    return pd.read_csv(data_file_path, delimiter=";")
  
    
def plot_graph_new(results, max_calls, list_passed, title, name = "1"):
    """Plot success-per-call rate against call number and save it as a PDF.

    Args:
        results: Either the y-values themselves (when ``list_passed`` is
            truthy) or a sequence of dicts whose 'expected' entry is plotted.
        max_calls: Number of call numbers on the x-axis (1..max_calls).
        list_passed: Whether ``results`` already holds the y-values.
        title: Figure title.
        name: Output file stem; the figure is written to ``str(name) + ".pdf"``.
    """
    call_numbers = list(range(1, max_calls + 1))
    if list_passed:
        rates = results
    else:
        rates = [results[idx]['expected'] for idx in range(max_calls)]
    print(rates)
    plt.title(title)
    plt.plot(call_numbers, rates, linewidth=2)
    plt.xlabel("Call Number")
    plt.ylabel("Success Per Call Rate")
    plt.ylim(0, 0.4)
    current_axes = plt.gca()
    current_axes.yaxis.set_major_formatter(StrMethodFormatter('{x:,.3f}'))
    plt.xticks(np.arange(1, max_calls + 1, 1))
    # The figure is written to disk rather than shown inline.
    plt.savefig(str(name) + ".pdf")
    plt.close()
    

def div(a,b):
    """Return ``a / b``, or 0.0 when ``int(b)`` is zero.

    Because the guard truncates ``b`` with ``int``, any divisor strictly
    between -1 and 1 is treated as zero.
    """
    return a/b if int(b) != 0 else 0.0
    

# Used for creating all possible combinations of the features.
def powerset(iterable):
    """powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Returns a lazy iterator over every combination of the input's elements,
    ordered by combination size.
    """
    # Materialize once so plain iterators/generators work with len() below.
    s = list(iterable)
    # Only the itertools module is imported, so the function must be
    # referenced through it.
    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s)+1))


def convert(list):
    """Return the elements of the given iterable as a tuple."""
    return (*list,)


def construct_dict(feature_comb):
    """Map a positional feature combination onto named attributes.

    Each selected entry of ``feature_comb`` is converted to a tuple.
    NOTE(review): 'job' is read from position 2 and 'marital' from position 1,
    which is the opposite of find_all_cust_feature_set — confirm which order
    callers actually use.
    """
    # (attribute name, position in feature_comb), in insertion order.
    attribute_positions = (
        ('education', 0),
        ('job', 2),
        ('marital', 1),
        ('default', 3),
        ('loan', 4),
        ('housing', 5),
    )
    return {attribute: convert(feature_comb[position]) for attribute, position in attribute_positions}


# This is the new metric (success per call rate).
def compute_expected_succ_per_call_rate_feature_set(fs_df, no_calls_considered):
    """Compute the success-per-call rate for call limits 1..no_calls_considered.

    For a call limit ``i``, a row whose 'campaign' value is at most ``i``
    contributes its 'campaign' value to the call total (and one success when
    'y' is "yes"); any other row contributes ``i`` calls and no success.

    Returns:
        A list with one dict per call limit, holding 'succ', 'total_calls'
        and 'expected' (``div(succ, total_calls)``).
    """
    per_limit_stats = []
    for call_limit in range(1, no_calls_considered + 1):
        stats = {'succ':0, 'total_calls':0, 'expected':0.0}
        per_limit_stats.append(stats)
        for _, customer in fs_df.iterrows():
            calls_made = customer['campaign']
            if calls_made <= call_limit:
                if customer['y'] == "yes":
                    stats['succ'] += 1
                stats['total_calls'] += calls_made
            else:
                stats['total_calls'] += call_limit
    for stats in per_limit_stats:
        stats['expected'] = div(stats['succ'], stats['total_calls'])
    return per_limit_stats


def compute_optimal_call_no(results):
    """Return the index of the first entry with the largest 'expected' value.

    Returns -1 when that index is 0 and its 'expected' value is 0.0.
    """
    expected_rates = [entry['expected'] for entry in results]
    best_index = expected_rates.index(max(expected_rates))
    if best_index == 0 and expected_rates[best_index] == 0.0:
        return -1
    return best_index


# Given a dictionary of what attributes comprise a feature set, we can get all rows corresponding to this feature set.
def extract_rows_feature_set(fs_df, feature_labels = None):
    """Return the rows of ``fs_df`` that match a feature set.

    Args:
        fs_df: DataFrame containing one column per key of ``feature_labels``.
        feature_labels: Mapping of column name -> iterable of accepted labels.
            A row is kept when, for every column, its value is one of that
            column's labels. Defaults to the same example feature set the
            function has always used.

    Returns:
        The filtered DataFrame. An empty label list for a column matches no
        rows.
    """
    if feature_labels is None:
        # Built per call so the default is never shared between invocations.
        feature_labels = {'education':['tertiary', 'unknown'],
                          'job':['management', 'technician', 'blue-collar'],
                          'marital':['single'], 'default':['no'],
                          'housing':['no'], 'loan':['no']}
    for key, labels in feature_labels.items():
        # Membership test instead of a string-built query: labels containing
        # quotes and empty label lists are handled without a parse error.
        fs_df = fs_df[fs_df[key].isin(list(labels))]
    return fs_df


def find_matching_attribute_comb(row_value, all_combs):
    """Return the last combination in ``all_combs`` containing ``row_value``.

    Returns None when no combination has an item equal to ``row_value``.
    """
    matched = None
    for candidate in all_combs:
        if any(member == row_value for member in candidate):
            matched = candidate
    return matched


def compute_metric(df):
    """Return successes per call over ``df``.

    Successes are the rows whose 'y' is "yes"; calls are the sum of the
    'campaign' column. The ratio is taken with ``div``.
    """
    successes = 0
    calls = 0
    for _, record in df.iterrows():
        successes += 1 if record['y'] == "yes" else 0
        calls += record['campaign']
    return div(successes, calls)

def compute_metric_2(df, max_calls=None):
    """Return successes per call over ``df``, optionally capping calls per row.

    The original body called ``min`` with a single scalar argument, which
    raises TypeError for every row; the missing second operand is now an
    explicit, optional cap.

    Args:
        df: DataFrame with a 'y' column ("yes" marks a success) and a numeric
            'campaign' column (calls made to that customer).
        max_calls: Upper bound on the calls counted per row. ``None`` (the
            default) counts every call, matching ``compute_metric``.
            NOTE(review): successes are still counted for rows whose
            'campaign' exceeds the cap — confirm this is the intended metric.

    Returns:
        ``div(total_successes, total_calls)``.
    """
    total_calls = 0
    total_successes = 0
    for _, row in df.iterrows():
        if row['y'] == "yes":
            total_successes += 1
        calls = row['campaign']
        total_calls += calls if max_calls is None else min(calls, max_calls)
    return div(total_successes, total_calls)


def compute_metric_for_each_attribute(all_values, df, attrib):
    """Compute ``compute_metric`` on the rows matching each attribute value.

    Args:
        all_values: Values of ``attrib`` to evaluate, one metric per value.
        df: DataFrame to filter with ``attrib == '<value>'`` queries.
        attrib: Column name being filtered on.

    Returns:
        A ``(len(all_values), 1)`` array of metric values, in input order.
    """
    metric_vals = np.zeros(shape=(len(all_values),1))
    for position, attrib_value in enumerate(all_values):
        matching_rows = df.query("{0} == '{1}'".format(attrib, attrib_value))
        metric_vals[position] = compute_metric(matching_rows)
    return metric_vals


def compute_metric_for_each_attribute_range(all_values, df, attrib):
    """Compute ``compute_metric`` on the rows falling in each value range.

    Args:
        all_values: Sequence of ``(low, high)`` pairs; a row matches when
            ``low <= attrib < high``.
        df: DataFrame to filter.
        attrib: Column name being filtered on.

    Returns:
        A tuple ``(metric_vals, query_strings)``: a ``(len(all_values), 1)``
        array of metric values and the query string used for each range.
    """
    metric_vals = np.zeros(shape=(len(all_values),1))
    query_strings = []
    for position, bounds in enumerate(all_values):
        range_query = "{0} >= {1} & {2} < {3}".format(attrib, bounds[0], attrib, bounds[1])
        metric_vals[position] = compute_metric(df.query(range_query))
        query_strings.append(range_query)
    return metric_vals, query_strings


def find_combinations(sub_attributes, ratios, random_state=None):
    """Group sub-attributes whose ratios are similar, using K-Means.

    K-Means is run on the ratios for every cluster count from 2 to
    ``len(ratios) - 1``; the clustering with the highest silhouette score
    decides which sub-attributes are aggregated together.

    Args:
        sub_attributes: Sequence of sub-attribute labels, aligned with
            ``ratios``.
        ratios: 1-D NumPy array with one ratio per sub-attribute.
        random_state: Optional seed forwarded to ``KMeans`` so the grouping is
            reproducible. ``None`` keeps the previous unseeded behavior.

    Returns:
        A list of groups, each a list of sub-attribute labels converted to
        ``str``. With fewer than three ratios no cluster count can be scored
        (the original raised IndexError), so every sub-attribute is returned
        as its own group.
    """
    num_values = len(ratios)
    if num_values < 3:
        # range(2, num_values) would be empty: nothing to cluster, no merging.
        return [[str(item)] for item in sub_attributes]
    samples = ratios.reshape(-1,1)
    best = None
    # Making use of the K-Means algorithm ... number of centroids are from 2 to n-1.
    for clust_num in range(2, num_values):
        kmeans = KMeans(n_clusters = clust_num, random_state = random_state)
        kmeans.fit(samples)
        labels = kmeans.labels_
        score = silhouette_score(samples, labels, metric='euclidean')
        # Strict '>' keeps the smallest cluster count among equal scores,
        # matching the stable descending sort used previously.
        if best is None or score > best[0]:
            best = (score, labels, clust_num)
    _, best_labels, best_clust_num = best
    # One bucket per centroid; each sub-attribute goes to its label's bucket.
    joined_sub_attributes = [[] for _ in range(best_clust_num)]
    for index, label in enumerate(best_labels):
        joined_sub_attributes[label].append(str(sub_attributes[index]))
    return joined_sub_attributes

# Example call (one ratio per sub-attribute, passed as a NumPy array):
# find_combinations(['a', 'b', 'c', 'd'], np.array([1, 4, 7, 90]))

def find_all_cust_feature_set(fs, df):
    """Return the rows of ``df`` that belong to the feature set ``fs``.

    ``fs[0]``..``fs[5]`` hold the accepted labels for education, job, marital,
    default, loan and housing (in that order); ``fs[6]`` and ``fs[7]`` are
    query strings applied to ``df`` before the label filter.
    """
    attribute_order = ('education', 'job', 'marital', 'default', 'loan', 'housing')
    comb = {attribute: fs[position] for position, attribute in enumerate(attribute_order)}
    # Apply the two query strings first, then the categorical label filter.
    pre_filtered = df.query(fs[6]).query(fs[7])
    return extract_rows_feature_set(pre_filtered, comb)


def construct_hull_points(results, max_calls):
    """Return the indices of ``results`` that lie on the convex hull.

    Each of the first ``max_calls`` entries of ``results`` becomes the point
    ``(total_calls, succ)``. The last index (``max_calls - 1``) is always
    included in the result, even when it is not a hull vertex.

    Args:
        results: Sequence of dicts with 'succ' and 'total_calls' entries.
        max_calls: Number of leading entries of ``results`` to use.

    Returns:
        A sorted list of indices, or ``False`` when the hull cannot be built
        (for example too few or collinear points).
    """
    pts = np.array([[results[x]['total_calls'], results[x]['succ']]
                    for x in range(0, max_calls)])
    try:
        verts = ConvexHull(pts).vertices
    except Exception:
        # Degenerate input makes ConvexHull raise; callers expect False.
        # A bare `except:` would also have swallowed KeyboardInterrupt.
        return False
    last_index = max_calls - 1
    if not np.isin(last_index, verts):
        verts = np.append(last_index, verts)
    return np.sort(verts).tolist()

    
def gradient_update(key, fs_pick):
    """Refresh the gradient of feature set ``key`` or mark it finished.

    While the entry's 'loc' is at most its 'max_num_pts', 'grad' is set to the
    success/call slope between the current hull point and the previous one
    (or the origin when 'loc' is 0). Otherwise 'finished' is set to True.
    ``fs_pick[key]`` is updated in place; nothing is returned.
    """
    entry = fs_pick[key]
    results = entry['results']
    hull = entry['hull_points']
    position = entry['loc']
    limit = entry['max_num_pts']
    if position <= limit:
        current = results[hull[position]]
        if position == 0:
            succ_delta = current['succ']
            calls_delta = current['total_calls']
        else:
            previous = results[hull[position - 1]]
            succ_delta = current['succ'] - previous['succ']
            calls_delta = current['total_calls'] - previous['total_calls']
        entry['grad'] = div(succ_delta, calls_delta)
    else:
        entry['finished'] = True

        
def get_features(row, feature_names):
    """Return the feature names whose entry in ``row`` equals 1.

    ``row`` is indexed by position, aligned with ``feature_names``; each entry
    is compared after conversion with ``int``.
    """
    return [name for position, name in enumerate(feature_names) if int(row[position]) == 1]

In [39]:
# This cell holds functions that are utilized by each of the methods defined.

def group_age(row, age_ranges):
    """Return the 1-based index of the age range containing ``row['age']``.

    Ranges are inclusive at both ends. When several ranges contain the age the
    last one wins; when none does, a message is printed and None is returned.
    """
    age = int(row['age'])
    matched_group = None
    for position, (low, high) in enumerate((bounds[0], bounds[1]) for bounds in age_ranges):
        if age >= low and age <= high:
            matched_group = position + 1
    if matched_group is None:
        print("Failed Assignment for age: ", age)
    return matched_group
        

def group_balance(row, balance_ranges):
    """Return the 1-based index of the range containing ``row['balance']``.

    Ranges are inclusive at both ends. When several ranges contain the balance
    the last one wins; when none does, a message is printed and None is
    returned.
    """
    bal = int(row['balance'])
    matched_group = None
    for position, (low, high) in enumerate((bounds[0], bounds[1]) for bounds in balance_ranges):
        if bal >= low and bal <= high:
            matched_group = position + 1
    if matched_group is None:
        print("Failed Assignment for balance: ", bal)
    return matched_group


def group_feature(df, col_name, func, ranges):
    """Overwrite ``df[col_name]`` row by row with ``func(row, ranges)``.

    ``df`` is modified in place; nothing is returned.
    """
    for row_label, record in df.iterrows():
        df.loc[row_label, col_name] = func(record, ranges)

def compute_ratio_all_users(df, train_indicies):
    """Return a ``(position, ratio)`` tuple for every training position.

    Rows are looked up positionally with ``df.iloc``. The ratio is
    ``div(1, campaign)`` when the row's 'y' is "yes" and 0.0 otherwise.
    """
    ratio_values = []
    for position in train_indicies:
        customer = df.iloc[position]
        ratio = div(1, customer['campaign']) if customer['y'] == "yes" else 0.0
        ratio_values.append((position, ratio))
    return ratio_values


def compute_freq_percentage(mappings):
    """Set each mapping's 'percentage' to its share of the summed 'freq'.

    ``mappings`` is updated in place; the share is computed with ``div``.
    """
    total = sum(entry['freq'] for entry in mappings.values())
    for entry in mappings.values():
        entry['percentage'] = div(entry['freq'], total)

In [58]:
# Each function represents each method attempted.

def call_everyone(test_df):
    """Baseline: call every customer in random order and track the outcome.

    Rows of ``test_df`` are visited in a random permutation. Each row adds its
    'campaign' value to the call total and, when 'y' is "yes", one success.
    A snapshot is recorded whenever the call total has reached the next
    checkpoint, and once more at the end.

    Args:
        test_df: DataFrame with 'campaign' and 'y' columns.

    Returns:
        A tuple ``((calls, succ, auc), total_calls)`` taken from the tracker.
    """
    print("Call all Customers Approach")
    # Same checkpoints as the former hard-coded literal: every 50 calls up to
    # 500, then every 500 calls up to 139500.
    call_check_points = list(range(50, 500, 50)) + list(range(500, 139501, 500))
    sc_tracker = Succ_Call_Tracker()
    cp_loc = 0
    res = test_df.reindex(np.random.permutation(test_df.index))
    for loc, row in res.iterrows():
        # The bounds check prevents an IndexError once the call total has
        # passed the last checkpoint; later rows are still tallied.
        if cp_loc < len(call_check_points) and sc_tracker.get_calls() >= call_check_points[cp_loc]:
            cp_loc += 1
            sc_tracker.save()
        sc_tracker.add_calls(row['campaign'])
        if row['y'] == "yes":
            sc_tracker.add_succ()
    sc_tracker.save()
    return sc_tracker.get_succ_calls_score(), sc_tracker.get_calls()


def greedy_approach(combs_to_consider):
    """Call feature sets in descending order of their 'overall_rate'.

    Every customer of a feature set (rows of its 'fs_customers' frame) adds
    their 'campaign' value to the call total and, when 'y' is "yes", one
    success. A snapshot is recorded after each feature set.

    Returns:
        The tracker's ``(calls, succ, auc)`` tuple.
    """
    print("Greedy Approach")
    ranked_feature_sets = sorted(combs_to_consider.items(), key=lambda fs: fs[1]['overall_rate'], reverse = True)
    sc_tracker = Succ_Call_Tracker()
    for _, feature_set in ranked_feature_sets:
        for _, cust in feature_set['fs_customers'].iterrows():
            sc_tracker.add_calls(cust['campaign'])
            if cust['y'] == "yes":
                sc_tracker.add_succ()
        sc_tracker.save()
    return sc_tracker.get_succ_calls_score()


def convex_hull(fs_pick, num_calls):
    """Call feature sets segment by segment, always taking the steepest gradient.

    Args:
        fs_pick: Dict of feature-set key -> state dict. This function reads the
            'grad', 'finished', 'loc', 'hull_points' and 'fs_customers' entries
            and increments 'loc'; gradient_update() rewrites 'grad'/'finished'.
            NOTE(review): the sort below needs 'grad' and 'finished' to exist
            on every entry before the first call — presumably initialised by
            the caller; confirm.
        num_calls: Call budget; the loop runs while the tracked call total is
            less than or equal to this value.

    Returns:
        The tracker's (calls, succ, auc) tuple.
    """
    print("Gradient Ascent Approach")
    sc_tracker = Succ_Call_Tracker()
    print("Performing initial update .. ")
    for key in fs_pick.keys():
        gradient_update(key, fs_pick)
    print("Perforiming sort .. ")
    # Sort based on gradient.
    optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
    # Call best feature set, update gradient for this feature set and re-sort all feature sets.
    # Rinse and repeat!
    print("Finished sort .. in while loop ")
    while(sc_tracker.get_calls() <= num_calls):
        # Skip over feature sets already marked finished; stop when none remain.
        best_loc = 0
        while(best_loc < len(optimal_choices) and optimal_choices[best_loc][1]['finished'] == True):
            best_loc += 1
        if best_loc == len(optimal_choices):
            break
        fs_key = optimal_choices[best_loc][0]
        fs_data = optimal_choices[best_loc][1]
        # Always true here, because finished entries were skipped above.
        if fs_data['finished'] == False:
            loc = fs_data['loc']
            # Hull point index h is treated as call number h + 1, so this segment
            # covers the call numbers after the previous hull point up to and
            # including the current one.
            if loc == 0:
                call_start = 1
                call_end = fs_data['hull_points'][loc] + 1
            else:
                call_start = fs_data['hull_points'][loc-1] + 2
                call_end = fs_data['hull_points'][loc] + 1
            for call in range(call_start, call_end + 1, 1):
                # NOTE: this loop rebinds `loc` to the row label. That is harmless
                # because `loc` is not read again before being reassigned.
                for loc, row in fs_pick[fs_key]['fs_customers'].iterrows():
                    if row['campaign'] == call:
                        # Final call for this customer: one call, plus a success when y is "yes".
                        sc_tracker.add_calls(1)
                        if row['y'] == "yes":
                            sc_tracker.add_succ()
                    elif row['campaign'] > call:
                        # Customer needs more calls than this one; it still costs a call.
                        sc_tracker.add_calls(1)
            sc_tracker.save()
            # Advance to the next hull segment, then refresh the gradient and re-rank.
            fs_pick[fs_key]['loc'] += 1
            gradient_update(fs_key, fs_pick)
            optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
    return sc_tracker.get_succ_calls_score()


def upper_bound(test_df, max_calls=None):
    """Best-case ordering: all successes first, cheapest customers first.

    Successful customers ('y' == "yes") are called in increasing order of
    their 'campaign' value, followed by the unsuccessful ones in the same
    order. A snapshot is recorded after each call number in both passes.

    Args:
        test_df: DataFrame with 'y' and 'campaign' columns.
        max_calls: Largest 'campaign' value to include. The original read an
            undefined notebook-level ``max_calls``; when this argument is
            omitted that global is still used if it exists, otherwise the
            largest 'campaign' value in ``test_df`` is used.

    Returns:
        The tracker's ``(calls, succ, auc)`` tuple.
    """
    print("Upper Bound Approach")
    if max_calls is None:
        # Backward compatibility with cells that define max_calls globally.
        max_calls = globals().get("max_calls")
    if max_calls is None:
        max_calls = int(test_df["campaign"].max()) if len(test_df) else 0
    sc_tracker = Succ_Call_Tracker()
    for outcome in ("yes", "no"):
        outcome_df = test_df[test_df["y"] == outcome]
        for x in range(1, max_calls + 1):
            num_cust = int((outcome_df["campaign"] == x).sum())
            if outcome == "yes":
                # The tracker only exposes single-success increments.
                for _ in range(num_cust):
                    sc_tracker.add_succ()
            sc_tracker.add_calls(num_cust * x)
            sc_tracker.save()
    return sc_tracker.get_succ_calls_score()
    
    
def new_approach_ratio_grouping_percentage(df, train_indicies, test_indicies, group_size, age_groupings, balance_groupings):
    """Rank test customers by the training group their feature profile fits best.

    Steps:
      1. Copy ``df`` and replace 'age' and 'balance' with range-group numbers.
      2. Sort training customers by their success/call ratio (descending) and
         cut them into consecutive groups of ``group_size``.
      3. For each group, count how often each feature profile occurs, convert
         the counts to shares, and compute the success-per-call results.
      4. Assign each test customer to the group in which their profile has the
         largest share (first group wins ties); customers whose profile was
         never seen are counted as missed.
      5. Call the groups in descending order of their expected rate at the
         largest call limit and accumulate successes and calls.

    Two defects of the original are fixed: the first occurrence of a profile
    is now counted (it used to start at frequency 0), and the best-group
    comparison now looks at the candidate group instead of the current best.

    NOTE(review): profile keys are built with itertuples() for training rows
    and with .iloc rows for test rows; confirm both yield identical string
    representations in the installed pandas/numpy versions.

    Args:
        df: DataFrame with job, marital, education, default, housing, loan,
            age, balance, campaign and y columns.
        train_indicies: Positional indices of the training rows.
        test_indicies: Positional indices of the test rows.
        group_size: Number of training customers per group.
        age_groupings: Inclusive (low, high) age ranges for group_age.
        balance_groupings: Inclusive (low, high) balance ranges for group_balance.

    Returns:
        ``(result_ratios, groupings)``: the cumulative ``(successes, calls)``
        after each group, and the per-group training data.
    """
    # Call limits evaluated per group; the last one ranks the groups.
    calls_considered = 20
    # Ensuring that we can binary encode any row in our dataset. We also group age and balance values
    # from each row into ranges.
    print("Step 1")
    df_copy = df.copy(deep=True)
    group_feature(df_copy, "age", group_age, age_groupings)
    group_feature(df_copy, "balance", group_balance, balance_groupings)
    # Build the customers for each group.
    print("Step 2")
    ratio_arr = compute_ratio_all_users(df_copy, train_indicies)
    ratio_arr_sorted = sorted(ratio_arr, key=lambda tup: tup[1], reverse = True)
    groupings = {}
    for loc, (cust_index, _) in enumerate(ratio_arr_sorted):
        group_key = str(int(loc/group_size))
        if group_key not in groupings:
            groupings[group_key] = {'indicies':[], 'mappings':{}, 'results':None}
        groupings[group_key]['indicies'].append(cust_index)
    print(len(groupings.keys()))
    # For each group, we find the unique feature combinations and store them in a list.
    # We also store the results - s/c ratio for call numbers from 1-20.
    test_calls = {}
    print("Step 3")
    for group_key, group in groupings.items():
        users_df = df_copy.iloc[group['indicies']]
        mappings = group['mappings']
        for row in users_df.itertuples():
            user_mapping = str((row.job, row.marital, row.education, row.default,
                               row.housing, row.loan, row.age, row.balance))
            if user_mapping not in mappings:
                mappings[user_mapping] = {'freq':0, 'percentage':0.0}
            # Count every occurrence, including the first one.
            mappings[user_mapping]['freq'] += 1
        group['results'] = compute_expected_succ_per_call_rate_feature_set(users_df, calls_considered)
        compute_freq_percentage(mappings)
        test_calls[group_key] = {'locs_to_call':[], 'overall_rate':group['results'][calls_considered - 1]}
    # For the test set, we need to map each user to the most appropriate cluster.
    print("Step 4")
    missed = 0
    for loc in test_indicies:
        row = df_copy.iloc[loc]
        user_mapping = str((row['job'], row['marital'], row['education'], row['default'],
                            row['housing'], row['loan'], row['age'], row['balance']))
        best_group_key = None
        best_ratio = -1.0
        for group_key, group in groupings.items():
            entry = group['mappings'].get(user_mapping)
            # Compare the candidate group's share against the best seen so far.
            if entry is not None and entry['percentage'] > best_ratio:
                best_group_key = group_key
                best_ratio = entry['percentage']
        if best_group_key is not None:
            test_calls[best_group_key]['locs_to_call'].append(loc)
        else:
            missed += 1
    print("We missed:", missed)
    # Call users ... those with the highest ratios are called first.
    test_calls_sorted = sorted(test_calls.items(), key=lambda fs: fs[1]['overall_rate']['expected'], reverse = True)
    print("Step 5")
    num_succ = 0
    num_calls = 0
    result_ratios = []
    for _, test_call in test_calls_sorted:
        for cust_loc in test_call['locs_to_call']:
            row = df_copy.iloc[cust_loc]
            if row['y'] == "yes":
                num_succ += 1
            num_calls += int(row['campaign'])
        result_ratios.append((num_succ, num_calls))
    return result_ratios, groupings


def clustering_approach_abstracted(clusterer, feature_names, train_df, test_df, train_df_encoded, test_df_encoded):
    """Rank test customers by the success rate of the cluster they fall into.

    Args:
        clusterer: Fitted clusterer exposing ``labels_`` and accepted by
            ``hdbscan.approximate_predict``.
        feature_names: Names aligned with the leading columns of the encoded rows.
        train_df: Training rows, indexed positionally in step with ``clusterer.labels_``.
        test_df: Test rows, indexed positionally in step with ``test_df_encoded``.
        train_df_encoded: Encoded training rows, indexed by position.
        test_df_encoded: Encoded test rows, indexed by position.

    Returns:
        ``(result_ratios, groupings)``: the cumulative (successes, calls) after
        each group has been called, and the per-group bookkeeping dict.
    """
    groupings = {}
    predictions = clusterer.labels_
    # Bucket the training positions by cluster label (label stored as a string key).
    for index, group in enumerate(predictions):
        if str(group) not in groupings.keys():
            groupings[str(group)] = {'train_indicies':[], 'unique_keys':{}, 'results':None, 'test_indicies':[]}
        groupings[str(group)]['train_indicies'].append(index)
    print("Check 1")
    # For all customers belonging to each grouping, we find the unique keys and compute the success per call
    # ratio for call numbers 1-20.
    for group in groupings.keys():
        for index in groupings[group]['train_indicies']:
            # NOTE: cust_info is assigned but never used below.
            cust_info = train_df.iloc[index]
            cust_features = get_features(train_df_encoded[index], feature_names)
            # cust_features = cust_features[0:8]
            # Count how many times each active-feature combination occurs in the group.
            if str(cust_features) not in groupings[group]['unique_keys'].keys():
                groupings[group]['unique_keys'][str(cust_features)] = {'#_ocurr': 1}
            else:
                groupings[group]['unique_keys'][str(cust_features)]['#_ocurr'] += 1
        results = compute_expected_succ_per_call_rate_feature_set(train_df.iloc[groupings[group]['train_indicies']], 20)
        groupings[group]['results'] = results
    print("Check 2")
    # This process makes use of the test set and determines the ideal cluster for a customer.
    for index in range(0, len(test_df_encoded), 1):
        encoded_customer_data = test_df_encoded[index]
        # One prediction per customer; only the label is used, strengths are ignored.
        test_labels, strengths = hdbscan.approximate_predict(clusterer, [encoded_customer_data])
        # NOTE(review): raises KeyError if the predicted label never occurred in
        # clusterer.labels_ — confirm this cannot happen.
        groupings[str(test_labels[0])]['test_indicies'].append(index)
    print("Check 3")
    # Perform sorting of groups based on success per call rate (entry 19, i.e. the 20-call limit).
    sorted_final_call = {k: v for k, v in sorted(groupings.items(), key=lambda item: item[1]['results'][19]['expected'], reverse = True)}
    print("Check 4")
    # Go about calling customers, keep track of the success per call rate as we switch from group to group.
    total_s = 0
    total_c = 0
    result_ratios = []
    for group in sorted_final_call:
        for cust_index in sorted_final_call[group]['test_indicies']:
            row = test_df.iloc[cust_index]
            if row['y'] == "yes":
                total_s += 1
            total_c += int(row['campaign'])
        # One cumulative (successes, calls) point per group.
        result_ratios.append((total_s, total_c))
    return result_ratios, groupings
        
        
# The encoding process can also be varied to not include age and balance.
def clustering_age_balance_grouped(mkt_df_filtered, train_index, test_index, min_cluster_size, balance_groupings, age_groupings):
    """HDBSCAN clustering on one-hot features with age and balance grouped.

    'age' and 'balance' are replaced by range-group numbers on a copy of the
    input, every column except 'y' and 'campaign' is one-hot encoded, HDBSCAN
    is fitted on the training rows, and the result is passed to
    ``clustering_approach_abstracted``.

    Feature names are now derived from the actual column order of the encoded
    frame instead of a hard-coded list, and ``get_feature_names_out`` is used
    when the installed scikit-learn provides it (``get_feature_names`` was
    removed in newer releases).

    Args:
        mkt_df_filtered: Source DataFrame; it is not modified.
        train_index: Positional indices of the training rows.
        test_index: Positional indices of the test rows.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        balance_groupings: Inclusive (low, high) balance ranges.
        age_groupings: Inclusive (low, high) age ranges.

    Returns:
        Whatever ``clustering_approach_abstracted`` returns.
    """
    print("HDBScan Clustering - Groupings")
    mkt_df_filtered_cp = mkt_df_filtered.copy(deep = True)
    group_feature(mkt_df_filtered_cp, "age", group_age, age_groupings)
    group_feature(mkt_df_filtered_cp, "balance", group_balance, balance_groupings)
    train_df = mkt_df_filtered_cp.iloc[train_index]
    test_df = mkt_df_filtered_cp.iloc[test_index]
    # The encoder is fitted on all rows so train and test share one category set.
    features_df = mkt_df_filtered_cp.drop(columns = ['y', 'campaign'])
    encoder = OneHotEncoder()
    encoder.fit(features_df)
    train_df_encoded = encoder.transform(features_df.iloc[train_index]).toarray()
    test_df_encoded = encoder.transform(features_df.iloc[test_index]).toarray()
    # Name the encoded columns after the columns that were actually encoded.
    feature_cols = list(features_df.columns)
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out(feature_cols)
    else:
        feature_names = encoder.get_feature_names(feature_cols)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    clusterer.fit(train_df_encoded)
    return clustering_approach_abstracted(clusterer, feature_names, train_df, test_df, train_df_encoded, test_df_encoded)


# The encoding process can also be varied to not include age and balance.
def clustering_age_balance_not_grouped(mkt_df_filtered, train_index, test_index, min_cluster_size, balance_groupings, age_groupings):
    """HDBSCAN clustering on one-hot features plus raw age and balance.

    Every column except 'y', 'campaign', 'age' and 'balance' is one-hot
    encoded; the raw 'age' and 'balance' values are appended as the last two
    columns. HDBSCAN is fitted on the training rows and the result is passed
    to ``clustering_approach_abstracted``.

    Feature names are now derived from the actual column order of the encoded
    frame instead of a hard-coded list, and ``get_feature_names_out`` is used
    when the installed scikit-learn provides it (``get_feature_names`` was
    removed in newer releases). The names cover the one-hot columns only.

    Args:
        mkt_df_filtered: Source DataFrame; it is not modified.
        train_index: Positional indices of the training rows.
        test_index: Positional indices of the test rows.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        balance_groupings: Unused; kept so both clustering variants share a signature.
        age_groupings: Unused; kept so both clustering variants share a signature.

    Returns:
        Whatever ``clustering_approach_abstracted`` returns.
    """
    print("HDBScan Clustering - No Groupings")
    mkt_df_filtered_cp = mkt_df_filtered.copy(deep = True)
    train_df = mkt_df_filtered_cp.iloc[train_index]
    test_df = mkt_df_filtered_cp.iloc[test_index]
    # The encoder is fitted on all rows so train and test share one category set.
    categorical_df = mkt_df_filtered_cp.drop(columns = ['y', 'campaign', 'age', 'balance'])
    encoder = OneHotEncoder()
    encoder.fit(categorical_df)
    # One-hot columns first, then raw age, then raw balance.
    train_df_encoded = np.column_stack((
        encoder.transform(categorical_df.iloc[train_index]).toarray(),
        train_df['age'].to_numpy(),
        train_df['balance'].to_numpy(),
    ))
    test_df_encoded = np.column_stack((
        encoder.transform(categorical_df.iloc[test_index]).toarray(),
        test_df['age'].to_numpy(),
        test_df['balance'].to_numpy(),
    ))
    # Name the encoded columns after the columns that were actually encoded.
    feature_cols = list(categorical_df.columns)
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out(feature_cols)
    else:
        feature_names = encoder.get_feature_names(feature_cols)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    clusterer.fit(train_df_encoded)
    return clustering_approach_abstracted(clusterer, feature_names, train_df, test_df, train_df_encoded, test_df_encoded)

# def clustering_age_balance_grouped_ratio_gradient_ascent(train_df, test_df, min_cluster_size, balance_groupings, age_groupings, num_calls):
#     print("In P8")
#     group_age(train_df, age_groupings)
#     group_balance(train_df, balance_groupings)
#     train_df, ratio_df = compute_ratio_all_users(train_df)
#     encoder = OneHotEncoder()
#     train_df_encoded = encoder.fit_transform(train_df.drop(columns=['y', 'campaign'])).toarray()
#     train_df_encoded = np.column_stack((train_df_encoded, ratio_df['ratio'].to_numpy()))
#     feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])
#     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
#     clusterer.fit(train_df_encoded)
#     predictions = clusterer.labels_
#     test_df = mkt_df_filtered.iloc[test_index]
#     group_age(test_df, age_groupings)
#     group_balance(test_df, balance_groupings)
#     test_df_encoded = encoder.fit_transform(test_df.drop(columns=['y', 'campaign'])).toarray()
# #     return None, None, (None, None)
#     return abstraction_new_approach_gradient_ascent(predictions, feature_names, train_df, test_df, train_df_encoded, test_df_encoded, num_calls)

def tree_approaches_abstracted(model, mkt_df_filtered_cp, train_index, test_index):
    """Fit ``model`` to predict 'campaign' and score its predictions.

    The categorical columns (and 'y') of ``mkt_df_filtered_cp`` are label
    encoded in place, so callers should pass a copy. The model is trained on
    the training rows with 'campaign' as the target and every other column as
    a feature.

    For each test row, in order, the predicted call count is added to the
    running call total; a success is counted when the prediction does not
    exceed the row's actual 'campaign' value and the encoded 'y' equals 1.

    Returns:
        A list of cumulative ``(total_calls, total_successes)`` tuples, one
        per test row.
    """
    # Encode Features.
    for feature in ('job', 'marital', 'education', 'default', 'housing', 'loan', 'y'):
        label_encoder = LabelEncoder()
        label_encoder.fit(mkt_df_filtered_cp[feature])
        mkt_df_filtered_cp[feature] = label_encoder.transform(mkt_df_filtered_cp[feature])
    # Split into train/test, then into features and target.
    target = "campaign"
    train_df = mkt_df_filtered_cp.iloc[train_index]
    test_df = mkt_df_filtered_cp.iloc[test_index]
    train_y = train_df[target]
    train_X = train_df.drop(columns = [target])
    test_y = test_df[target]
    test_X = test_df.drop(columns = [target])
    # Fit model and get predictions.
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    # The rest is the approach to compute points for AUC metric.
    total_c = 0
    total_s = 0
    results = []
    for position, (row_label, features) in enumerate(test_X.iterrows()):
        if int(predictions[position]) <= test_y.loc[row_label] and features['y'] == 1:
            total_s += 1
        total_c += int(predictions[position])
        results.append((total_c, total_s ))
    return results
    

def decision_tree_multiclass(mkt_df_filtered, train_index, test_index):
    """Run the shared tree-based evaluation with a default ``DecisionTreeClassifier``.

    The input frame is deep-copied first, so the caller's frame is never handed to
    ``tree_approaches_abstracted`` directly. Returns whatever that function returns.
    """
    print("Decision Tree")
    # Hyper-parameters noted earlier as an alternative to the defaults:
    #     criterion='entropy', max_depth= 28, min_impurity_decrease= 0.00005
    working_copy = mkt_df_filtered.copy(deep=True)
    classifier = tree.DecisionTreeClassifier()
    return tree_approaches_abstracted(classifier, working_copy, train_index, test_index)


def xgboost_multiclass(mkt_df_filtered, train_index, test_index):
    """Run the shared tree-based evaluation with a default ``XGBClassifier``.

    The input frame is deep-copied first, so the caller's frame is never handed to
    ``tree_approaches_abstracted`` directly. Returns whatever that function returns.
    """
    print("XGBoost")
    working_copy = mkt_df_filtered.copy(deep=True)
    classifier = XGBClassifier()
    return tree_approaches_abstracted(classifier, working_copy, train_index, test_index)
    


# def decision_tree_multiclass(mkt_df_filtered, train_index, test_index):
#     mkt_df_filtered_cp = mkt_df_filtered.copy(deep = True)
    
#     features_to_transform = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'y']
#     for feature in features_to_transform:
#         le = LabelEncoder()
#         le.fit(mkt_df_filtered_cp[feature])
#         mkt_df_filtered_cp[feature] = le.transform(mkt_df_filtered_cp[feature])
    
#     train_df = mkt_df_filtered_cp.iloc[train_index]
#     test_df = mkt_df_filtered_cp.iloc[test_index]
    
#     model = tree.DecisionTreeClassifier()
#     train_y = train_df['campaign']
#     train_X = train_df.drop(columns = ['campaign'])
#     test_y = test_df['campaign']
#     test_X = test_df.drop(columns = ['campaign'])
#     model.fit(train_X, train_y)
#     predictions = model.predict(test_X)
    
#     total_c = 0
#     total_s = 0
#     results = []
#     associations = []
    
#     j = 0
#     for index, row in test_X.iterrows():
#         associations.append((j, index, predictions[j]))
#         if int(predictions[j]) <= test_y.loc[index]:
#             if row['y'] == 1:
#                 total_s += 1
#         total_c += int(predictions[j])
#         results.append((total_c, total_s ))
#         j += 1
    
#     total_c = 0
#     total_s = 0
#     results2 = []
#     sorted_assoc = sorted(associations, key=lambda tup: tup[2])
#     for tup in sorted_assoc:
#         real_calls = test_y.loc[tup[1]]
#         real_data = test_X.loc[tup[1]]
#         if tup[2] <= int(real_calls):
#             if real_data['y'] == 1:
#                 total_s += 1
#         total_c += int(real_calls)
#         results2.append((total_c, total_s ))
    
#     return results, results2

In [59]:
# This cell represents the logic required for the initial approach taken.
# Yields really good results thus far.
# This should remain untouched.

def construct_feature_combs(train_df, min_row_fs, age_query_strings, balance_query_strings,
                            test_df=None, max_calls=None):
    """Enumerate candidate feature sets and initialise their bookkeeping records.

    A feature set is one combination of grouped education / marital / job values,
    a default / loan / housing flag, one balance query and one age query. Every
    feature set matching at least ``min_row_fs`` rows of ``train_df`` gets an entry
    in both returned dictionaries, keyed by the same tuple.

    Parameters
    ----------
    train_df : DataFrame filtered with ``DataFrame.query`` and passed to the helpers.
    min_row_fs : minimum number of matching training rows for a feature set to be kept.
    age_query_strings, balance_query_strings : iterables of ``DataFrame.query``
        expressions; the strings themselves become part of the feature-set key.
    test_df : optional frame handed to ``find_all_cust_feature_set``. When omitted,
        the module-level ``test_df`` is used, which is what this function always
        relied on implicitly.
    max_calls : optional maximum call number handed to the helpers. When omitted,
        the module-level ``max_calls`` is used (same historical behaviour).

    Returns
    -------
    (combs_to_consider, fs_pick) : tuple of two dicts keyed by
        ``(education, job, marital, default, loan, housing, bal_query, age_query)``.
    """
    # Make the previously hidden dependencies on notebook globals explicit while
    # keeping existing four-argument calls working.
    if test_df is None:
        test_df = globals()['test_df']
    if max_calls is None:
        max_calls = globals()['max_calls']

    # At this point, we can run computations for the success rate of each sub attribute and join
    # the sub-attributes based on the output of k-means.
    # The helper calls are kept in their original order (education, job, marital).

    # Education.
    all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_ed, train_df, 'education')
    education_cmbs = find_combinations(all_ed, metric_vals)

    # Occupation.
    all_jobs = ['student', 'retired', 'unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
    metric_vals = compute_metric_for_each_attribute(all_jobs, train_df, 'job')
    job_cmbs = find_combinations(all_jobs, metric_vals)

    # Marital.
    all_ms = ['married', 'single', 'divorced']
    metric_vals = compute_metric_for_each_attribute(all_ms, train_df, 'marital')
    marital_cmbs = find_combinations(all_ms, metric_vals)

    # Default, loan and housing are binary flags and are never grouped.
    default_cmbs = [['no'], ['yes']]
    loan_cmbs = [['no'], ['yes']]
    housing_cmbs = [['no'], ['yes']]

    # The position of each entry matters: construct_dict receives one element of
    # the cartesian product in exactly this order.
    poss = [education_cmbs, marital_cmbs, job_cmbs, default_cmbs, loan_cmbs, housing_cmbs]
    all_combs = list(itertools.product(*poss))

    # print("Number of combinations: ", len(all_combs)* len(age_query_strings) * len(balance_query_strings))

    # We can now go ahead and generate the feature sets based on what was done previously.
    combs_to_consider = {}
    fs_pick = {}

    # Setting up looping structures to generate all possibilities.
    for age_query in age_query_strings:
        age_filtered_df = train_df.query(age_query)
        for bal_query in balance_query_strings:
            age_bal_filtered_df = age_filtered_df.query(bal_query)
            for comb in all_combs:
                dict_final_query = construct_dict(comb)
                extracted_df = extract_rows_feature_set(age_bal_filtered_df, dict_final_query)
                n_rows = extracted_df.shape[0]
                if n_rows < min_row_fs:
                    # Too few training rows for this feature set: skip it.
                    continue
                key = (dict_final_query['education'], dict_final_query['job'],
                       dict_final_query['marital'], dict_final_query['default'],
                       dict_final_query['loan'], dict_final_query['housing'],
                       bal_query, age_query)
                results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
                combs_to_consider[key] = {
                                            'max_loc':0,
                                            'best_rate':0,
                                            'overall_rate':results[max_calls-1]['expected'],
                                            'n_rows':n_rows,
                                            'results':results,
                                            'fs_customers':None
                                         }
                fs_pick[key] = {
                                'grad': 0.0,
                                'loc':0,
                                'finished':False,
                                'hull_points':None,
                                'max_num_pts': -1,
                                'results':None,
                                'fs_customers' :None
                               }

    # Second pass: attach the matching test customers and the hull points to every
    # retained feature set.
    for fs_key in combs_to_consider.keys():
        fs_customers = find_all_cust_feature_set(fs_key, test_df)
        combs_to_consider[fs_key]['fs_customers'] = fs_customers
        res = construct_hull_points(combs_to_consider[fs_key]['results'], max_calls)
        # Both dictionaries share the same 'results' object.
        fs_pick[fs_key]['results'] = combs_to_consider[fs_key]['results']
        fs_pick[fs_key]['fs_customers'] = fs_customers
        if res is False:
            # construct_hull_points signals failure with False; such a feature set
            # is marked as finished from the start.
            fs_pick[fs_key]['finished'] = True
        else:
            fs_pick[fs_key]['hull_points'] = res
            fs_pick[fs_key]['max_num_pts'] = len(res) - 1
    return combs_to_consider, fs_pick

In [60]:
%%time
# Configuration and data loading: age/balance buckets used to build the feature
# combinations, the maximum call number, and the filtered marketing data.

# Query Strings with filename being 'mod_1' 
# age_query_strings = ['age >= 10 & age <= 34', 'age >= 35 & age <= 45', 'age >= 46']
# age_ranges_2 = [(10, 34), (35, 45), (46, 100)]
# balance_query_strings = ['balance <= 450',' balance > 450']
# balance_ranges_2 = [(-10000, 450), (451, 105000)]

# Query Strings with filename being 'mod_2' 
age_query_strings_1 = [
    'age >= 18 & age <= 30',
    'age >= 31 & age <= 47',
    'age >= 48 & age <= 64',
    'age >= 65',
]
age_ranges_1 = [(18,30), (31, 47), (48,64), (65,100)]
balance_query_strings_1 = [
    'balance <= 450',
    ' balance > 450',
]
balance_ranges_1 = [(-10000, 450), (451, 105000)]

# One tuple per bucket configuration: (age queries, age ranges, balance queries, balance ranges).
age_balance_ranges = [
    (age_query_strings_1, age_ranges_1, balance_query_strings_1, balance_ranges_1),
]

# Max call number to consider.
max_calls = 20

# Load the raw data from the current working directory.
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')

# Keep rows whose 'campaign' value lies in [1, max_calls] and only the columns used below.
campaign_col = mkt_df['campaign']
campaign_in_range = (campaign_col >= 1) & (campaign_col <= max_calls)
kept_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance', 'campaign', 'y']
mkt_df_filtered = mkt_df.loc[campaign_in_range, kept_columns]
print(mkt_df.shape)

(45211, 17)
CPU times: user 46.2 ms, sys: 14 µs, total: 46.2 ms
Wall time: 44 ms


In [61]:
# Build the 5-fold cross-validation splits once so that every approach is evaluated
# on the same train/test partitions.
# A fixed random_state makes the shuffled folds (and everything computed from them)
# reproducible across kernel restarts; without it each run produced different folds.
KFOLD_SEED = 42
kf = KFold(n_splits=5, shuffle=True, random_state=KFOLD_SEED)

# Each entry is a (train_index, test_index) pair of positional row indices.
fold_data = list(kf.split(mkt_df_filtered))

In [62]:
# Inspect the (train_index, test_index) pairs collected from kf.split.
fold_data

[(array([    0,     1,     2, ..., 44964, 44965, 44966]),
  array([   12,    21,    30, ..., 44956, 44957, 44958])),
 (array([    1,     2,     3, ..., 44961, 44963, 44965]),
  array([    0,     6,    13, ..., 44962, 44964, 44966])),
 (array([    0,     1,     2, ..., 44964, 44965, 44966]),
  array([    4,     5,    10, ..., 44954, 44955, 44961])),
 (array([    0,     2,     3, ..., 44962, 44964, 44966]),
  array([    1,     8,     9, ..., 44943, 44963, 44965])),
 (array([    0,     1,     4, ..., 44964, 44965, 44966]),
  array([    2,     3,     7, ..., 44927, 44931, 44960]))]

In [63]:
%%time

# Main code ... orchestrates everything!
# For every cross-validation fold: build the feature sets from the training rows,
# run each enabled approach, and store the outcomes in `phase_batch` under the
# 1-based fold number.
phase_batch = {}
i = 0

for fold in fold_data:

    # Row positions of this fold's training and test rows.
    train_index, test_index = fold

    # Reset the per-phase slots; phases that are disabled below stay None.
    result_ratios_p1 = result_ratios_p2 = result_ratios_p3 = None
    result_ratios_p4 = result_ratios_p5 = result_ratios_p6 = None
    result_ratios_p7 = result_ratios_p8 = result_ratios_p9 = None

    i += 1
    print("At fold number: ", i)

    # NOTE: `test_df` must keep this exact module-level name, because
    # construct_feature_combs looks it up as a global when it is not passed in.
    train_df = mkt_df_filtered.iloc[train_index]
    test_df = mkt_df_filtered.iloc[test_index]

    feature_combs = construct_feature_combs(train_df, 0, age_query_strings_1, balance_query_strings_1)
    combs_to_consider, fs_pick = feature_combs

    # Testing Phase 1 -> Baseline test with shuffling of all customers and calling them ..
    results_p1, num_calls = call_everyone(test_df)

    # Testing Phase 2 -> Order how we call customers - based on the overall s/c ratio ..
    results_p2 = greedy_approach(combs_to_consider)

    # Testing Phase 4 -> Convex Hull - Gradient Ascent Approach ..
    results_p4 = convex_hull(fs_pick, num_calls)

#     # Testing Phase 5 -> If we were godlike and knew all ..
#     result_ratios_p5 = upper_bound(test_df)

#     # New Stuff
#     result_ratios_p6, groups1 = clustering_age_balance_grouped(mkt_df_filtered, train_index, test_index, 20, balance_ranges_1, age_ranges_1)

#     # result_ratios_p7, groups2 = clustering_age_balance_not_grouped(mkt_df_filtered, train_index, test_index, 20, balance_ranges_2, age_ranges_2)

#     # result_ratios_p8, groups3 = new_approach_ratio_grouping_percentage(mkt_df_filtered, train_index, test_index, 500, age_ranges_2, balance_ranges_2)

#     result_ratios_p7 = xgboost_multiclass(mkt_df_filtered, train_index, test_index)

#     result_ratios_p8 = decision_tree_multiclass(mkt_df_filtered, train_index, test_index)

    # Add all results together for this fold.
    phase_batch[i] = {
        'p1': results_p1,
        'p2': results_p2,
        'p4': results_p4,
        'p5': result_ratios_p5,
        'p6': result_ratios_p6,
        'p7': result_ratios_p7,
        'p8': result_ratios_p8,
        'p9': result_ratios_p9,
    }

# with open('full_max20.json', 'w') as fp:
#     json.dump(phase_batch, fp)

At fold number:  1
Call all Customers Approach
Greedy Approach
Gradient Ascent Approach
Performing initial update .. 
Perforiming sort .. 
Finished sort .. in while loop 
At fold number:  2
Call all Customers Approach
Greedy Approach
Gradient Ascent Approach
Performing initial update .. 
Perforiming sort .. 
Finished sort .. in while loop 
At fold number:  3
Call all Customers Approach
Greedy Approach
Gradient Ascent Approach
Performing initial update .. 
Perforiming sort .. 
Finished sort .. in while loop 
At fold number:  4
Call all Customers Approach
Greedy Approach
Gradient Ascent Approach
Performing initial update .. 
Perforiming sort .. 
Finished sort .. in while loop 
At fold number:  5
Call all Customers Approach
Greedy Approach
Gradient Ascent Approach
Performing initial update .. 
Perforiming sort .. 
Finished sort .. in while loop 
CPU times: user 6min 12s, sys: 669 ms, total: 6min 13s
Wall time: 6min 6s


In [None]:
# Scratch check: sort dictionary items by the nested 'ratio' value, highest first.
me = {
    '1': {'ratio': 0.25},
    '2': {'ratio': 0.59},
}
sorted(me.items(), key=lambda item: item[1]['ratio'], reverse=True)

In [64]:
# Inspect the baseline result kept from the last fold processed
# (the first value returned by call_everyone(test_df)).
results_p1

([0,
  50,
  102,
  150,
  201,
  251,
  300,
  350,
  400,
  450,
  504,
  1003,
  1502,
  2001,
  2500,
  3000,
  3504,
  4000,
  4500,
  5014,
  5500,
  6004,
  6501,
  7005,
  7503,
  8001,
  8503,
  9000,
  9507,
  10004,
  10500,
  11005,
  11501,
  12000,
  12502,
  13001,
  13500,
  14001,
  14501,
  15002,
  15500,
  16010,
  16509,
  17000,
  17501,
  18002,
  18500,
  19001,
  19505,
  20003,
  20500,
  21001,
  21501,
  22003,
  22503,
  23003,
  23501,
  23751],
 [0,
  2,
  4,
  7,
  9,
  11,
  16,
  19,
  24,
  25,
  28,
  50,
  72,
  98,
  125,
  149,
  174,
  212,
  225,
  257,
  278,
  297,
  326,
  347,
  375,
  390,
  423,
  453,
  475,
  507,
  531,
  554,
  573,
  599,
  620,
  641,
  659,
  684,
  717,
  731,
  750,
  769,
  787,
  809,
  835,
  861,
  889,
  910,
  923,
  936,
  959,
  985,
  1012,
  1039,
  1059,
  1076,
  1102,
  1117],
 13635489.0)

In [40]:
# Persist the per-fold results collected in `phase_batch` as JSON.
with open('xg_dt_correct1'+'.json', mode='w') as fp:
    json.dump(obj=phase_batch, fp=fp)

In [39]:
# Rewrite every pair stored under 'p9' as a tuple of built-in ints
# (presumably so the results can be serialised with json - confirm).
for key in phase_batch.keys():
    print(key)
    p9_results = phase_batch[key]['p9']
    if p9_results is None:
        # The orchestration cell stores None for 'p9' when that phase is not run;
        # there is nothing to convert, and iterating None would raise TypeError.
        continue
    phase_batch[key]['p9'] = [(int(el[0]), int(el[1])) for el in p9_results]

1
2
3
4
5


In [None]:
# List the keys under which results are stored in `phase_batch`.
for key in phase_batch:
    print(key)

In [11]:
# Sanity check on the index labels of the filtered frame: look every row up by its
# label, and print the labels that are greater than 45173.
for index, row in mkt_df_filtered.iterrows():
    res = mkt_df_filtered.loc[index]
    if index > 45173:
        print(index)

45174
45175
45176
45177
45178
45179
45180
45181
45182
45183
45184
45185
45186
45187
45188
45189
45190
45191
45192
45193
45194
45195
45196
45197
45198
45199
45200
45201
45202
45203
45204
45205
45206
45207
45208
45209
45210


In [10]:
 # Shape of the filtered frame as (rows, columns).
 mkt_df_filtered.shape

(45173, 10)

In [13]:
# .iloc is positional: position 45174 is past the last row of the filtered frame
# (compare with its .shape), so this lookup raises IndexError even though the
# index *label* 45174 exists in the frame.
mkt_df_filtered.iloc[45174]

IndexError: single positional indexer is out-of-bounds