In [19]:
# Imports

import os
import math
import random
import operator as op
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math, itertools
import statistics
import json
import hdbscan

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics.pairwise import euclidean_distances
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from matplotlib.ticker import StrMethodFormatter

from scipy.spatial import ConvexHull

In [20]:
# Helper Functions Across All Methods

def load_file(data_file_path):
    """Read a semicolon-delimited CSV file into a pandas DataFrame.

    Args:
        data_file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(data_file_path, sep=";")
  
    
def plot_graph_new(results, max_calls, list_passed, title, name = "1"):
    """Plot the success-per-call rate for call numbers 1..max_calls and save it as a PDF.

    Args:
        results: Either a list of rates (when ``list_passed`` is truthy) or a
            list of dicts that each carry an ``'expected'`` entry.
        max_calls: Number of call numbers to place on the x axis.
        list_passed: Truthy when ``results`` already holds the y values.
        title: Title placed on the figure.
        name: Stem of the output file; the figure is written to ``<name>.pdf``.
    """
    call_numbers = list(range(1, max_calls + 1))
    if list_passed:
        y_pts = results
    else:
        y_pts = [results[call_idx]['expected'] for call_idx in range(max_calls)]
    print(y_pts)

    plt.title(title)
    plt.plot(call_numbers, y_pts, linewidth=2)
    plt.xlabel("Call Number")
    plt.ylabel("Success Per Call Rate")
    # The y axis is fixed to [0, 0.4] and tick labels show three decimals.
    plt.ylim(0, 0.4)
    rate_axis = plt.gca().yaxis
    rate_axis.set_major_formatter(StrMethodFormatter('{x:,.3f}'))
    plt.xticks(np.arange(1, max_calls + 1, 1))

    output_path = str(name) + ".pdf"
    plt.savefig(output_path)
    plt.close()
    

def div(a,b):
    """Divide ``a`` by ``b``, returning 0.0 when ``int(b)`` is zero.

    Note that the guard truncates ``b`` first, so any divisor whose integer
    part is zero (for example 0.5) also yields 0.0.
    """
    return 0.0 if int(b) == 0 else a / b
    

# Used for creating all possible combinations of the features.
def powerset(iterable):
    """powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Args:
        iterable: Any iterable; it is materialised into a list so that
            generators and other unsized iterables are supported.

    Returns:
        An iterator over tuples, ordered by subset size and then by position.
    """
    s = list(iterable)
    # Only the ``itertools`` module is imported by this notebook, so
    # ``combinations`` must be referenced through it.
    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s) + 1))


def convert(list):
    """Return the elements of the given iterable as a tuple."""
    return (*list,)


def construct_dict(feature_comb):
    """Turn a positional feature combination into a dict keyed by attribute name.

    The positions read from ``feature_comb`` are: 0 -> education, 1 -> marital,
    2 -> job, 3 -> default, 4 -> loan, 5 -> housing. Every value is passed
    through ``convert`` (i.e. becomes a tuple).
    """
    # (attribute, position) pairs, listed in the key order of the result.
    layout = (
        ('education', 0),
        ('job', 2),
        ('marital', 1),
        ('default', 3),
        ('loan', 4),
        ('housing', 5),
    )
    return {attribute: convert(feature_comb[slot]) for attribute, slot in layout}


# This is the new metric (success per call rate).
def compute_expected_succ_per_call_rate_feature_set(fs_df, no_calls_considered):
    """Compute the success-per-call rate for every call cap from 1 to ``no_calls_considered``.

    For a cap ``i``, a customer whose ``campaign`` value is at most ``i``
    contributes all of their calls (and one success if ``y == "yes"``); any
    other customer contributes exactly ``i`` calls and no success.

    Args:
        fs_df: DataFrame with ``campaign`` and ``y`` columns.
        no_calls_considered: Largest call cap to evaluate.

    Returns:
        A list with one dict per cap (index ``i - 1``), each holding ``'succ'``,
        ``'total_calls'`` and ``'expected'`` (``succ / total_calls`` via ``div``).
    """
    # Extract the two columns once; previously the whole frame was walked with
    # iterrows() again for every call cap.
    outcomes = list(zip(fs_df['campaign'], fs_df['y']))
    expected_values_call_nums = []
    for call_cap in range(1, no_calls_considered + 1):
        succ = 0
        total_calls = 0
        for no_calls, outcome in outcomes:
            if no_calls <= call_cap:
                if outcome == "yes":
                    succ += 1
                total_calls += no_calls
            else:
                total_calls += call_cap
        expected_values_call_nums.append({
            'succ': succ,
            'total_calls': total_calls,
            'expected': div(succ, total_calls),
        })
    return expected_values_call_nums


def compute_optimal_call_no(results):
    """Return the index of the entry with the highest ``'expected'`` value.

    Ties resolve to the earliest index. Returns -1 when the best entry is the
    first one and its ``'expected'`` value is 0.0.
    """
    rates = [entry['expected'] for entry in results]
    best_idx = rates.index(max(rates))
    if best_idx == 0 and rates[0] == 0.0:
        return -1
    return best_idx


# Given a dictionary of what attributes comprise a feature set, we can get all rows corresponding to this feature set.
def extract_rows_feature_set(fs_df, feature_labels = {'education':['tertiary', 'unknown'], 
                                                      'job':['management', 'technician', 'blue-collar'], 
                                                      'marital':['single'], 'default':['no'], 
                                                      'housing':['no'], 'loan':['no']}):
    """Keep only the rows whose attributes fall inside the given feature set.

    Args:
        fs_df: DataFrame holding one column per key of ``feature_labels``.
        feature_labels: Maps a column name to the labels accepted for that
            column. A row is kept when, for every key, its value is one of the
            listed labels. The default dict is only read, never mutated.

    Returns:
        The filtered DataFrame, with rows in their original order.
    """
    # Membership tests replace the hand-built query string: labels containing
    # quotes no longer break the expression, and an empty label list simply
    # matches no rows instead of raising on an empty query.
    for key in feature_labels:
        accepted = list(feature_labels[key])
        fs_df = fs_df[fs_df[key].isin(accepted)]
    return fs_df


def find_matching_attribute_comb(row_value, all_combs):
    """Return the combination in ``all_combs`` that contains ``row_value``.

    When several combinations contain the value, the last one wins. Returns
    ``None`` when no combination contains it.
    """
    containing = [
        group for group in all_combs
        if any(member == row_value for member in group)
    ]
    return containing[-1] if containing else None


def compute_metric(df):
    """Return successes divided by total calls for the rows of ``df``.

    A row counts as a success when ``y == "yes"``; the number of calls is the
    sum of the ``campaign`` column. The division goes through ``div``.
    """
    successes = 0
    calls_made = 0
    for _, customer in df.iterrows():
        calls_made += customer['campaign']
        if customer['y'] == "yes":
            successes += 1
    return div(successes, calls_made)

def compute_metric_2(df, call_cap=None):
    """Return successes divided by total calls, optionally capping calls per customer.

    The previous body called ``min(row['campaign'], )`` with a single scalar
    argument, which raises ``TypeError``. The missing second operand is exposed
    here as ``call_cap``.

    Args:
        df: DataFrame with ``campaign`` and ``y`` columns.
        call_cap: Maximum number of calls charged per customer. ``None`` (the
            default) means no cap, which makes the result equal to
            ``compute_metric(df)``.

    Returns:
        ``total_successes / total_calls`` via ``div``.
    """
    # NOTE(review): every "yes" row is counted as a success even when its
    # campaign value exceeds call_cap - confirm this is the intended metric.
    total_calls = 0
    total_successes = 0
    for loc, row in df.iterrows():
        if row['y'] == "yes":
            total_successes += 1
        calls = row['campaign']
        total_calls += calls if call_cap is None else min(calls, call_cap)
    return div(total_successes, total_calls)


def compute_metric_for_each_attribute(all_values, df, attrib):
    """Evaluate ``compute_metric`` on the rows matching each value of an attribute.

    Args:
        all_values: Values of ``attrib`` to evaluate, one query per value.
        df: DataFrame to filter.
        attrib: Column name compared against each value.

    Returns:
        A float array of shape ``(len(all_values), 1)`` with one metric per value.
    """
    rates = [
        compute_metric(df.query("{0} == '{1}'".format(attrib, label)))
        for label in all_values
    ]
    return np.array(rates, dtype=float).reshape(-1, 1)


def compute_metric_for_each_attribute_range(all_values, df, attrib):
    """Evaluate ``compute_metric`` on the rows falling inside each value range.

    Args:
        all_values: Sequence of ``(lower, upper)`` pairs; a row matches when
            ``lower <= attrib < upper``.
        df: DataFrame to filter.
        attrib: Numeric column name the ranges apply to.

    Returns:
        A tuple ``(metric_vals, query_strings)``: a float array of shape
        ``(len(all_values), 1)`` and the query string used for each range.
    """
    query_strings = [
        "{0} >= {1} & {2} < {3}".format(attrib, bounds[0], attrib, bounds[1])
        for bounds in all_values
    ]
    rates = [compute_metric(df.query(range_query)) for range_query in query_strings]
    return np.array(rates, dtype=float).reshape(-1, 1), query_strings


def find_combinations(sub_attributes, ratios, random_state=None):
    """Group sub-attributes whose ratios are similar, using K-Means on the ratios.

    K-Means is run for every cluster count from 2 to ``len(ratios) - 1`` and the
    clustering with the highest silhouette score is kept (the smallest cluster
    count wins ties).

    Args:
        sub_attributes: Labels, aligned by position with ``ratios``.
        ratios: NumPy array with one ratio per label; reshaped to a column.
        random_state: Optional seed forwarded to ``KMeans`` so the grouping can
            be reproduced. ``None`` keeps the library default.

    Returns:
        A list of groups, each a list of labels converted to ``str``.
    """
    num_iter = len(ratios)
    # Silhouette scoring needs between 2 and n-1 clusters, so with fewer than
    # three values there is nothing to search; keep each label in its own group
    # instead of failing on an empty score list.
    if num_iter < 3:
        return [[str(sub_attributes[index])] for index in range(num_iter)]

    samples = ratios.reshape(-1, 1)
    best_score = None
    best_labels = None
    best_clust_num = None
    for clust_num in range(2, num_iter):
        kmeans = KMeans(n_clusters=clust_num, random_state=random_state)
        kmeans.fit(samples)
        labels = kmeans.labels_
        score = silhouette_score(samples, labels, metric='euclidean')
        # Strict comparison keeps the earliest (smallest) cluster count on ties.
        if best_score is None or score > best_score:
            best_score = score
            best_labels = labels
            best_clust_num = clust_num

    # Place every label into the group of the cluster it was assigned to.
    joined_sub_attributes = [[] for _ in range(best_clust_num)]
    for index, cluster_id in enumerate(best_labels):
        joined_sub_attributes[cluster_id].append(str(sub_attributes[index]))
    return joined_sub_attributes

# The following shows how this method should be called (it takes exactly two arguments).
# find_combinations(['a', 'b', 'c', 'd'], np.array([1, 4, 7, 90]))

def find_all_cust_feature_set(fs, df):
    """Return every row of ``df`` that belongs to the feature set ``fs``.

    ``fs`` is read positionally: entries 0-5 are the accepted labels for
    education, job, marital, default, loan and housing; entries 6 and 7 are
    query strings applied to ``df`` (in that order) before the label filter.
    """
    attribute_names = ('education', 'job', 'marital', 'default', 'loan', 'housing')
    comb = {attribute: fs[position] for position, attribute in enumerate(attribute_names)}
    prefiltered = df.query(fs[6]).query(fs[7])
    return extract_rows_feature_set(prefiltered, comb)


def construct_hull_points(results, max_calls):
    """Find which call numbers lie on the convex hull of (total_calls, succ) points.

    Args:
        results: Sequence of dicts carrying ``'succ'`` and ``'total_calls'``;
            only the first ``max_calls`` entries are used.
        max_calls: Number of entries of ``results`` to turn into points.

    Returns:
        A sorted list of indices into ``results`` that are hull vertices, always
        including the last index ``max_calls - 1``; or ``False`` when the hull
        cannot be built (for example when the points are degenerate).
    """
    pts = np.array([
        [results[x]['total_calls'], results[x]['succ']]
        for x in range(0, max_calls)
    ])
    try:
        verts = ConvexHull(pts).vertices
    except Exception:
        # Hull construction failed; ``Exception`` rather than a bare ``except``
        # so that KeyboardInterrupt / SystemExit still propagate.
        return False
    # The final call number is always kept, even when it is not a hull vertex.
    if not np.isin(max_calls - 1, verts):
        verts = np.append(max_calls - 1, verts)
    return np.sort(verts).tolist()

    
def gradient_update(key, fs_pick):
    """Refresh the ``'grad'`` entry of one feature set, or mark it finished.

    The gradient is the success-per-call slope between the hull point at the
    feature set's current ``'loc'`` and the previous hull point (or the origin
    when ``loc`` is 0). When ``loc`` exceeds ``'max_num_pts'`` the feature set
    gets ``'finished' = True`` and its gradient is left untouched.

    Args:
        key: Key of the feature set inside ``fs_pick``.
        fs_pick: Dict of feature-set records; the record is updated in place.
    """
    # NOTE(review): the ``loc <= max_num_pts`` bound indexes hull_points[loc];
    # confirm max_num_pts is the last valid index rather than the length.
    record = fs_pick[key]
    outcomes = record['results']
    hull = record['hull_points']
    position = record['loc']
    last_position = record['max_num_pts']

    if position > last_position:
        record['finished'] = True
        return

    current = outcomes[hull[position]]
    if position == 0:
        slope = div(current['succ'], current['total_calls'])
    else:
        previous = outcomes[hull[position - 1]]
        slope = div(current['succ'] - previous['succ'],
                    current['total_calls'] - previous['total_calls'])
    record['grad'] = slope

        
def get_features(row, feature_names):
    """Return the feature names whose matching position in ``row`` equals 1.

    ``row`` is read by position for every entry of ``feature_names``; values are
    converted with ``int`` before the comparison.
    """
    return [
        name
        for position, name in enumerate(feature_names)
        if int(row[position]) == 1
    ]

In [21]:
# This cell holds functions that are utilized by each of the methods defined.

def group_age(row, age_ranges):
    """Map a row's age onto the 1-based number of the range that contains it.

    Range bounds are inclusive on both ends. When several ranges contain the
    age, the last one wins. Prints a message and returns ``None`` when no range
    matches.
    """
    age = int(row['age'])
    age_val = None
    for bucket_no, bounds in enumerate(age_ranges, start=1):
        if bounds[0] <= age and age <= bounds[1]:
            age_val = bucket_no
    if age_val is None:
        print("Failed Assignment for age: ", age)
    return age_val
        

def group_balance(row, balance_ranges):
    """Map a row's balance onto the 1-based number of the range that contains it.

    Range bounds are inclusive on both ends. When several ranges contain the
    balance, the last one wins. Prints a message and returns ``None`` when no
    range matches.
    """
    bal = int(row['balance'])
    bal_val = None
    for bucket_no, bounds in enumerate(balance_ranges, start=1):
        if bounds[0] <= bal and bal <= bounds[1]:
            bal_val = bucket_no
    if bal_val is None:
        print("Failed Assignment for balance: ", bal)
    return bal_val


def group_feature(df, col_name, func, ranges):
    """Overwrite ``df[col_name]`` in place, row by row, with ``func(row, ranges)``.

    Args:
        df: DataFrame to modify in place.
        col_name: Column that receives the computed values.
        func: Callable taking ``(row, ranges)`` and returning the new value.
        ranges: Passed through to ``func`` unchanged.
    """
    for label, record in df.iterrows():
        bucket = func(record, ranges)
        df.loc[label, col_name] = bucket

def compute_ratio_all_users(df, train_indicies):
    """Compute a per-customer success ratio for the given row positions.

    A customer with ``y == "yes"`` gets ``1 / campaign`` (via ``div``); every
    other customer gets 0.0.

    Args:
        df: DataFrame with ``campaign`` and ``y`` columns.
        train_indicies: Positional row indices (used with ``iloc``).

    Returns:
        A list of ``(position, ratio)`` tuples in the order of ``train_indicies``.
    """
    ratio_values = []
    for position in train_indicies:
        customer = df.iloc[position]
        ratio = div(1, customer['campaign']) if customer['y'] == "yes" else 0.0
        ratio_values.append((position, ratio))
    return ratio_values


def compute_freq_percentage(mappings):
    """Fill in each mapping's ``'percentage'`` as its share of the total ``'freq'``.

    ``mappings`` is updated in place; the share is computed with ``div``, so a
    zero total yields 0.0 for every entry.
    """
    total = sum(entry['freq'] for entry in mappings.values())
    for entry in mappings.values():
        entry['percentage'] = div(entry['freq'], total)

In [26]:
# Each function represents each method attempted.

def call_everyone(test_df):
    """Baseline: call every customer in random order and track cumulative results.

    Customers are visited in a random permutation of ``test_df``. Before a
    customer is processed, if the running call count has reached the current
    checkpoint, the running ``(successes, calls)`` pair is recorded and the next
    checkpoint becomes current (at most one checkpoint per customer). The final
    totals are always appended at the end.

    Args:
        test_df: DataFrame with ``campaign`` and ``y`` columns.

    Returns:
        A tuple ``(result_ratios_p1, num_calls)``: the recorded
        ``(num_succ, num_calls)`` pairs and the total number of calls.
    """
    print("Call all Customers Approach")
    # Checkpoints: every 50 calls up to 500, then every 500 calls up to 139,500
    # (the same values as the previous hard-coded list).
    call_check_points = list(range(50, 501, 50)) + list(range(1000, 139501, 500))
    result_ratios_p1 = []
    cp_loc = 0
    num_succ = 0
    num_calls = 0
    res = test_df.reindex(np.random.permutation(test_df.index))
    for loc, row in res.iterrows():
        # Guard the index: once every checkpoint has been passed there is
        # nothing left to compare against (previously an IndexError).
        if cp_loc < len(call_check_points) and num_calls >= call_check_points[cp_loc]:
            cp_loc += 1
            result_ratios_p1.append((num_succ, num_calls))
        num_calls += row['campaign']
        if row['y'] == "yes":
            num_succ += 1
    result_ratios_p1.append((num_succ, num_calls))
    return result_ratios_p1, num_calls


def greedy_approach(combs_to_consider):
    """Call feature sets in descending ``'overall_rate'`` order and track totals.

    Args:
        combs_to_consider: Dict of feature-set records, each with an
            ``'overall_rate'`` and an ``'fs_customers'`` DataFrame that has
            ``campaign`` and ``y`` columns.

    Returns:
        A list with one cumulative ``(num_succ, num_calls)`` tuple per feature
        set, in the order the feature sets were called.
    """
    print("Greedy Approach")
    ranked = sorted(combs_to_consider.items(), key=lambda fs: fs[1]['overall_rate'], reverse=True)
    num_succ = 0
    num_calls = 0
    result_ratios_p2 = []
    for _, feature_set in ranked:
        customers = feature_set['fs_customers']
        for _, cust in customers.iterrows():
            num_calls += cust['campaign']
            if cust['y'] == "yes":
                num_succ += 1
        # One checkpoint after each feature set has been fully called.
        result_ratios_p2.append((num_succ, num_calls))
    return result_ratios_p2


def convex_hull(fs_pick, num_calls):
    """Greedily call feature sets by gradient until the call budget is exceeded.

    Every record in ``fs_pick`` is read for the keys ``'grad'``, ``'finished'``,
    ``'loc'``, ``'hull_points'`` and ``'fs_customers'``; ``'loc'`` is advanced in
    place and ``gradient_update`` refreshes ``'grad'`` / ``'finished'``.

    On each pass the unfinished feature set with the highest gradient is
    selected, the call numbers covered by its next hull segment are simulated
    over its customers, a cumulative checkpoint is recorded, and all feature
    sets are re-sorted.

    Args:
        fs_pick: Dict mapping a feature-set key to its record (mutated in place).
        num_calls: Call budget; the loop continues while the running call count
            is less than or equal to this value.

    Returns:
        A list of cumulative ``(total_s, total_c)`` tuples, one per processed
        hull segment.
    """
    print("Gradient Ascent Approach")
    result_ratios_p4 = []
    total_s = 0
    total_c = 0
    print("Performing initial update .. ")
    for key in fs_pick.keys():
        gradient_update(key, fs_pick)
    print("Perforiming sort .. ")
    # Sort based on gradient.
    # NOTE(review): this reads 'grad' and (below) 'finished' on every record, so
    # both keys presumably are initialised by the caller - confirm.
    optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
    # Call best feature set, update gradient for this feature set and re-sort all feature sets.
    # Rinse and repeat!
    print("Finished sort .. in while loop ")
    while(total_c <= num_calls):
        # Skip over feature sets that are already finished; stop when none remain.
        best_loc = 0
        while(best_loc < len(optimal_choices) and optimal_choices[best_loc][1]['finished'] == True):
            best_loc += 1
        if best_loc == len(optimal_choices):
            break
        fs_key = optimal_choices[best_loc][0]
        fs_data = optimal_choices[best_loc][1]
        # Always true here, because finished entries were skipped above.
        if fs_data['finished'] == False:
            loc = fs_data['loc']
            # hull_points holds 0-based indices, so +1 converts an index into a
            # call number; the segment starts one call after the previous hull point.
            if loc == 0:
                call_start = 1
                call_end = fs_data['hull_points'][loc] + 1
            else:
                call_start = fs_data['hull_points'][loc-1] + 2
                call_end = fs_data['hull_points'][loc] + 1
            for call in range(call_start, call_end + 1, 1):
                # The inner loop variable ``loc`` shadows the hull position read
                # above; that position is not used again in this iteration.
                for loc, row in fs_pick[fs_key]['fs_customers'].iterrows():
                    if row['campaign'] == call:
                        # The customer's final call: counts once, plus a success on "yes".
                        total_c += 1
                        if row['y'] == "yes":
                            total_s += 1
                    elif row['campaign'] > call:
                        # Customer needs more calls than this one; it still costs a call.
                        total_c += 1
            result_ratios_p4.append((total_s, total_c))
            # Advance to the next hull segment and refresh this feature set's gradient.
            fs_pick[fs_key]['loc'] += 1
            gradient_update(fs_key, fs_pick)
            optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
    return result_ratios_p4


def upper_bound(test_df, call_limit=None):
    """Best-case ordering: call all successful customers first, cheapest first.

    Customers with ``y == 'yes'`` are processed by ascending ``campaign`` value
    (1..call_limit), then customers with ``y == 'no'`` in the same way. After
    each campaign value the cumulative ``(num_succ, num_calls)`` pair is recorded.

    Args:
        test_df: DataFrame with ``campaign`` and ``y`` columns.
        call_limit: Largest ``campaign`` value to consider. When ``None`` the
            notebook-level global ``max_calls`` is used, which is what this
            function relied on before the parameter existed.

    Returns:
        A list of ``2 * call_limit`` cumulative ``(num_succ, num_calls)`` tuples.
    """
    print("Upper Bound Approach")
    if call_limit is None:
        # Backward-compatible fallback to the global defined elsewhere in the notebook.
        call_limit = max_calls
    num_succ = 0
    num_calls = 0
    result_ratios_p5 = []
    for outcome, is_success in (("yes", True), ("no", False)):
        res_df = test_df.query("y == '{0}'".format(outcome))
        for x in range(1, call_limit + 1):
            num_cust = len(res_df.query("campaign == {0}".format(x)))
            if is_success:
                num_succ += num_cust
            num_calls = num_calls + (num_cust * x)
            result_ratios_p5.append((num_succ, num_calls))
    return result_ratios_p5
    
    
def new_approach_ratio_grouping_percentage(df, train_indicies, test_indicies, group_size, age_groupings, balance_groupings):
    """Rank training customers by success ratio, bucket them, and call test customers by bucket.

    Steps:
      1. Copy ``df`` and replace ``age`` / ``balance`` with their range numbers.
      2. Sort training customers by per-customer ratio (descending) and cut the
         sorted list into consecutive groups of ``group_size``.
      3. For every group, count how often each feature combination occurs,
         convert the counts into percentages, and compute the success-per-call
         results for call caps 1-20.
      4. Assign each test customer to the group in which their feature
         combination has the highest percentage; customers whose combination
         was never seen in training are skipped and counted as missed.
      5. Call the groups in descending order of their 20-call expected rate and
         record cumulative ``(num_succ, num_calls)`` after each group.

    Two defects of the previous version are fixed here: the first occurrence of
    a feature combination was not counted (frequencies were one too low), and
    the best-group comparison read the current best group's percentage instead
    of the candidate's, so the first matching group always won.

    Args:
        df: Full DataFrame (customer attributes plus ``campaign`` and ``y``).
        train_indicies: Positional indices of the training rows.
        test_indicies: Positional indices of the test rows.
        group_size: Number of training customers per group.
        age_groupings: Inclusive ``(low, high)`` ranges for ``group_age``.
        balance_groupings: Inclusive ``(low, high)`` ranges for ``group_balance``.

    Returns:
        A tuple ``(result_ratios, groupings)``.
    """
    # Ensuring that we can binary encode any row in our dataset. We also group age and balance values
    # from each row into ranges.
    print("Step 1")
    df_copy = df.copy(deep=True)
    group_feature(df_copy, "age", group_age, age_groupings)
    group_feature(df_copy, "balance", group_balance, balance_groupings)
    # Build the customers for each group.
    print("Step 2")
    ratio_arr = compute_ratio_all_users(df_copy, train_indicies)
    ratio_arr_sorted = sorted(ratio_arr, key=lambda tup: tup[1], reverse = True)
    groupings = {}
    for loc, ranked_customer in enumerate(ratio_arr_sorted):
        group_key = str(int(loc/group_size))
        if group_key not in groupings:
            groupings[group_key] = {'indicies':[], 'mappings':{}, 'results':None}
        groupings[group_key]['indicies'].append(ranked_customer[0])
    print(len(groupings.keys()))
    # For each group, we find the unique feature combinations and store them in a list.
    # We also store the results - s/c ratio for call numbers from 1-20.
    test_calls = {}
    print("Step 3")
    for group_key in groupings.keys():
        users_df = df_copy.iloc[groupings[group_key]['indicies']]
        mappings = groupings[group_key]['mappings']
        for row in users_df.itertuples():
            user_mapping = str((row.job, row.marital, row.education, row.default,
                               row.housing, row.loan, row.age, row.balance))
            if user_mapping not in mappings:
                mappings[user_mapping] = {'freq':0, 'percentage':0.0}
            # Count every occurrence, including the first one.
            mappings[user_mapping]['freq'] += 1
        groupings[group_key]['results'] = compute_expected_succ_per_call_rate_feature_set(users_df, 20)
        compute_freq_percentage(mappings)
        test_calls[group_key] = {'locs_to_call':[], 'overall_rate':groupings[group_key]['results'][19]}
    # For the test set, we need to map each user to the most appropriate cluster.
    print("Step 4")
    missed = 0
    for loc in test_indicies:
        row = df_copy.iloc[loc]
        user_mapping = str((row['job'], row['marital'], row['education'], row['default'],
                            row['housing'], row['loan'], row['age'], row['balance']))
        best_group_key = None
        best_ratio = -1.0
        for group_key in groupings:
            candidate = groupings[group_key]['mappings'].get(user_mapping)
            # Compare the candidate group's own percentage; the strict ">"
            # keeps the earliest group on ties.
            if candidate is not None and candidate['percentage'] > best_ratio:
                best_group_key = group_key
                best_ratio = candidate['percentage']
        if best_group_key is not None:
            test_calls[best_group_key]['locs_to_call'].append(loc)
        else:
            missed += 1
    print("We missed:", missed)
    # Call users ... those with the highest ratios are called first.
    test_calls_sorted = sorted(test_calls.items(), key=lambda fs: fs[1]['overall_rate']['expected'], reverse = True)
    print("Step 5")
    num_succ = 0
    num_calls = 0
    result_ratios = []
    for test_call in test_calls_sorted:
        for cust_loc in test_call[1]['locs_to_call']:
            row = df_copy.iloc[cust_loc]
            if row['y'] == "yes":
                num_succ += 1
            num_calls += int(row['campaign'])
        result_ratios.append((num_succ, num_calls))
    return result_ratios, groupings


def clustering_approach_abstracted(clusterer, feature_names, train_df, test_df, train_df_encoded, test_df_encoded):
    """Order test customers by the training performance of their predicted cluster.

    Training rows are grouped by ``clusterer.labels_``; each group records its
    distinct encoded feature combinations and its success-per-call results for
    call caps 1-20. Each test row is assigned a cluster with
    ``hdbscan.approximate_predict`` and the groups are then called in
    descending order of their 20-call expected rate.

    Args:
        clusterer: Fitted HDBSCAN clusterer (fit with prediction data).
        feature_names: Names aligned with the leading columns of the encoded rows.
        train_df: Training DataFrame with ``campaign`` and ``y`` columns.
        test_df: Test DataFrame with ``campaign`` and ``y`` columns.
        train_df_encoded: Encoded training rows, positionally aligned with ``train_df``.
        test_df_encoded: Encoded test rows, positionally aligned with ``test_df``.

    Returns:
        A tuple ``(result_ratios, groupings)``: cumulative
        ``(total_s, total_c)`` after each group, and the per-cluster records.
    """
    groupings = {}
    predictions = clusterer.labels_
    # We assign to each group, the similar indicies. This was based on the clustering approach.
    for index, group in enumerate(predictions):
        label = str(group)
        if label not in groupings:
            groupings[label] = {'train_indicies':[], 'unique_keys':{}, 'results':None, 'test_indicies':[]}
        groupings[label]['train_indicies'].append(index)
    print("Check 1")
    # For all customers belonging to each grouping, we find the unique keys and compute the success per call
    # ratio for call numbers 1-20.
    for group in groupings.keys():
        unique_keys = groupings[group]['unique_keys']
        for index in groupings[group]['train_indicies']:
            cust_key = str(get_features(train_df_encoded[index], feature_names))
            if cust_key not in unique_keys:
                unique_keys[cust_key] = {'#_ocurr': 1}
            else:
                unique_keys[cust_key]['#_ocurr'] += 1
        results = compute_expected_succ_per_call_rate_feature_set(train_df.iloc[groupings[group]['train_indicies']], 20)
        groupings[group]['results'] = results
    print("Check 2")
    # This process makes use of the test set and determines the ideal cluster for a customer.
    for index in range(0, len(test_df_encoded), 1):
        encoded_customer_data = test_df_encoded[index]
        test_labels, strengths = hdbscan.approximate_predict(clusterer, [encoded_customer_data])
        label = str(test_labels[0])
        if label not in groupings:
            # The predicted label never occurred among the training labels.
            # Give it an empty record whose results come from zero training
            # rows (all rates 0.0) so these customers are still called instead
            # of raising KeyError.
            groupings[label] = {
                'train_indicies': [],
                'unique_keys': {},
                'results': compute_expected_succ_per_call_rate_feature_set(train_df.iloc[[]], 20),
                'test_indicies': [],
            }
        groupings[label]['test_indicies'].append(index)
    print("Check 3")
    # Perform sorting of groups based on success per call rate.
    sorted_final_call = {k: v for k, v in sorted(groupings.items(), key=lambda item: item[1]['results'][19]['expected'], reverse = True)}
    print("Check 4")
    # Go about calling customers, keep track of the success per call rate as we switch from group to group.
    total_s = 0
    total_c = 0
    result_ratios = []
    for group in sorted_final_call:
        for cust_index in sorted_final_call[group]['test_indicies']:
            row = test_df.iloc[cust_index]
            if row['y'] == "yes":
                total_s += 1
            total_c += int(row['campaign'])
        result_ratios.append((total_s, total_c))
    return result_ratios, groupings
        
        
# The encoding process can also be varied to not include age and balance.
def clustering_age_balance_grouped(train_df, test_df, min_cluster_size, balance_groupings, age_groupings):
    """Cluster customers with HDBSCAN after bucketing age and balance into ranges.

    All attribute columns (everything except ``y`` and ``campaign``) are one-hot
    encoded. The encoder is fitted on the training data only and then reused to
    transform the test data, so both sets share one feature space; re-fitting on
    the test set could produce columns that do not line up with the space the
    clusterer was trained on.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        balance_groupings: Inclusive ranges used by ``group_balance``.
        age_groupings: Inclusive ranges used by ``group_age``.

    Returns:
        Whatever ``clustering_approach_abstracted`` returns:
        ``(result_ratios, groupings)``.
    """
    print("HDBScan Clustering No Ratio - Using Approximate Predict Function from HDBScan Library")
    train_df_copy = train_df.copy(deep = True)
    test_df_copy = test_df.copy(deep = True)
    group_feature(train_df_copy, "age", group_age, age_groupings)
    group_feature(train_df_copy, "balance", group_balance, balance_groupings)
    train_features = train_df_copy.drop(columns=['y', 'campaign'])
    # handle_unknown='ignore': a category that only appears in the test set is
    # encoded as all zeros instead of raising during transform().
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_df_encoded = encoder.fit_transform(train_features).toarray()
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    clusterer.fit(train_df_encoded)
    group_feature(test_df_copy, "age", group_age, age_groupings)
    group_feature(test_df_copy, "balance", group_balance, balance_groupings)
    # transform(), not fit_transform(): reuse the categories learned from training.
    test_df_encoded = encoder.transform(test_df_copy.drop(columns=['y', 'campaign'])).toarray()
    # Name the encoded columns after the columns that were actually encoded, so
    # the names cannot drift from the DataFrame's column order.
    input_names = list(train_features.columns)
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out(input_names)
    else:
        # Older scikit-learn releases only provide get_feature_names().
        feature_names = encoder.get_feature_names(input_names)
    return clustering_approach_abstracted(clusterer, feature_names, train_df, test_df, train_df_encoded, test_df_encoded)


# The encoding process can also be varied to not include age and balance.
def clustering_age_balance_not_grouped(train_df, test_df, min_cluster_size, balance_groupings, age_groupings):
    """Cluster customers with HDBSCAN using raw (un-bucketed) age and balance values.

    The categorical columns (everything except ``y``, ``campaign``, ``age`` and
    ``balance``) are one-hot encoded; the raw ``age`` and ``balance`` values are
    appended as two extra numeric columns. The encoder is fitted on the training
    data only and reused for the test data so both share one feature space.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``.
        balance_groupings: Unused here; kept for a signature parallel to
            ``clustering_age_balance_grouped``.
        age_groupings: Unused here; kept for the same reason.

    Returns:
        Whatever ``clustering_approach_abstracted`` returns:
        ``(result_ratios, groupings)``.
    """
    print("HDBScan Clustering No Ratio - Using Approximate Predict Function from HDBScan Library")
    train_df_copy = train_df.copy(deep = True)
    test_df_copy = test_df.copy(deep = True)
    excluded_columns = ['y', 'campaign', 'age', 'balance']
    train_categorical = train_df_copy.drop(columns=excluded_columns)
    # handle_unknown='ignore': a category that only appears in the test set is
    # encoded as all zeros instead of raising during transform().
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_df_encoded = encoder.fit_transform(train_categorical).toarray()
    train_df_encoded = np.column_stack((train_df_encoded, train_df_copy['age'].to_numpy()))
    train_df_encoded = np.column_stack((train_df_encoded, train_df_copy['balance'].to_numpy()))
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True)
    clusterer.fit(train_df_encoded)
    # transform(), not fit_transform(): reuse the categories learned from training.
    test_df_encoded = encoder.transform(test_df_copy.drop(columns=excluded_columns)).toarray()
    test_df_encoded = np.column_stack((test_df_encoded, test_df_copy['age'].to_numpy()))
    test_df_encoded = np.column_stack((test_df_encoded, test_df_copy['balance'].to_numpy()))
    # Names cover only the one-hot columns; the two trailing numeric columns
    # (age, balance) have no entry in feature_names.
    input_names = list(train_categorical.columns)
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out(input_names)
    else:
        # Older scikit-learn releases only provide get_feature_names().
        feature_names = encoder.get_feature_names(input_names)
    return clustering_approach_abstracted(clusterer, feature_names, train_df, test_df, train_df_encoded, test_df_encoded)

# def clustering_age_balance_grouped_ratio_gradient_ascent(train_df, test_df, min_cluster_size, balance_groupings, age_groupings, num_calls):
#     print("In P8")
#     group_age(train_df, age_groupings)
#     group_balance(train_df, balance_groupings)
#     train_df, ratio_df = compute_ratio_all_users(train_df)
#     encoder = OneHotEncoder()
#     train_df_encoded = encoder.fit_transform(train_df.drop(columns=['y', 'campaign'])).toarray()
#     train_df_encoded = np.column_stack((train_df_encoded, ratio_df['ratio'].to_numpy()))
#     feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])
#     clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
#     clusterer.fit(train_df_encoded)
#     predictions = clusterer.labels_
#     test_df = mkt_df_filtered.iloc[test_index]
#     group_age(test_df, age_groupings)
#     group_balance(test_df, balance_groupings)
#     test_df_encoded = encoder.fit_transform(test_df.drop(columns=['y', 'campaign'])).toarray()
# #     return None, None, (None, None)
#     return abstraction_new_approach_gradient_ascent(predictions, feature_names, train_df, test_df, train_df_encoded, test_df_encoded, num_calls)

In [27]:
# This cell represents the logic required for the inital approach taken.
# Yields really good results thus far.
# This should remain untouched.

def construct_feature_combs(train_df):
    """Enumerate candidate feature sets from ``train_df`` and build their bookkeeping dicts.

    Groupings of the education, job and marital values are obtained from
    ``compute_metric_for_each_attribute`` followed by ``find_combinations``;
    default, loan and housing are kept as their two raw values. Every element of
    the Cartesian product of those groupings is evaluated inside every
    (age bucket, balance bucket) slice of ``train_df``, and a feature set is
    retained only when it matches at least ``cp`` training rows.

    Notebook globals read by this function (they are NOT parameters and must be
    defined before the call): ``age_query_strings``, ``balance_query_strings``,
    ``cp``, ``max_calls`` and ``test_df``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; filtered with ``DataFrame.query`` using the global age
        and balance query strings.

    Returns
    -------
    tuple(dict, dict)
        ``(combs_to_consider, fs_pick)``, both keyed by the tuple
        ``(education, job, marital, default, loan, housing, bal_query, age_query)``.
        ``combs_to_consider[key]`` holds ``max_loc``, ``best_rate``,
        ``overall_rate``, ``n_rows``, ``results`` and ``fs_customers``.
        ``fs_pick[key]`` holds ``grad``, ``loc``, ``finished``, ``hull_points``,
        ``max_num_pts``, ``results`` and ``fs_customers``. The ``results`` and
        ``fs_customers`` values are the same objects in both dicts, not copies.
    """
    # At this point, we can run computations for the success rate of each sub attribute and join
    # the sub-attributes based on the output of k-means.

    # Education.
    all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_ed, train_df, 'education')
    education_cmbs = find_combinations(all_ed, metric_vals)

    # Occupation.
    all_jobs = ['student', 'retired', 'unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
    metric_vals = compute_metric_for_each_attribute(all_jobs, train_df, 'job')
    job_cmbs = find_combinations(all_jobs, metric_vals)

    # Marital.
    all_ms = ['married', 'single', 'divorced']
    metric_vals = compute_metric_for_each_attribute(all_ms, train_df, 'marital')
    marital_cmbs = find_combinations(all_ms, metric_vals)

    # Default, loan and housing are binary and are never merged into groups.
    default_cmbs = [['no'], ['yes']]
    loan_cmbs = [['no'], ['yes']]
    housing_cmbs = [['no'], ['yes']]

    # The position of each attribute here is what construct_dict receives, so the
    # order (education, marital, job, default, loan, housing) must not change.
    poss = [education_cmbs, marital_cmbs, job_cmbs, default_cmbs, loan_cmbs, housing_cmbs]
    all_combs = list(itertools.product(*poss))

    # We can now go ahead and generate the feature sets based on what was done previously.
    combs_to_consider = {}
    fs_pick = {}

    # Evaluate every attribute combination inside every age/balance slice.
    for age_query in age_query_strings:
        df_filtered_final = train_df.query(age_query)
        for bal_query in balance_query_strings:
            df_filtered_final_2 = df_filtered_final.query(bal_query)
            for comb in all_combs:
                dict_final_query = construct_dict(comb)
                extracted_df = extract_rows_feature_set(df_filtered_final_2, dict_final_query)
                key = (dict_final_query['education'], dict_final_query['job'],
                       dict_final_query['marital'], dict_final_query['default'],
                       dict_final_query['loan'], dict_final_query['housing'],
                       bal_query, age_query)
                n_rows = extracted_df.shape[0]
                # Feature sets with fewer than `cp` training rows are discarded.
                if n_rows >= cp:
                    results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
                    combs_to_consider[key] = {
                        'max_loc': 0,
                        'best_rate': 0,
                        # Expected rate at the last considered call number.
                        'overall_rate': results[max_calls-1]['expected'],
                        'n_rows': n_rows,
                        'results': results,
                        'fs_customers': None
                    }
                    fs_pick[key] = {
                        'grad': 0.0,
                        'loc': 0,
                        'finished': False,
                        'hull_points': None,
                        'max_num_pts': -1,
                        'results': None,
                        'fs_customers': None
                    }

    # Second pass: attach the matching test customers and the hull points to every
    # retained feature set. Only nested values are mutated, so iterating the dict
    # while doing so is safe.
    for fs_key, entry in combs_to_consider.items():
        fs_customers = find_all_cust_feature_set(fs_key, test_df)  # global test_df
        entry['fs_customers'] = fs_customers
        res = construct_hull_points(entry['results'], max_calls)
        fs_pick[fs_key]['results'] = entry['results']
        fs_pick[fs_key]['fs_customers'] = fs_customers
        if res is False:
            # construct_hull_points signals failure with the literal False;
            # such feature sets are marked finished up front.
            print("Invalid Convex Hull Assignment")
            fs_pick[fs_key]['finished'] = True
        else:
            fs_pick[fs_key]['hull_points'] = res
            fs_pick[fs_key]['max_num_pts'] = len(res) - 1
    return combs_to_consider, fs_pick

In [32]:
%%time
# Code that sets up values to construct all possible feature combinations.

# Query Strings with filename being 'mod_1' 
# age_query_strings = ['age >= 10 & age <= 34', 'age >= 35 & age <= 45', 'age >= 46']
# age_ranges_2 = [(10, 34), (35, 45), (46, 100)]
# balance_query_strings = ['balance <= 450',' balance > 450']
# balance_ranges_2 = [(-10000, 450), (451, 105000)]

# Query Strings with filename being 'mod_2'
# The *_query_strings are passed to DataFrame.query; the *_ranges_2 lists describe
# the same buckets as numeric (low, high) tuples.
age_query_strings = ['age >= 18 & age <= 30', 'age >= 31 & age <= 47', 'age >= 48 & age <= 64', 'age >= 65']
age_ranges_2 = [(18,30), (31, 47), (48,64), (65,100)]
balance_query_strings = ['balance <= 450',' balance > 450']
balance_ranges_2 = [(-10000, 450), (451, 105000)]

# Max call number to consider.
max_calls = 20

# Columns describing a customer; 'campaign' and 'y' are appended for the full frame.
feature_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance']

# Pull and filter all calls <= max_calls (rows with campaign in [1, max_calls]).
current_dir = os.getcwd()
# os.path.join instead of manual '/' concatenation so the path is built portably.
mkt_df = load_file(os.path.join(current_dir, 'bank-full.csv'))
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls)]
mkt_df_filtered = mkt_df_filtered[feature_columns + ['campaign', 'y']]
mkt_df_filtered_kmeans = mkt_df_filtered[feature_columns]
# Shape of the unfiltered frame as loaded from disk.
print(mkt_df.shape)

(45211, 17)
CPU times: user 40.1 ms, sys: 8.02 ms, total: 48.1 ms
Wall time: 46.9 ms


In [33]:
%%time

# Main code ... orchestrates everything!

# Splitting dataframe into data and result dataframes.
# X is only used to generate the fold indices; y is the last column of the frame.
X = mkt_df_filtered.iloc[:,0:len(mkt_df_filtered.columns)-1]
y = mkt_df_filtered.iloc[:,-1]

cut_points = [20]


def _json_default(obj):
    """Fallback encoder for ``json.dump``: turn NumPy scalars/arrays into built-ins.

    The standard encoder rejects NumPy scalar types such as ``np.int64``; they
    are converted with ``.item()`` and arrays with ``.tolist()``. Anything else
    still raises ``TypeError``, as ``json`` would have done without a fallback.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError("Object of type %s is not JSON serializable" % type(obj).__name__)


# `cp` (and `test_df` below) are module-level names on purpose:
# construct_feature_combs reads them as globals.
for cp in cut_points:

    # NOTE(review): the output file name depends only on `cp`, so widening this
    # range would overwrite the file on every repetition.
    for j in range(1,2):

        phase_batch = {}
        # NOTE(review): shuffle=True without random_state means the folds differ
        # on every run; pass a random_state if reproducible splits are needed.
        kf = KFold(n_splits=5, shuffle=True)
        for i, (train_index, test_index) in enumerate(kf.split(X), start=1):

            # Phases 3 and 9 are not run in this cell; p9 is still recorded (as None).
            result_ratios_p3 = None
            result_ratios_p9 = None

            print("At fold number: ", i)

            train_df = mkt_df_filtered.iloc[train_index]
            test_df = mkt_df_filtered.iloc[test_index]

            combs_to_consider, fs_pick = construct_feature_combs(train_df)

            # Testing Phase 1 -> Baseline test with shuffling of all customers and calling them ..
            result_ratios_p1, num_calls = call_everyone(test_df)

            # Testing Phase 2 -> Order how we call customers - based on the overall s/c ratio ..
            result_ratios_p2 = greedy_approach(combs_to_consider)

            # Testing Phase 4 -> Convex Hull - Gradient Ascent Approach ..
            # Uses the call count returned by the phase 1 baseline.
            result_ratios_p4 = convex_hull(fs_pick, num_calls)

            # Testing Phase 5 -> If we were godlike and knew all ..
            result_ratios_p5 = upper_bound(test_df)

            # Phases 6-8 -> clustering-based approaches.
            result_ratios_p6, groups1 = clustering_age_balance_grouped(train_df, test_df, 20, balance_ranges_2, age_ranges_2)

            result_ratios_p7, groups2 = clustering_age_balance_not_grouped(train_df, test_df, 20, balance_ranges_2, age_ranges_2)

            result_ratios_p8, groups3 = new_approach_ratio_grouping_percentage(mkt_df_filtered, train_index, test_index, 500, age_ranges_2, balance_ranges_2)

            # Add all results together for this fold, keyed "<repetition>_<fold>".
            phase_batch_key = str(j) + "_" + str(i)
            phase_batch[phase_batch_key] = {'p1':result_ratios_p1, 'p2':result_ratios_p2,
                                            'p4':result_ratios_p4, 'p5':result_ratios_p5,
                                            'p6':result_ratios_p6, 'p7':result_ratios_p7,
                                            'p8':result_ratios_p8, 'p9':result_ratios_p9}

        # Persist all folds of this repetition; the fallback keeps NumPy integer
        # results from aborting the dump after the whole run has finished.
        with open('mod_2_' + str(cp) +'.json', 'w') as fp:
            json.dump(phase_batch, fp, default=_json_default)

At fold number:  1
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Call all Customers Approach
Greedy Approach
Gradient Ascent Approach
Performing initial update .. 
Perforiming sort .. 
Finished sort .. in while loop 
Upper Bound Approach
HDBScan Clustering No Ratio - Using Approximate Predict Function from HDBScan Library
Check 1
Check 2
Check 3
Check 4
HDBScan Clustering No Ratio - Using Approximate Predict Function from HDBScan Library
Check 1
Check 2
Check 3
Check 4
Step 1
Step 2
72
Step 3
Step 4
We missed: 163
Step 5
At fold number:  2
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Invalid Convex Hull Assignment
Call all Customers Approach
Greedy Approach
Gradient Ascent Approa

In [None]:
# Scratch check: rank the entries of a nested dict by their 'ratio', largest first.
me = {label: {'ratio': ratio} for label, ratio in [('1', 0.25), ('2', 0.59)]}
sorted(me.items(), key=lambda entry: entry[1]['ratio'], reverse=True)

In [18]:
# Write phase_batch as JSON to '13082020<cp>.json' (relative to the working directory).
with open(''.join(['13082020', str(cp), '.json']), 'w') as fp:
    json.dump(phase_batch, fp)

In [17]:
# Replace every fold's 'p8' list with (int, int) tuples built from the first two
# items of each element, printing the fold key as it is processed.
for key in phase_batch.keys():
    print(key)
    new_ratios = [(int(el[0]), int(el[1])) for el in phase_batch[key]['p8']]
    phase_batch[key]['p8'] = new_ratios

1_1
1_2
1_3
1_4
1_5


In [16]:
# Print each key currently stored in phase_batch, one per line.
for key in phase_batch:
    print(key)

1_1
1_2
1_3
1_4
1_5
