In [None]:
# Imports

import os
import math
import random
import operator
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math, itertools
import statistics

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
# from ipynb.fs.full.helper_fns import *

In [None]:
# Helper Functions

def load_file(data_file_path):
    """Read a semicolon-delimited CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_file_path : str or path-like
        Location of the ``;``-separated file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(data_file_path, delimiter=";")
  
    
def plot_graph_new(results, max_calls, list_passed, title):
    """Plot one value per call number (1..max_calls) and show the figure.

    Parameters
    ----------
    results : sequence
        Either the y-values themselves (when `list_passed` is truthy) or a
        sequence of dicts whose 'expected' entry is plotted.
    max_calls : int
        Number of call numbers on the x-axis.
    list_passed : bool
        Selects how `results` is interpreted (see above).
    title : str
        Title of the figure.
    """
    call_numbers = list(range(1, max_calls + 1))
    if list_passed:
        rates = results
    else:
        rates = [results[idx]['expected'] for idx in range(max_calls)]

    plt.title(title)
    plt.plot(call_numbers, rates)
    # Draw both axes through the origin.
    plt.axvline(x=0, color="black")
    plt.axhline(y=0, color="black")
    # One tick per call number.
    plt.xticks(np.arange(1, max_calls + 1, 1))
    plt.show()
    

# def plot_graph_both_axes(ratios, no_calls):
    

def div(a,b):
    """Divide `a` by `b`, returning 0.0 instead of raising when `b` is zero.

    The zero check compares `b` itself rather than `int(b)`: truncating first
    would treat every divisor strictly between -1 and 1 (e.g. 0.5) as zero and
    silently return 0.0 for a perfectly valid division.

    Parameters
    ----------
    a : number
        Numerator.
    b : number
        Denominator.

    Returns
    -------
    float
        0.0 when `b == 0`, otherwise `a / b`.
    """
    if b == 0:
        return 0.0
    return a / b
    

# Used for creating all possible combinations of the features.
def powerset(iterable):
    """Return an iterator over every subset of `iterable`, smallest first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Parameters
    ----------
    iterable : iterable
        Items to combine; it is materialised once, so generators are accepted.

    Returns
    -------
    iterator of tuple
        All combinations of length 0 .. len(iterable), in that order.
    """
    # Materialise so len() works for any iterable, not just sequences.
    s = list(iterable)
    # `combinations` is referenced through the imported `itertools` module;
    # the bare name is not defined anywhere in this notebook.
    return itertools.chain.from_iterable(
        itertools.combinations(s, r) for r in range(len(s) + 1)
    )


def convert(list):
    """Return the elements of `list`, in order, as a tuple.

    The parameter name shadows the built-in `list`; it is left unchanged so
    that existing keyword callers keep working.
    """
    return tuple(element for element in list)


def construct_dict(feature_comb):
    """Label the entries of a positional feature combination by attribute.

    Parameters
    ----------
    feature_comb : sequence
        Indexable with positions 0-5; each entry is passed to `convert`.

    Returns
    -------
    dict
        Keys 'education', 'job', 'marital', 'default', 'loan', 'housing'
        (inserted in that order), each mapped to `convert` of its entry.
    """
    # Position of every attribute inside `feature_comb`. Note that 'job' is
    # read from position 2 and 'marital' from position 1.
    positions = (
        ('education', 0),
        ('job', 2),
        ('marital', 1),
        ('default', 3),
        ('loan', 4),
        ('housing', 5),
    )
    labelled = {}
    for attribute, position in positions:
        labelled[attribute] = convert(feature_comb[position])
    return labelled


# This was the old metric (reward per call rate).
# def compute_expected_reward_feature_set_new(fs_df, no_calls_considered):
#     expected_values_calls = []
#     len_df = len(fs_df)
#     for i in range(1, no_calls_considered + 1):
#         expected_values_calls.append({'neg_value':0.0, 'pos_value':0.0, 'count':0, 'expected':0.0})
#         for index, row in fs_df.iterrows():
#             no_calls = row['campaign']
#             if no_calls <= i:
#                 if row['y'] == "yes":
#                     expected_values_calls[i-1]['pos_value'] += ((no_calls_considered+1) - no_calls)
#                 else:
#                     expected_values_calls[i-1]['neg_value'] += (-no_calls)
#             else:
#                 expected_values_calls[i-1]['neg_value'] += (-i)
#             expected_values_calls[i-1]['count'] += 1
#     for loc, item in enumerate(expected_values_calls):
#         expected_values_calls[loc]['expected'] = (expected_values_calls[loc]['pos_value'] + expected_values_calls[loc]['neg_value'])/len_df
#     return expected_values_calls


# This is the new metric (success per call rate).
def compute_expected_succ_per_call_rate_feature_set(fs_df, no_calls_considered):
    """Compute the success-per-call rate for every call cap 1..no_calls_considered.

    For a cap `c`, a row whose 'campaign' value is at most `c` contributes its
    own call count (and one success if its 'y' is "yes"); any other row
    contributes `c` calls and no success.

    Parameters
    ----------
    fs_df : pandas.DataFrame
        Rows of one feature set; the 'campaign' and 'y' columns are read.
    no_calls_considered : int
        Largest call cap to evaluate.

    Returns
    -------
    list of dict
        One dict per cap (index 0 is cap 1) with keys 'succ', 'total_calls'
        and 'expected', where 'expected' is div(succ, total_calls).
    """
    per_cap_stats = []
    for cap in range(1, no_calls_considered + 1):
        stats = {'succ':0, 'total_calls':0, 'expected':0.0}
        per_cap_stats.append(stats)
        for _, customer in fs_df.iterrows():
            calls_made = customer['campaign']
            within_cap = calls_made <= cap
            if within_cap and customer['y'] == "yes":
                stats['succ'] += 1
            # Customers beyond the cap still cost `cap` calls.
            stats['total_calls'] += calls_made if within_cap else cap
    for stats in per_cap_stats:
        stats['expected'] = div(stats['succ'], stats['total_calls'])
    return per_cap_stats


def compute_optimal_call_no(results):
    """Return the 0-based index of the entry with the largest 'expected' value.

    When several entries share the maximum, the first one wins. If the winner
    is the very first entry and its 'expected' value is 0.0, -1 is returned
    instead.

    Parameters
    ----------
    results : sequence of dict
        Each dict must provide an 'expected' entry.

    Returns
    -------
    int
        Index of the best entry, or -1 as described above.
    """
    best_idx, best_entry = max(enumerate(results), key=lambda pair: pair[1]['expected'])
    nothing_succeeded = best_idx == 0 and best_entry['expected'] == 0.0
    return -1 if nothing_succeeded else best_idx


# Given a dictionary of what attributes comprise a feature set, we can get all rows corresponding to this feature set.
def extract_rows_feature_set(fs_df, feature_labels = {'education':['tertiary', 'unknown'], 
                                                      'job':['management', 'technician', 'blue-collar'], 
                                                      'marital':['single'], 'default':['no'], 
                                                      'housing':['no'], 'loan':['no']}):
    """Keep only the rows whose attributes all fall in the accepted label sets.

    Filtering uses `Series.isin` rather than a string-built `DataFrame.query`
    expression. A query string breaks as soon as a label contains a double
    quote, and an empty label list produces an empty (invalid) query.

    Parameters
    ----------
    fs_df : pandas.DataFrame
        Frame to filter; it must contain every column named in `feature_labels`.
    feature_labels : dict
        Maps a column name to the iterable of labels accepted for that column.
        The default dict is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Rows matching one accepted label in every listed column. An empty
        label collection for a column yields an empty frame.

    Raises
    ------
    KeyError
        If a key of `feature_labels` is not a column of `fs_df`.
    """
    for column, accepted_labels in feature_labels.items():
        # Successive filters are AND-ed; labels within one column are OR-ed.
        fs_df = fs_df[fs_df[column].isin(list(accepted_labels))]
    return fs_df


def find_matching_attribute_comb(row_value, all_combs):
    """Return the combination in `all_combs` that contains `row_value`.

    Every combination is inspected; if more than one contains the value, the
    last such combination is returned.

    Parameters
    ----------
    row_value : object
        Value to look for (compared with ``==``).
    all_combs : iterable of iterables
        Candidate combinations.

    Returns
    -------
    The matching combination, or None when no combination contains the value.
    """
    matched = None
    for candidate in all_combs:
        if any(member == row_value for member in candidate):
            matched = candidate
    return matched


def find_matching_attribute_eval_age(row_value, values):
    """Return the index of the first age template that `row_value` satisfies.

    Each entry of `values` is a format string containing an ``{age}``
    placeholder. The placeholder is filled with `row_value` and the resulting
    text is evaluated with `eval`, so templates must come from trusted code.

    Parameters
    ----------
    row_value : object
        Value substituted for ``{age}``.
    values : iterable of str
        Templates, tried in order.

    Returns
    -------
    int or None
        Position of the first template that evaluates truthy, or None when
        none does.
    """
    position = 0
    for template in values:
        if eval(template.format(age = row_value)):
            return position
        position += 1
    return None


def find_matching_attribute_eval_balance(row_value, values):
    """Return the index of the first balance template that `row_value` satisfies.

    Each entry of `values` is a format string containing a ``{balance}``
    placeholder. The placeholder is filled with `row_value` and the resulting
    text is evaluated with `eval`, so templates must come from trusted code.

    Parameters
    ----------
    row_value : object
        Value substituted for ``{balance}``.
    values : iterable of str
        Templates, tried in order.

    Returns
    -------
    int or None
        Position of the first template that evaluates truthy, or None when
        none does.
    """
    position = 0
    for template in values:
        if eval(template.format(balance = row_value)):
            return position
        position += 1
    return None


def compute_metric(df):
    """Return the success-per-call rate of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        The 'y' and 'campaign' columns are read from every row.

    Returns
    -------
    The number of rows whose 'y' equals "yes" divided (via `div`) by the sum
    of 'campaign' over all rows; 0.0 when that sum is zero.
    """
    successes = 0
    calls = 0
    for _, customer in df.iterrows():
        calls += customer['campaign']
        if customer['y'] == "yes":
            successes += 1
    return div(successes, calls)


def compute_metric_2(df, max_calls_per_customer=None):
    """Return the success-per-call rate of `df`, optionally capping calls per row.

    The previous body called ``min(row['campaign'], )`` with a single scalar
    argument, which raises TypeError on the first row because `min` needs
    either an iterable or at least two values. The missing second operand is
    now an explicit, optional cap.

    Parameters
    ----------
    df : pandas.DataFrame
        The 'y' and 'campaign' columns are read from every row.
    max_calls_per_customer : number, optional
        Upper bound applied to each row's 'campaign' value before summing.
        With the default (None) no cap is applied and the result equals
        `compute_metric(df)`.

    Returns
    -------
    The number of rows whose 'y' equals "yes" divided (via `div`) by the sum
    of the (capped) call counts; 0.0 when that sum is zero.
    """
    # NOTE(review): successes are counted even for rows whose call count
    # exceeds the cap, as in the original loop -- confirm this is intended.
    total_calls = 0
    total_successes = 0
    for _, row in df.iterrows():
        if row['y'] == "yes":
            total_successes += 1
        calls = row['campaign']
        if max_calls_per_customer is not None:
            calls = min(calls, max_calls_per_customer)
        total_calls += calls
    return div(total_successes, total_calls)


def compute_metric_for_each_attribute(all_values, df, attrib):
    """Compute `compute_metric` for the rows matching each label of one column.

    Parameters
    ----------
    all_values : sequence
        Labels to evaluate; each is compared to the column as a quoted string.
    df : pandas.DataFrame
        Frame that is queried once per label.
    attrib : str
        Name of the column the labels belong to.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(all_values), 1); row i holds the metric of
        the rows where `attrib` equals all_values[i].
    """
    rates = [
        compute_metric(df.query("{0} == '{1}'".format(attrib, label)))
        for label in all_values
    ]
    return np.array(rates, dtype=float).reshape(len(all_values), 1)


def compute_metric_for_each_attribute_range(all_values, df, attrib):
    """Compute `compute_metric` for the rows falling in each half-open range.

    Parameters
    ----------
    all_values : sequence
        Items indexable at [0] and [1]; a row matches when
        ``item[0] <= attrib < item[1]``.
    df : pandas.DataFrame
        Frame that is queried once per range.
    attrib : str
        Name of the column the ranges apply to.

    Returns
    -------
    (numpy.ndarray, list of str)
        A float array of shape (len(all_values), 1) with one metric per range,
        and the query strings that were used, in the same order.
    """
    query_strings = [
        "{0} >= {1} & {2} < {3}".format(attrib, bounds[0], attrib, bounds[1])
        for bounds in all_values
    ]
    rates = [compute_metric(df.query(range_query)) for range_query in query_strings]
    metric_vals = np.array(rates, dtype=float).reshape(len(all_values), 1)
    return metric_vals, query_strings


def find_combinations(sub_attributes, ratios, *, random_state=None):
    """Group sub-attributes whose metric values cluster together.

    K-Means is run on the one-dimensional `ratios` for every cluster count
    from 2 to n-1, and the clustering with the highest silhouette score is
    kept (the first one on ties). Sub-attributes sharing a cluster label end
    up in the same group.

    Parameters
    ----------
    sub_attributes : sequence
        One label per entry of `ratios`; each is converted with `str`.
    ratios : array-like
        One metric value per sub-attribute, in any shape that flattens to
        length n.
    random_state : int, RandomState instance or None, keyword-only
        Forwarded to `KMeans` so that the grouping can be made reproducible.
        The default (None) keeps the previous, unseeded behaviour.

    Returns
    -------
    list of list of str
        One inner list per cluster. With fewer than three values no cluster
        count can be scored (2 <= k <= n-1 is empty), so every sub-attribute
        is returned in a group of its own instead of failing with IndexError.
    """
    points = np.asarray(ratios, dtype=float).reshape(-1, 1)
    n_points = points.shape[0]
    if n_points < 3:
        return [[str(item)] for item in sub_attributes]

    # Making use of the K-Means algorithm ... number of centroids are from 2 to n-1.
    best_score = None
    best_labels = None
    best_clust_num = None
    for clust_num in range(2, n_points):
        kmeans = KMeans(n_clusters = clust_num, random_state = random_state)
        kmeans.fit(points)
        labels = kmeans.labels_
        # We make use of the silhouette score to determine the ideal number of centroids.
        score = silhouette_score(points, labels, metric='euclidean')
        # Strict '>' keeps the earliest cluster count when scores tie.
        if best_score is None or score > best_score:
            best_score, best_labels, best_clust_num = score, labels, clust_num

    # We then use this ideal number of centroids to determine which sub attributes should be aggregated.
    joined_sub_attributes = [[] for _ in range(best_clust_num)]
    for index, cluster_label in enumerate(best_labels):
        joined_sub_attributes[cluster_label].append(str(sub_attributes[index]))
    return joined_sub_attributes

# The following shows how this function should be called.
# find_combinations(['a', 'b', 'c', 'd'], np.array([1, 4, 7, 90]))

In [None]:
%%time
# Code that sets up values to construct all possible feature combinations.

# Age query strings.
# age_query_strings = ['age < 26','age >= 26 & age <=60','age >60']
# age_query_strings = ['age >= 10 & age <= 32', 'age >= 33 & age <= 40', 'age >= 50 & age <= 59', 'age >= 60']

# Bucket definitions in DataFrame.query syntax.
age_query_strings = [
    'age >= 10 & age <= 34',
    'age >= 35 & age <= 45',
    'age >= 46',
]
balance_query_strings = [
    'balance <= 450',
    'balance > 450',
]

# The same buckets as Python expressions; the placeholder is filled in with a
# row's value and the result is passed to eval.
age_query_format_strings = [
    '{age} >= 10 and {age} <= 34',
    '{age} >= 35 and {age} <= 45',
    '{age} >= 46',
]
balance_query_format_strings = [
    '{balance} <= 450',
    '{balance} > 450',
]

# Largest number of calls per customer that is considered.
max_calls = 20

# Load the data set from the working directory and keep only the customers
# who received between 1 and max_calls calls (both bounds inclusive).
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
campaign_in_range = mkt_df['campaign'].between(1, max_calls)
mkt_df_filtered = mkt_df[campaign_in_range]

print(mkt_df_filtered.shape)

In [None]:
%%time

baseline_ov = []
opt_no_res_ov = []
opt_res_ov = []

# The same filtered frame is used for fitting and for evaluation.
train_df = mkt_df_filtered
test_df = mkt_df_filtered

# Candidate labels for every categorical attribute.
all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
all_jobs = ['student', 'retired', 'unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
all_ms = ['married', 'single', 'divorced', 'unknown']
all_def = ['no', 'yes', 'unknown']
all_ln = ['no', 'yes', 'unknown']
all_hs = ['no', 'yes', 'unknown']

# For each attribute, compute the success rate of every label and let k-means
# decide which labels are merged. The order below is the order in which the
# groupings are computed.
_attribute_specs = [
    ('education', all_ed),
    ('job', all_jobs),
    ('marital', all_ms),
    ('default', all_def),
    ('loan', all_ln),
    ('housing', all_hs),
]
_groupings = {}
for _attrib_name, _attrib_labels in _attribute_specs:
    metric_vals = compute_metric_for_each_attribute(_attrib_labels, train_df, _attrib_name)
    _groupings[_attrib_name] = find_combinations(_attrib_labels, metric_vals)

education_cmbs = _groupings['education']
job_cmbs = _groupings['job']
marital_cmbs = _groupings['marital']
default_cmbs = _groupings['default']
loan_cmbs = _groupings['loan']
housing_cmbs = _groupings['housing']

# Positional order expected by construct_dict: education, marital, job,
# default, loan, housing.
poss = [education_cmbs, marital_cmbs, job_cmbs, default_cmbs, loan_cmbs, housing_cmbs]
all_combs = list(itertools.product(*poss))

print("Number of combinations: ", len(all_combs)* len(age_query_strings) * len(balance_query_strings))

# We can now go ahead and generate the feature sets based on what was done previously.
# Every (age bucket, balance bucket, attribute-group combination) triple is one
# candidate feature set; its rows are extracted from train_df and scored.

# Counts every candidate feature set visited, including those with no rows.
num_iter = 0
# Maps a feature-set key to its statistics. The key is the tuple
# (education, job, marital, default, loan, housing, age query, balance query),
# the same ordering that is used when test rows are looked up later on.
combs_to_consider = {}
# Setting up looping structures to generate all possibilities.
# All that has to be done now is to change 'df_train' to 'X_train'.
# NOTE(review): neither 'df_train' nor 'X_train' appears in this cell (it
# uses train_df) -- the note above looks stale; confirm and remove.
for age_query in age_query_strings:
    df_filtered_final = train_df.query(age_query)
    for bal_query in balance_query_strings:
        df_filtered_final_2 = df_filtered_final.query(bal_query)
        for comb in all_combs:
            dict_final_query = construct_dict(comb)
            num_iter += 1
            extracted_df = extract_rows_feature_set(df_filtered_final_2, dict_final_query)
            key = (dict_final_query['education'], dict_final_query['job'], dict_final_query['marital'], dict_final_query['default'], dict_final_query['loan'], dict_final_query['housing'], age_query, bal_query)
            n_rows = extracted_df.shape[0]
            # Feature sets without any rows are skipped silently.
            if n_rows >0:
                results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
                # 0-based index of the best call cap, or -1 when the best rate is 0.0 at index 0.
                max_loc = compute_optimal_call_no(results)
                if max_loc != -1:
                    # 'max_loc' is stored 1-based (a call count); 'overall_rate' is the
                    # rate at the largest cap considered (the last entry of results).
                    combs_to_consider[key] = {'comb':comb, 'max_loc':max_loc + 1, 'best_rate':results[max_loc]['expected'], 'overall_rate':results[max_calls-1]['expected'], 'n_rows':n_rows}

                else:
                    print("Invalid FS !")

# One tuple per test row whose feature set was scored above:
# (best_rate, overall_rate, outcome, calls made, feature-set stats, feature-set key).
all_possible_calls = []

# Number of test rows whose feature set has no entry in combs_to_consider.
num_missed = 0


def _group_for(raw_label, groups):
    """Return, as a tuple, the merged group in `groups` that holds `raw_label`."""
    return convert(find_matching_attribute_comb(str(raw_label), groups))


for loc, row in test_df.iterrows():
    # Preprocessing step for optimal method.
    # Categorical attributes: look up the merged group of the exact value.
    jb_query = _group_for(row['job'], job_cmbs)
    mt_query = _group_for(row['marital'], marital_cmbs)
    ec_query = _group_for(row['education'], education_cmbs)
    house_query = _group_for(row['housing'], housing_cmbs)
    loan_query = _group_for(row['loan'], loan_cmbs)
    def_query = _group_for(row['default'], default_cmbs)
    # Numeric attributes: find the bucket by evaluating the format strings,
    # then use the query string of the same position as the key component.
    bal_bucket = find_matching_attribute_eval_balance(int(row['balance']), balance_query_format_strings)
    bal_query = balance_query_strings[bal_bucket]
    age_bucket = find_matching_attribute_eval_age(int(row['age']), age_query_format_strings)
    age_query = age_query_strings[age_bucket]
    no_calls = row['campaign']
    key_to_find = (ec_query, jb_query, mt_query, def_query, loan_query, house_query, age_query, bal_query)
    if key_to_find not in combs_to_consider:
        num_missed += 1
        continue
    fs = combs_to_consider[key_to_find]
    all_possible_calls.append((fs['best_rate'], fs['overall_rate'], row['y'], no_calls, fs, key_to_find))

# Baseline with shuffling of customers.
# Cumulative-call thresholds at which the running success-per-call ratio is
# recorded: 500, 1000, ..., 139500 (the same values as the former literal list).
call_check_points = list(range(500, 140000, 500))
s_c_ratio_baseline_cp = []
num_succ = 0
num_calls = 0
cp_loc = 0
# Visit the customers in a random order.
# NOTE(review): the permutation is unseeded, so the baseline curve differs
# between runs; seed NumPy beforehand if reproducibility is required.
res = test_df.reindex(np.random.permutation(test_df.index))
for loc, row in res.iterrows():
    # The bounds check prevents an IndexError once the last checkpoint has
    # been passed; previously cp_loc could run past the end of the list.
    if cp_loc < len(call_check_points) and num_calls >= call_check_points[cp_loc]:
        cp_loc += 1
        s_c_ratio_baseline_cp.append(div(num_succ, num_calls))
    num_calls += row['campaign']
    if row['y'] == "yes":
        num_succ += 1
# Final ratio over all customers.
s_c_ratio_baseline_cp.append(div(num_succ, num_calls))

# Optimal with no restrictions on #calls for each feature set.
# Customers are ordered by their feature set's overall rate (tup[1]), best
# first. The feature-set key (tup[5]) is a tie-breaker: without it the stable
# sort leaves feature sets with identical rates interleaved in row order, and
# the change detection below would emit several checkpoints for one set.
all_possible_calls_sorted_overall = sorted(all_possible_calls, key = lambda tup: (tup[1], tup[5]), reverse = True)
# One (success-per-call ratio, cumulative calls) entry per completed feature set.
s_c_ratio_opt_no_res = []
num_calls = 0
num_succ = 0
# Guarded so that an empty all_possible_calls does not raise IndexError.
prev_fs = all_possible_calls_sorted_overall[0][5] if all_possible_calls_sorted_overall else None
for item in all_possible_calls_sorted_overall:
    if prev_fs != item[5]:
        # A new feature set starts: record the running ratio so far.
        s_c_ratio_opt_no_res.append((div(num_succ, num_calls), num_calls))
        prev_fs = item[5]
    num_calls += item[3]
    if item[2] == "yes":
        num_succ += 1
# Final ratio after the last feature set.
s_c_ratio_opt_no_res.append((div(num_succ, num_calls), num_calls))

# Optimal with restrictions on #calls for each feature set.
# Customers are ordered by their feature set's best rate (tup[0]), best first.
# The feature-set key (tup[5]) is a tie-breaker so that feature sets with
# identical rates stay contiguous; otherwise the change detection below would
# emit several checkpoints for one set.
all_possible_calls_sorted_best = sorted(all_possible_calls, key = lambda tup: (tup[0], tup[5]), reverse = True)
# One (success-per-call ratio, cumulative calls) entry per completed feature set.
s_c_ratio_opt_res = []
num_calls = 0
num_succ = 0
# Customers who did subscribe, but only after more calls than the cap allows.
num_bad = 0
# Customers whose actual call count exceeded the cap.
saved = 0
# Cap on calls per customer. The per-feature-set optimum
# (item[4]['max_loc']) is available but a fixed cap is used instead.
max_no_calls_fs = 5
# Guarded so that an empty all_possible_calls does not raise IndexError.
prev_fs = all_possible_calls_sorted_best[0][5] if all_possible_calls_sorted_best else None
for item in all_possible_calls_sorted_best:
    user_outcome = item[2]
    user_calls = item[3]
    # Derived from the cap instead of the former hard-coded `>= 6`, which is
    # the same test for integer call counts and a cap of 5.
    if user_calls > max_no_calls_fs:
        saved += 1
    if prev_fs != item[5]:
        # A new feature set starts: record the running ratio so far.
        s_c_ratio_opt_res.append((div(num_succ, num_calls), num_calls))
        prev_fs = item[5]
    # Only "yes"/"no" outcomes contribute calls, never more than the cap each.
    if user_outcome in ("yes", "no"):
        num_calls += min(user_calls, max_no_calls_fs)
    if user_outcome == "yes":
        if user_calls <= max_no_calls_fs:
            num_succ += 1
        else:
            # The success would have been missed under the cap.
            num_bad += 1
# Final ratio after the last feature set.
s_c_ratio_opt_res.append((div(num_succ, num_calls), num_calls))

print("Potentially Bad: ", num_bad)
print("Saved excessive calls: ", saved)

# Collect the three curves computed above.
for _collector, _curve in (
    (baseline_ov, s_c_ratio_baseline_cp),
    (opt_no_res_ov, s_c_ratio_opt_no_res),
    (opt_res_ov, s_c_ratio_opt_res),
):
    _collector.append(_curve)

# Convert to arrays and write each one to a .npy file in the working directory.
x, y, z = (np.array(_runs) for _runs in (baseline_ov, opt_no_res_ov, opt_res_ov))

for _file_name, _array in (
    ("baseline_ov.npy", x),
    ("opt_no_res_ov.npy", y),
    ("opt_res_ov.npy", z),
):
    np.save(_file_name, _array)

In [None]:
s_c_ratio_opt_res
# (success-per-call ratio, cumulative calls) checkpoints for the ordering
# with the per-customer call cap applied.

In [None]:
s_c_ratio_opt_no_res
# (success-per-call ratio, cumulative calls) checkpoints for the ordering
# without any cap on calls per customer.

In [None]:
# Number of attribute-group combinations (before multiplying by the age and balance buckets).
len(all_combs)