In [39]:
# Imports

import os
import math
import random
import operator
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math, itertools
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from operator import itemgetter
from statistics import mean

In [7]:
# Helper Functions


def load_file(data_file_path):
    data_df = pd.read_csv(data_file_path, delimiter=";")
    return data_df
  
    
def plot_graph_new(results, max_calls):
    x_pts = [i+1 for i in range(0, max_calls)]
    y_pts = [results[i]['expected'] for i in range(0, max_calls)]
    plt.title("Plot of Expected Success Per Call Rate")
    plt.plot(x_pts, y_pts)
    plt.axvline(x=0, color ="black")
    plt.axhline(y=0, color ="black")
    plt.xticks(np.arange(1, max_calls+1, 1))
    plt.show()
    

def div(a,b):
    if int(b) == 0:
        return 0.0
    else:
        return a/b
    

# Used for creating all possible combinations of the features.
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = iterable
    return itertools.chain.from_iterable(combinations(s, r) for r in range(len(s)+1))


def construct_dict(feature_comb):
    new_dict = {}
    new_dict['education'] = feature_comb[0]
    new_dict['job'] = feature_comb[2]
    new_dict['marital'] = feature_comb[1]
    new_dict['default'] = feature_comb[3]
    new_dict['loan'] = feature_comb[4]
    new_dict['housing'] = feature_comb[5]
    return new_dict


# This was the old metric (reward per call rate).
# def compute_expected_reward_feature_set_new(fs_df, no_calls_considered):
#     expected_values_calls = []
#     len_df = len(fs_df)
#     for i in range(1, no_calls_considered + 1):
#         expected_values_calls.append({'neg_value':0.0, 'pos_value':0.0, 'count':0, 'expected':0.0})
#         for index, row in fs_df.iterrows():
#             no_calls = row['campaign']
#             if no_calls <= i:
#                 if row['y'] == "yes":
#                     expected_values_calls[i-1]['pos_value'] += ((no_calls_considered+1) - no_calls)
#                 else:
#                     expected_values_calls[i-1]['neg_value'] += (-no_calls)
#             else:
#                 expected_values_calls[i-1]['neg_value'] += (-i)
#             expected_values_calls[i-1]['count'] += 1
#     for loc, item in enumerate(expected_values_calls):
#         expected_values_calls[loc]['expected'] = (expected_values_calls[loc]['pos_value'] + expected_values_calls[loc]['neg_value'])/len_df
#     return expected_values_calls


# This is the new metric (success per call rate).
def compute_expected_succ_per_call_rate_feature_set(fs_df, no_calls_considered):
    expected_values_call_nums = []
    for i in range(1, no_calls_considered + 1):
        expected_values_call_nums.append({'succ':0, 'total_calls':0, 'expected':0.0})
        for index, row in fs_df.iterrows():
            no_calls = row['campaign']
            if no_calls <= i:
                if row['y'] == "yes":
                    expected_values_call_nums[i-1]['succ'] += 1
                    expected_values_call_nums[i-1]['total_calls'] += no_calls
            else:
                expected_values_call_nums[i-1]['total_calls'] += i
    for loc, item in enumerate(expected_values_call_nums):
        expected_values_call_nums[loc]['expected'] = div(item['succ'], item['total_calls'])
    return expected_values_call_nums


def compute_optimal_call_no(results):
    max_loc = max(range(len(results)), key=lambda index: results[index]['expected'])
    if results[max_loc]['expected'] < 0.0:
        return -1
    else:
        return max_loc


# Given a dictionary of what attributes comprise a feature set, we can get all rows corresponding to this feature set.
def extract_rows_feature_set(fs_df, feature_labels = {'education':['tertiary', 'unknown'], 
                                                      'job':['management', 'technician', 'blue-collar'], 
                                                      'marital':['single'], 'default':['no'], 
                                                      'housing':['no'], 'loan':['no']}):
    for key in feature_labels:
        feature_labels_query_str = ''
        arr = feature_labels[key]
        for label in arr:
            feature_labels_query_str += (key + ' == "'+ label + '" | ')
        feature_labels_query_str = feature_labels_query_str[:-3]
        fs_df = fs_df.query(feature_labels_query_str)
    return fs_df

In [8]:
# Code that sets up values to construct all possible feature combinations.

poss = []

poss.append([['tertiary', 'unknown'],['primary','secondary']])
poss.append([['single'],['married'],['divorced']])
poss.append([['student','retired','unemployed'],['admin', 'management', 'self-employed'],['technician', 'unknown', 'services'],['housemaid', 'blue-collar', 'entrepreneur']])
poss.append([['no'],['yes']])
poss.append([['no'],['yes']])
poss.append([['no'],['yes']])
all_combs = list(itertools.product(*poss))

# Age query strings.
age_query_strings = ['age < 26','age >= 26 & age <=60','age >60']

# Balance query strings.
balance_query_strings = ['balance <= 5000',' balance > 5000']

# Max call number to consider.
# We obtained this by dividing the overall reward we have in the dataset by the number of successes.
max_calls = 20

In [9]:
# Main code for computing info regarding each feature set.

# Pull and filter all calls <= 20.
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls)]


# Splitting into train(80%) and test(20%) sets.
X = mkt_df_filtered.iloc[:,0:len(mkt_df_filtered.columns)-1]
y = mkt_df_filtered.iloc[:,-1]
rs = ShuffleSplit(n_splits=1, test_size=.25, random_state=0)
for train_indicies, test_indicies in rs.split(X):
    df_train = mkt_df_filtered.iloc[train_indicies]
    df_test = mkt_df_filtered.iloc[test_indicies]


# Setting up looping structures to generate all possibilities.
num_iter = 0
num_non_zero_combs = 0

combs_to_consider = []

for age_query in age_query_strings:
    df_filtered_final = df_train.query(age_query)
    for bal_query in balance_query_strings:
        df_filtered_final = df_filtered_final.query(bal_query)
        for comb in all_combs:
            dict_final_query = construct_dict(comb)
            num_iter += 1
            extracted_df = extract_rows_feature_set(df_filtered_final, dict_final_query)
            if extracted_df.shape[0] != 0:
                num_non_zero_combs += 1
                results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
                max_loc = compute_optimal_call_no(results)
                if max_loc >= 0:
                    combs_to_consider.append({'age':age_query, 'bal':bal_query, 'comb':comb, 'consider':True, 'max_loc':max_loc, 'rate':results[max_loc]['expected']})
                else:
                    combs_to_consider.append({'age':age_query, 'bal':bal_query, 'comb':comb, 'consider':False})
#                 print(age_query)
#                 print(bal_query)
#                 print(dict_final_query)
#                 print("Max Loc is: ", max_loc+1)
#                 plot_graph_new(results, max_calls)
#                 print("\n\n\n")

In [13]:
def compute_metric(df):
    total_calls = 0
    total_successes = 0
    for loc, row in df.iterrows():
        if row['y'] == "yes":
            total_successes += 1
        total_calls += row['campaign']
    return div(total_successes, total_calls)

In [89]:
# Metric using our method.
filtered_combs_valid = [comb for comb in combs_to_consider if comb['consider'] == True]
filtered_combs_valid_sorted = sorted(filtered_combs_valid, key=lambda k: k['rate'], reverse = True) 
print("All combs: ", mean([comb['rate'] for comb in combs_to_consider]))
print("All combs(ratio >=0.20): ", mean([comb['rate'] for comb in combs_to_consider if comb['rate'] >= 0.20]))
# The ratio will be increased further when we filter the unwanted feature combinations.

All combs:  0.3513100131570743
All combs(ratio >=0.20):  0.5514576239792396


In [35]:
# Metric for the filtered dataset (only up to 20 calls).
print(compute_metric(mkt_df_filtered))

0.04469269018705815


In [36]:
# Metric for the entire dataset (up to 56 calls).
print(compute_metric(mkt_df))

0.042326899068472104


#### This section computes the metric for the purpose of determining the groupings within each feature (  ..... need to finish for the remainder of features).

In [21]:
def compute_metric_for_each_attribute(all_values, df, attrib):
    for value in all_values:
        v_query = "{0} == '{1}'".format(attrib, value)
        print(v_query, compute_metric(mkt_df_filtered.query(v_query)))

In [40]:
# Age.
all_age_query_strings = ['age >= 10 & age <= 19', 'age >= 20 & age <= 29', 'age >= 30 & age <= 39', 'age >= 40 & age <= 49', 'age >= 50 & age <= 59','age >= 60 & age <= 69', 'age >= 70 & age <= 79', 'age >= 80']
for age_query in all_age_query_strings:
    df_filtered_final = mkt_df_filtered.query(age_query)
    print(age_query, compute_metric(df_filtered_final))

age >= 10 & age <= 19 0.14634146341463414
age >= 20 & age <= 29 0.07631601041054488
age >= 30 & age <= 39 0.040097307272879794
age >= 40 & age <= 49 0.03357253501090633
age >= 50 & age <= 59 0.03437390389337075
age >= 60 & age <= 69 0.12427647259107934
age >= 70 & age <= 79 0.20594965675057209
age >= 80 0.1950354609929078


In [77]:
# Occupation.
all_jobs = ['student', 'retired', 'unemployed', 'admin', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
compute_metric_for_each_attribute(all_jobs, mkt_df_filtered, 'job')

job == 'student' 0.12907869481765835
job == 'retired' 0.09875598086124401
job == 'unemployed' 0.06631648063033486
job == 'admin' 0.0
job == 'management' 0.05079848502596541
job == 'self-employed' 0.04410377358490566
job == 'technician' 0.04040647274128299
job == 'unknown' 0.0379041248606466
job == 'services' 0.03470001880759827
job == 'housemaid' 0.0319243275199527
job == 'blue-collar' 0.02749087520385183
job == 'entrepreneur' 0.030182090296832127


In [14]:
# Marital Status
all_ms = ['married', 'single', 'unknown']
compute_metric_for_each_attribute(all_ms, mkt_df_filtered, 'marital')

marital == 'married' 0.03762389773737097
marital == 'single' 0.05973928537934915
marital == 'unknown' 0.0


In [22]:
# Education
all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
compute_metric_for_each_attribute(all_ed, mkt_df_filtered, 'education')

education == 'tertiary' 0.05604090002528161
education == 'secondary' 0.0410958904109589
education == 'primary' 0.03222282905516111
education == 'unknown' 0.05277486910994764


In [23]:
# Default
all_def = ['no', 'yes', 'unknown']
compute_metric_for_each_attribute(all_def, mkt_df_filtered, 'default')

default == 'no' 0.04515566754107414
default == 'yes' 0.021996615905245348
default == 'unknown' 0.0


In [24]:
# Housing
all_hs = ['no', 'yes', 'unknown']
compute_metric_for_each_attribute(all_hs, mkt_df_filtered, 'housing')

housing == 'no' 0.061309197293838
housing == 'yes' 0.030395519335452
housing == 'unknown' 0.0


In [25]:
# Loan
all_ln = ['no', 'yes', 'unknown']
compute_metric_for_each_attribute(all_ln, mkt_df_filtered, 'loan')

loan == 'no' 0.048405306237651706
loan == 'yes' 0.025356992860142796
loan == 'unknown' 0.0


In [29]:
# Balance
all_bal_query_strings = ['balance >= -100000 & age <= -1', 'balance >= 0 & balance < 1000', 'balance >= 1000 & balance < 2000', 'balance >= 2000 & balance < 3000', 'balance >= 3000 & balance < 4000','balance >= 4000 & balance < 5000', 'balance >= 5000 & balance < 6000', 'balance >= 6000 & balance < 7000', 'balance >= 7000 & balance < 8000', 'balance >= 8000 & balance < 9000', 'balance >= 9000 & balance < 10000','balance >= 10000 & balance < 11000', 'balance >= 11000 & balance < 12000', 'balance >= 12000 & balance < 13000', 'balance >= 13000 & balance < 14000', 'balance >= 14000 & balance < 15000', 'balance >= 15000 & balance < 16000', 'balance >= 16000 & balance < 17000','balance >= 17000 & balance < 18000', 'balance >= 18000 & balance < 19000', 'balance >= 19000 & balance < 19000', 'balance >= 20000']
for bal_query in all_bal_query_strings:
    df_filtered_final = mkt_df_filtered.query(bal_query)
    print(bal_query, compute_metric(df_filtered_final))

balance >= -100000 & age <= -1 0.0
balance >= 0 & balance < 1000 0.03984136840916789
balance >= 1000 & balance < 2000 0.05351934051997464
balance >= 2000 & balance < 3000 0.0680968858131488
balance >= 3000 & balance < 4000 0.0689893862482695
balance >= 4000 & balance < 5000 0.06461086637298091
balance >= 5000 & balance < 6000 0.06534547402249598
balance >= 6000 & balance < 7000 0.04906542056074766
balance >= 7000 & balance < 8000 0.0671217292377702
balance >= 8000 & balance < 9000 0.052269601100412656
balance >= 9000 & balance < 10000 0.058823529411764705
balance >= 10000 & balance < 11000 0.09217877094972067
balance >= 11000 & balance < 12000 0.08365019011406843
balance >= 12000 & balance < 13000 0.09821428571428571
balance >= 13000 & balance < 14000 0.035897435897435895
balance >= 14000 & balance < 15000 0.058823529411764705
balance >= 15000 & balance < 16000 0.03418803418803419
balance >= 16000 & balance < 17000 0.0
balance >= 17000 & balance < 18000 0.028985507246376812
balance >= 