In [5]:
# Imports

import os
import math
import random
import operator
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math, itertools
import statistics

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from ipynb.fs.full.helper_fns import *

In [2]:
# Code that sets up values to construct all possible feature combinations.

# Age query strings.
# age_query_strings = ['age < 26','age >= 26 & age <=60','age >60']
age_query_strings = ['age >= 10 & age <= 32', 'age >= 33 & age <= 40', 'age >= 50 & age <= 59', 'age >= 60']

# Balance query strings.
balance_query_strings = ['balance <= 5000',' balance > 5000']

# Max call number to consider.
max_calls = 20

In [3]:
# Main code ... orchestrates everything!

# Pull and filter all calls <= 20.
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls)]

print(mkt_df_filtered.shape)

# Splitting dataframe into data and result dataframes.
X = mkt_df_filtered.iloc[:,0:len(mkt_df_filtered.columns)-1]
y = mkt_df_filtered.iloc[:,-1]   

all_non_optimal_ratios = []
all_optimal_ratios = []
all_non_optimal_calls = []
all_optimal_calls = []

i = 0
kf = KFold(n_splits=20, shuffle=False)

for train_index, test_index in kf.split(X):
    i += 1
    
    train_df = mkt_df_filtered.iloc[train_index]
    test_df = mkt_df_filtered.iloc[test_index]
    
    # At this point, we can run computations for the success rate of each sub attribute and join
    # the sub-attributes based on the output of k-means.
    poss = []
    
    # Education.
    all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_ed, train_df, 'education')
    education_cmbs = find_combinations(all_ed, metric_vals)
    
    # Occupation.
    all_jobs = ['student', 'retired', 'unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
    metric_vals = compute_metric_for_each_attribute(all_jobs, train_df, 'job')
    job_cmbs = find_combinations(all_jobs, metric_vals)
    
    # Marital.
    all_ms = ['married', 'single', 'divorced', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_ms, train_df, 'marital')
    marital_cmbs = find_combinations(all_ms, metric_vals)
    
    # Default
    all_def = ['no', 'yes', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_def, train_df, 'default')
    default_cmbs = find_combinations(all_def, metric_vals)
    
    # Loan
    all_ln = ['no', 'yes', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_ln, train_df, 'loan')
    loan_cmbs = find_combinations(all_ln, metric_vals)
    
    # Housing
    all_hs = ['no', 'yes', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_hs, train_df, 'housing')
    housing_cmbs = find_combinations(all_hs, metric_vals)
    
    poss.append(education_cmbs)
    poss.append(marital_cmbs)
    poss.append(job_cmbs)
    poss.append(default_cmbs)
    poss.append(loan_cmbs)
    poss.append(housing_cmbs)
    all_combs = list(itertools.product(*poss))
    
    print("Number of combinations: ", len(all_combs)* len(age_query_strings) * len(balance_query_strings))
    
    # We can now go ahead and genreate the feature sets based on what was done previously.
    
    num_iter = 0
    combs_to_consider = {}
    # Setting up looping structures to generate all possibilities.
    # All that has to be done now is to change 'df_train' to 'X_train'.
    for age_query in age_query_strings:
        df_filtered_final = train_df.query(age_query)
        for bal_query in balance_query_strings:
            df_filtered_final = df_filtered_final.query(bal_query)
            for comb in all_combs:
#                 print(comb)
                dict_final_query = construct_dict(comb)
#                 print(dict_final_query)
                num_iter += 1
                extracted_df = extract_rows_feature_set(df_filtered_final, dict_final_query)
                key = (dict_final_query['education'], dict_final_query['job'], dict_final_query['marital'], dict_final_query['default'], dict_final_query['loan'], dict_final_query['housing'], bal_query, age_query)
                if extracted_df.shape[0] != 0:
                    num_non_zero_combs += 1
                    results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
                    max_loc = compute_optimal_call_no(results)
                    rate = results[max_loc]['expected']
                    # In this new case max_loc never goes below zero!
                    if max_loc >= 0:
                        combs_to_consider[key] = {'age':age_query, 'bal':bal_query, 'comb':comb, 'consider':True, 'max_loc':max_loc, 'rate':results[max_loc]['expected']}
#                     print(age_query)
#                     print(bal_query)
#                     print(comb)
#                     print(dict_final_query)
#                     print("Max Loc is: ", max_loc+1)
#                     plot_graph_new(results, max_calls)
#                     print("\n\n\n")
#  When we are finished creating the feature combinations .... we can now use the hold out set for validation of the model!
    print("Iteration: ", i)
    num_succ = 0
    num_calls = 0
    num_succ_optimal = 0
    num_calls_optimal = 0
    num_bad_cons = 0
    num_good_cons = 0 
    for loc, row in test_df.iterrows():
        # For regular method.
        if row['y'] == "yes":
            num_succ += 1
        num_calls += row['campaign']
#         -----------------------------------------------------
        # For optimal method.
        # We have the exact values for each of the following:
        jb_query = convert(find_matching_attribute_comb(str(row['job']), job_cmbs))
        mt_query = convert(find_matching_attribute_comb(str(row['marital']), marital_cmbs))
        ec_query = convert(find_matching_attribute_comb(str(row['education']), education_cmbs))
        house_query = convert(find_matching_attribute_comb(str(row['housing']), housing_cmbs))
        loan_query = convert(find_matching_attribute_comb(str(row['loan']), loan_cmbs))
        def_query = convert(find_matching_attribute_comb(str(row['default']), default_cmbs))
        ##########################
        no_calls = row['campaign']
        # The balance and age are within ranges so we need to find the matching query.
        ##########################
        balance = row['balance']
        bal_query = None
        age = row['age']
        age_query = None
        for age_q in age_query_strings:
            if eval(age_q):
                age_query = age_q
        for bal_q in balance_query_strings:
            if eval(bal_q):
                bal_query = bal_q
        key_to_find = (ec_query, jb_query, mt_query, def_query, loan_query, house_query, bal_query, age_query)
        if key_to_find in combs_to_consider.keys():
            fs = combs_to_consider[key_to_find]
            if fs['rate'] >= 0.5:
                if row['y'] == "yes":
                    if no_calls <= fs['max_loc'] +1:
                        num_succ_optimal += 1
                        num_calls_optimal += no_calls
                    else:
                        num_bad_cons += 1
                        num_calls_optimal += fs['max_loc'] +1
                else:
                    if no_calls <= fs['max_loc'] +1:
                        num_calls_optimal += no_calls
                    else:
                        num_good_cons += 1
                        num_calls_optimal += fs['max_loc'] +1
    
    all_non_optimal_ratios.append(num_succ/num_calls)
    all_optimal_ratios.append(num_succ_optimal/num_calls_optimal)
    all_non_optimal_calls.append(num_calls)
    all_optimal_calls.append(num_calls_optimal)
    print("Non-Optimized: ", num_succ, num_calls, num_succ/num_calls)
    print("Optimized: ", num_succ_optimal, num_calls_optimal, num_succ_optimal/num_calls_optimal)
    print("Bad - In FS: ", num_bad_cons)
    print("Good - In FS: ", num_good_cons)
    print("Num Rows: ", len(test_df))
    print("\n")   

(44967, 17)


KeyboardInterrupt: 

In [48]:
mean_non_optimal = statistics.mean(all_non_optimal_ratios)
mean_optimal = statistics.mean(all_optimal_ratios)
print("Mean Non-Optimal: ", mean_non_optimal)
print("Mean Optimal: ", mean_optimal)

Mean Non-Optimal:  0.05399037152894486
Mean Optimal:  0.07614002096624461


In [29]:
age = 45
for age_q in age_query_strings:
    if eval(age_q):
        print("Yes!")
        print(age_q)

Yes!
age >= 26 & age <=60


#### This section computes the metric for the purpose of determining the groupings within each feature (  ..... need to finish for the remainder of features).

In [37]:
# Age.
all_age_query_strings = ['age >= 10 & age <= 32', 'age >= 33 & age <= 40', 'age >= 50 & age <= 59', 'age >= 60']
for age_query in all_age_query_strings:
    df_filtered_final = mkt_df_filtered.query(age_query)
    print(df_filtered_final.shape)
    print(age_query, compute_metric(df_filtered_final))

(11039, 17)
age >= 10 & age <= 32 0.057239670625956425
(13533, 17)
age >= 33 & age <= 40 0.0387338725476477
(8375, 17)
age >= 50 & age <= 59 0.03437390389337075
(1783, 17)
age >= 60 0.1465917419985341


In [None]:
# Age.
all_age_query_strings = ['age >= 10 & age <= 19', 'age >= 20 & age <= 29', 'age >= 30 & age <= 39', 'age >= 40 & age <= 49', 'age >= 50 & age <= 59','age >= 60 & age <= 69', 'age >= 70 & age <= 79', 'age >= 80']
for age_query in all_age_query_strings:
    df_filtered_final = mkt_df_filtered.query(age_query)
    print(df_filtered_final.shape)
    print(age_query, compute_metric(df_filtered_final))

In [4]:
# Balance
all_bal_query_strings = ['balance >= -100000 & balance <= -1', 'balance >= 0 & balance < 1000', 'balance >= 1000 & balance < 2000', 'balance >= 2000 & balance < 3000', 'balance >= 3000 & balance < 4000','balance >= 4000 & balance < 5000', 'balance >= 5000 & balance < 6000', 'balance >= 6000 & balance < 7000', 'balance >= 7000 & balance < 8000', 'balance >= 8000 & balance < 9000', 'balance >= 9000 & balance < 10000','balance >= 10000 & balance < 11000', 'balance >= 11000 & balance < 12000', 'balance >= 12000 & balance < 13000', 'balance >= 13000 & balance < 14000', 'balance >= 14000 & balance < 15000', 'balance >= 15000 & balance < 16000', 'balance >= 16000 & balance < 17000','balance >= 17000 & balance < 18000', 'balance >= 18000 & balance < 19000', 'balance >= 19000 & balance < 19000', 'balance >= 20000']
for bal_query in all_bal_query_strings:
    df_filtered_final = mkt_df_filtered.query(bal_query)
    print(bal_query, len(df_filtered_final), compute_metric(df_filtered_final))


balance >= -100000 & balance <= -1 3731 0.020823004462072386
balance >= 0 & balance < 1000 26650 0.03984136840916789
balance >= 1000 & balance < 2000 6114 0.05351934051997464
balance >= 2000 & balance < 3000 2880 0.0680968858131488
balance >= 3000 & balance < 4000 1708 0.0689893862482695
balance >= 4000 & balance < 5000 1049 0.06461086637298091
balance >= 5000 & balance < 6000 735 0.06534547402249598
balance >= 6000 & balance < 7000 481 0.04906542056074766
balance >= 7000 & balance < 8000 341 0.0671217292377702
balance >= 8000 & balance < 9000 290 0.052269601100412656
balance >= 9000 & balance < 10000 162 0.058823529411764705
balance >= 10000 & balance < 11000 151 0.09217877094972067
balance >= 11000 & balance < 12000 108 0.08365019011406843
balance >= 12000 & balance < 13000 95 0.09821428571428571
balance >= 13000 & balance < 14000 80 0.035897435897435895
balance >= 14000 & balance < 15000 42 0.058823529411764705
balance >= 15000 & balance < 16000 42 0.03418803418803419
balance >= 160

In [6]:
# Balance
all_bal_query_strings = [(-10000, 0), (0, 1000), (1000, 2000), (2000, 3000)]
compute_metric_for_each_attribute_range(all_bal_query_strings, train_df, 'balance')

NameError: name 'compute_metric_for_each_attribute_range' is not defined

In [12]:
# This is to determine the maximum number of calls we should stop at!
all_ratios_calls = []
for i in range(1,57):
    query_str = 'campaign == ' + str(i)
    call_query_data = mkt_df_filtered.query(query_str)
    succ = 0
    calls = 0
    for lc, rw in call_query_data.iterrows():
        if rw['y'] == "yes":
            succ += 1
        calls += rw['campaign']
    all_ratios_calls.append(div(succ, calls))
for index, value in enumerate(all_ratios_calls):
    print(index+1, value)
plot_graph_new(all_ratios_calls, 56, True)

[0.14597583219334245, 0.056017592962814874, 0.037312081144720156, 0.022501419647927314, 0.015759637188208615, 0.011877097856958431, 0.00913508260447036, 0.007407407407407408, 0.007135575942915392, 0.005263157894736842, 0.007236544549977386, 0.002150537634408602, 0.003470213996529786, 0.0030721966205837174, 0.0031746031746031746, 0.0015822784810126582, 0.005115089514066497, 0.0, 0.0, 0.0011627906976744186, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
