In [1]:
# Imports

import os
import math
import random
import operator as op
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math, itertools
import statistics
import json
import hdbscan

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics.pairwise import euclidean_distances
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from matplotlib.ticker import StrMethodFormatter

from scipy.spatial import ConvexHull

In [2]:
# Helper Functions

def load_file(data_file_path):
    """Read a semicolon-delimited CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_file_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame holding the parsed file.
    """
    return pd.read_csv(data_file_path, delimiter=";")
  
    
def plot_graph_new(results, max_calls, list_passed, title, name = "1"):
    """Plot the success-per-call rate against the call number and save it as a PDF.

    Parameters
    ----------
    results : either the y-values themselves (when ``list_passed`` is truthy)
        or a sequence of dicts whose ``'expected'`` entry is plotted.
    max_calls : number of call numbers on the x-axis (1 .. max_calls).
    list_passed : truthy when ``results`` already holds the y-values.
    title : figure title.
    name : file stem; the figure is written to ``str(name) + ".pdf"``.
    """
    call_numbers = list(range(1, max_calls + 1))
    rates = results if list_passed else [results[idx]['expected'] for idx in range(max_calls)]
    print(rates)

    plt.title(title)
    plt.plot(call_numbers, rates, linewidth=2)
    plt.xlabel("Call Number")
    plt.ylabel("Success Per Call Rate")
    # Fixed y-range so that figures from different runs are directly comparable.
    plt.ylim(0, 0.4)
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.3f}'))
    plt.xticks(np.arange(1, max_calls+1, 1))

    # The figure is written to disk and closed rather than shown inline.
    plt.savefig(str(name) + ".pdf")
    plt.close()
    

def div(a,b):
    """Divide ``a`` by ``b``, returning 0.0 instead of raising when ``b`` is zero.

    The divisor is compared with zero directly; truncating it with ``int()``
    would wrongly treat every divisor strictly between -1 and 1 (e.g. 0.5)
    as zero.
    """
    if b == 0:
        return 0.0
    return a/b
    

# Used for creating all possible combinations of the features.
def powerset(iterable):
    """Return an iterator over every subset of ``iterable`` as tuples.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    The input is materialised into a list first so that one-shot iterators
    (which have no ``len``) are supported as well.
    """
    s = list(iterable)
    # ``combinations`` lives in itertools; the module is imported, the bare name is not.
    return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s)+1))


def convert(list):
    """Return the elements of the given iterable as a tuple (hashable, usable as a dict key)."""
    return (*list,)


def construct_dict(feature_comb):
    """Turn one positional feature combination into a dict of label tuples.

    ``feature_comb`` is indexed positionally: 0 -> education, 1 -> marital,
    2 -> job, 3 -> default, 4 -> loan, 5 -> housing. Each entry is converted
    to a tuple with ``convert``.
    """
    # (dict key, position in feature_comb), listed in the order the keys are inserted.
    layout = (
        ('education', 0),
        ('job', 2),
        ('marital', 1),
        ('default', 3),
        ('loan', 4),
        ('housing', 5),
    )
    return {label: convert(feature_comb[position]) for label, position in layout}


# This is the new metric (success per call rate).
def compute_expected_succ_per_call_rate_feature_set(fs_df, no_calls_considered):
    """Compute the success-per-call rate for every call cap from 1 to ``no_calls_considered``.

    For a cap ``i`` each row of ``fs_df`` contributes ``min(campaign, i)`` calls,
    and counts as a success only when ``y == "yes"`` and ``campaign <= i``.

    Parameters
    ----------
    fs_df : DataFrame with a ``'campaign'`` column (calls made, treated as an
        integer count) and a ``'y'`` column (``"yes"`` marks a success).
    no_calls_considered : largest call cap to evaluate.

    Returns
    -------
    list of dicts, one per cap (index ``i - 1``), each with the keys
    ``'succ'``, ``'total_calls'`` and ``'expected'`` (``succ / total_calls``,
    or 0.0 when no calls were made).
    """
    # Pull the two columns out once; the previous version rebuilt every row
    # with iterrows() for every call cap.
    calls = fs_df['campaign'].to_numpy()
    converted = (fs_df['y'] == "yes").to_numpy()

    expected_values_call_nums = []
    for cap in range(1, no_calls_considered + 1):
        reached = calls <= cap
        # Cast to plain ints so the result holds the same Python types as before.
        succ = int(np.count_nonzero(converted & reached))
        total_calls = int(np.minimum(calls, cap).sum())
        expected_values_call_nums.append({'succ': succ,
                                          'total_calls': total_calls,
                                          'expected': div(succ, total_calls)})
    return expected_values_call_nums


def compute_optimal_call_no(results):
    """Return the index of the first entry with the highest ``'expected'`` value.

    Returns -1 when that index is 0 and its expected value is 0.0, which is
    what happens when every entry is zero.
    """
    rates = [entry['expected'] for entry in results]
    best_index = rates.index(max(rates))
    nothing_found = best_index == 0 and rates[0] == 0.0
    return -1 if nothing_found else best_index


# Given a dictionary of what attributes comprise a feature set, we can get all rows corresponding to this feature set.
def extract_rows_feature_set(fs_df, feature_labels = {'education':['tertiary', 'unknown'], 
                                                      'job':['management', 'technician', 'blue-collar'], 
                                                      'marital':['single'], 'default':['no'], 
                                                      'housing':['no'], 'loan':['no']}):
    """Return the rows of ``fs_df`` that match every attribute of a feature set.

    Parameters
    ----------
    fs_df : DataFrame containing one column per key of ``feature_labels``.
    feature_labels : dict mapping a column name to the list/tuple of accepted
        labels for that column. A row is kept when, for every key, its value
        is one of the accepted labels. The default dict is only read, never
        modified.

    Returns
    -------
    A new DataFrame with the matching rows; ``fs_df`` itself is not modified.
    An empty label list yields an empty result.
    """
    for key, labels in feature_labels.items():
        # Membership test instead of a string-built query: labels containing
        # quotes and empty label lists no longer break the expression.
        fs_df = fs_df[fs_df[key].isin(list(labels))]
    return fs_df


def find_matching_attribute_comb(row_value, all_combs):
    """Return the last combination in ``all_combs`` that contains ``row_value``.

    Returns ``None`` when no combination contains the value.
    """
    matches = [comb for comb in all_combs if any(item == row_value for item in comb)]
    # Later matches win over earlier ones.
    return matches[-1] if matches else None


def compute_metric(df):
    """Return successes per call over all rows of ``df``.

    A row counts as a success when its ``'y'`` value is ``"yes"``; the number
    of calls is the sum of the ``'campaign'`` column. Returns 0.0 (via ``div``)
    when there are no calls.
    """
    customers = [row for _, row in df.iterrows()]
    total_successes = sum(1 for row in customers if row['y'] == "yes")
    total_calls = sum(row['campaign'] for row in customers)
    return div(total_successes, total_calls)

def compute_metric_2(df, max_calls=None):
    """Return successes per call over ``df``, optionally capping the calls per customer.

    The previous body called ``min(row['campaign'], )`` with a single scalar,
    which raises ``TypeError`` for every non-empty frame. The missing second
    argument is exposed as ``max_calls``.

    Parameters
    ----------
    df : DataFrame with ``'campaign'`` (calls made) and ``'y'`` columns.
    max_calls : optional cap on the calls counted per customer. With the
        default ``None`` no cap is applied and the result equals
        ``compute_metric(df)``.
        NOTE(review): with a cap, a ``"yes"`` row is only counted as a success
        when ``campaign <= max_calls``, mirroring
        ``compute_expected_succ_per_call_rate_feature_set`` -- confirm this is
        the intended metric.

    Returns
    -------
    ``total_successes / total_calls``, or 0.0 when no calls were made.
    """
    total_calls = 0
    total_successes = 0
    for _, row in df.iterrows():
        calls = row['campaign']
        within_cap = max_calls is None or calls <= max_calls
        if row['y'] == "yes" and within_cap:
            total_successes += 1
        total_calls += calls if within_cap else max_calls
    return div(total_successes, total_calls)


def compute_metric_for_each_attribute(all_values, df, attrib):
    """Compute ``compute_metric`` separately for every value of one categorical column.

    Parameters
    ----------
    all_values : the values of column ``attrib`` to evaluate.
    df : DataFrame to filter.
    attrib : name of the column being filtered on.

    Returns
    -------
    numpy array of shape ``(len(all_values), 1)``; row ``k`` holds the metric
    of the rows where ``attrib`` equals ``all_values[k]``.
    """
    metric_vals = np.zeros(shape=(len(all_values),1))
    for position, label in enumerate(all_values):
        matching_rows = df.query("{0} == '{1}'".format(attrib, label))
        metric_vals[position] = compute_metric(matching_rows)
    return metric_vals


def compute_metric_for_each_attribute_range(all_values, df, attrib):
    """Compute ``compute_metric`` for every half-open range of one numeric column.

    Parameters
    ----------
    all_values : sequence of ``(low, high)`` pairs; a row matches a pair when
        ``low <= attrib < high``.
    df : DataFrame to filter.
    attrib : name of the column being filtered on.

    Returns
    -------
    ``(metric_vals, query_strings)``: a numpy array of shape
    ``(len(all_values), 1)`` with one metric per range, and the list of query
    strings that were used, in the same order.
    """
    metric_vals = np.zeros(shape=(len(all_values),1))
    query_strings = []
    for position, bounds in enumerate(all_values):
        range_query = "{0} >= {1} & {2} < {3}".format(attrib, bounds[0], attrib, bounds[1])
        metric_vals[position] = compute_metric(df.query(range_query))
        query_strings.append(range_query)
    return metric_vals, query_strings


def find_combinations(sub_attributes, ratios):
    """Group sub-attributes whose metric values are similar, using K-Means.

    K-Means is fitted on ``ratios`` for every cluster count from 2 to
    ``len(ratios) - 1``; the clustering with the highest silhouette score is
    kept (the smallest cluster count wins ties).

    Parameters
    ----------
    sub_attributes : labels, aligned position by position with ``ratios``.
    ratios : numpy array of metric values (reshaped to a single column).

    Returns
    -------
    list of lists: one list per cluster, holding the ``str()`` of each
    sub-attribute assigned to that cluster.
    """
    column = ratios.reshape(-1,1)

    # Try every cluster count from 2 to n-1 and score each clustering.
    candidates = []
    for cluster_count in range(2, len(ratios)):
        labels = KMeans(n_clusters = cluster_count).fit(column).labels_
        score = silhouette_score(column, labels, metric='euclidean')
        candidates.append((score, labels, cluster_count))

    # The silhouette score decides the ideal number of centroids.
    ranked = sorted(candidates, key=lambda candidate: candidate[0], reverse = True)
    best_labels = ranked[0][1]
    best_count = ranked[0][2]

    # Sub-attributes sharing a cluster label are aggregated together.
    grouped = [[] for _ in range(best_count)]
    for position, cluster in enumerate(best_labels):
        grouped[cluster].append(str(sub_attributes[position]))
    return grouped

# Example call; the function takes exactly two arguments (the sub-attribute labels and their metric values):
# find_combinations(['a', 'b', 'c', 'd'], np.array([1, 4, 7, 90]))

def find_all_cust_feature_set(fs, df):
    """Return the rows of ``df`` that belong to the feature set described by ``fs``.

    ``fs`` is indexed positionally: 0 education, 1 job, 2 marital, 3 default,
    4 loan, 5 housing (each a collection of accepted labels), followed by two
    pandas query strings at positions 6 and 7 that are applied first.
    """
    labels = {
        'education': fs[0],
        'job': fs[1],
        'marital': fs[2],
        'default': fs[3],
        'loan': fs[4],
        'housing': fs[5],
    }
    # Apply the two query strings first, then the categorical label filters.
    narrowed = df.query(fs[6]).query(fs[7])
    return extract_rows_feature_set(narrowed, labels)


def construct_hull_points(results, max_calls):
    """Return the sorted indices of the call numbers lying on the convex hull.

    Each of the first ``max_calls`` entries of ``results`` becomes the point
    ``(total_calls, succ)``. The index ``max_calls - 1`` is always included in
    the result, whether or not it is a hull vertex.

    Parameters
    ----------
    results : sequence of dicts with the keys ``'succ'`` and ``'total_calls'``.
    max_calls : number of leading entries of ``results`` to use.

    Returns
    -------
    Sorted list of integer indices, or ``False`` when the hull cannot be
    built (for example fewer than three points, or all points collinear).
    """
    pts = np.array([[results[x]['total_calls'], results[x]['succ']] for x in range(0, max_calls)])
    try:
        verts = ConvexHull(pts).vertices
    except Exception:
        # Degenerate input is an expected outcome that callers test for with
        # ``is False``. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt and SystemExit propagate.
        return False
    last_index = max_calls - 1
    if not np.isin(last_index, verts):
        verts = np.append(last_index, verts)
    return np.sort(verts).tolist()

    
def gradient_update(key, fs_pick):
    """Refresh the gradient of one feature set in ``fs_pick``, in place.

    The entry ``fs_pick[key]`` is read for ``'results'``, ``'hull_points'``,
    ``'loc'`` and ``'max_num_pts'``. While ``loc`` does not exceed
    ``max_num_pts`` the ``'grad'`` entry is set to the success/call slope
    between the previous hull point (or the origin when ``loc`` is 0) and the
    current one; otherwise the entry is marked ``'finished'``.
    """
    state = fs_pick[key]
    results = state['results']
    hull = state['hull_points']
    step = state['loc']

    # Every hull point has been consumed: nothing left to call for this feature set.
    if step > state['max_num_pts']:
        state['finished'] = True
        return

    current = results[hull[step]]
    if step == 0:
        slope = div(current['succ'], current['total_calls'])
    else:
        previous = results[hull[step - 1]]
        slope = div(current['succ'] - previous['succ'],
                    current['total_calls'] - previous['total_calls'])
    state['grad'] = slope

        
def get_features(row, feature_names):
    """Return the names in ``feature_names`` whose flag in ``row`` is 1.

    ``row`` is indexed by position and each value is converted with ``int()``
    before being compared with 1.
    """
    return [name for position, name in enumerate(feature_names) if int(row[position]) == 1]

In [3]:
# Helper functions for constructing the data required for each of the approaches (Baseline and Gradient Ascent) 
# to be executed successfully.

def construct_feature_combs(train_df):
    """Build all candidate feature sets and the bookkeeping used by the calling strategies.

    Besides ``train_df`` this function reads several notebook-level globals:
    ``age_query_strings``, ``balance_query_strings``, ``cp`` (minimum number of
    training rows a feature set needs), ``max_calls`` and ``test_df``. They
    must be defined before it is called.

    Returns
    -------
    ``(combs_to_consider, fs_pick)``: two dicts sharing the same keys, where a
    key is the tuple ``(education, job, marital, default, loan, housing,
    bal_query, age_query)``.
    ``combs_to_consider[key]`` holds the per-call results computed on the
    training rows and the matching ``test_df`` rows (``'fs_customers'``).
    ``fs_pick[key]`` holds the gradient state: ``'grad'``, ``'loc'``,
    ``'finished'``, ``'hull_points'``, ``'max_num_pts'``, ``'results'`` and
    ``'fs_customers'``.
    """
    # At this point, we can run computations for the success rate of each sub attribute and join
    # the sub-attributes based on the output of k-means.
    poss = []

    # Education.
    all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
    metric_vals = compute_metric_for_each_attribute(all_ed, train_df, 'education')
    education_cmbs = find_combinations(all_ed, metric_vals)

    # Occupation.
    all_jobs = ['student', 'retired', 'unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
    metric_vals = compute_metric_for_each_attribute(all_jobs, train_df, 'job')
    job_cmbs = find_combinations(all_jobs, metric_vals)

    # Marital.
    all_ms = ['married', 'single', 'divorced']
    metric_vals = compute_metric_for_each_attribute(all_ms, train_df, 'marital')
    marital_cmbs = find_combinations(all_ms, metric_vals)

    # Default (binary, so no clustering; all_def is not used afterwards).
    all_def = ['no', 'yes']
    default_cmbs = [['no'], ['yes']]

    # Loan (binary; all_ln is not used afterwards).
    all_ln = ['no', 'yes']
    loan_cmbs = [['no'], ['yes']]

    # Housing (binary; all_hs is not used afterwards).
    all_hs = ['no', 'yes']
    housing_cmbs = [['no'], ['yes']]

    # The order here (education, marital, job, default, loan, housing) is the
    # positional layout that construct_dict indexes into.
    poss.append(education_cmbs)
    poss.append(marital_cmbs)
    poss.append(job_cmbs)
    poss.append(default_cmbs)
    poss.append(loan_cmbs)
    poss.append(housing_cmbs)
    all_combs = list(itertools.product(*poss))

    # print("Number of combinations: ", len(all_combs)* len(age_query_strings) * len(balance_query_strings))

    # We can now go ahead and generate the feature sets based on what was done previously.
    num_iter = 0
    combs_to_consider = {}
    fs_pick = {}

    # Setting up looping structures to generate all possibilities.
    for age_query in age_query_strings:
        df_filtered_final = train_df.query(age_query)
        for bal_query in balance_query_strings:
            df_filtered_final_2 = df_filtered_final.query(bal_query)
            for comb in all_combs:
                dict_final_query = construct_dict(comb)
                num_iter += 1
                extracted_df = extract_rows_feature_set(df_filtered_final_2, dict_final_query)
                # Positions 6 and 7 of the key are the query strings that
                # find_all_cust_feature_set re-applies to the test set.
                key = (dict_final_query['education'], dict_final_query['job'], 
                       dict_final_query['marital'], dict_final_query['default'], 
                       dict_final_query['loan'], dict_final_query['housing'], 
                       bal_query, age_query)
                n_rows = extracted_df.shape[0]
                # Feature sets with fewer than cp training rows are dropped.
                if n_rows >= cp:
                    results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
#                             max_loc = compute_optimal_call_no(results)
#                             if max_loc != -1:
                    combs_to_consider[key] = {
                                                'max_loc':0,
                                                'best_rate':0, 
                                                'overall_rate':results[max_calls-1]['expected'], 
                                                'n_rows':n_rows, 
                                                'results':results,
                                                'fs_customers':None
                                             }
                    fs_pick[key] = {'grad': 0.0, 
                                    'loc':0, 
                                    'finished':False, 
                                    'hull_points':None,
                                    'max_num_pts': -1,
                                    'results':None,
                                    'fs_customers' :None
                                   }
#               else:
#                   print("Invalid FS ! -> ", n_rows)
    # Second pass: attach the matching test customers and the convex-hull
    # points of the training results to every retained feature set.
    for fs_key in combs_to_consider.keys():
        fs_customers = find_all_cust_feature_set(fs_key, test_df)
        combs_to_consider[fs_key]['fs_customers'] = fs_customers
        res = construct_hull_points(combs_to_consider[fs_key]['results'], max_calls)
        fs_pick[fs_key]['results'] = combs_to_consider[fs_key]['results']
        fs_pick[fs_key]['fs_customers'] = fs_customers
        if res is False:
            # No hull could be built: the feature set is excluded from gradient ascent.
            print("Invalid Convex Hull Assignment")
            fs_pick[fs_key]['finished'] = True
        else:
            fs_pick[fs_key]['hull_points'] = res
            fs_pick[fs_key]['max_num_pts'] = len(res) - 1
    return combs_to_consider, fs_pick


def group_age(row, age_ranges):
    """Map the ``'age'`` of one row to the 1-based position of its age range.

    ``age_ranges`` is a sequence of inclusive ``(low, high)`` bounds. When
    several ranges contain the age the last one wins. When none does, a
    message is printed and ``None`` is returned.
    """
    age = int(row['age'])
    age_val = None
    for position, bounds in enumerate(age_ranges, start=1):
        if bounds[0] <= age <= bounds[1]:
            age_val = position
    if age_val is None:
        print("Failed Assignment for age: ", age)
    return age_val
        

def group_balance(row, balance_ranges):
    """Map the ``'balance'`` of one row to the 1-based position of its balance range.

    ``balance_ranges`` is a sequence of inclusive ``(low, high)`` bounds. When
    several ranges contain the balance the last one wins. When none does, a
    message is printed and ``None`` is returned.
    """
    bal = int(row['balance'])
    bal_val = None
    for position, bounds in enumerate(balance_ranges, start=1):
        if bounds[0] <= bal <= bounds[1]:
            bal_val = position
    if bal_val is None:
        print("Failed Assignment for balance: ", bal)
    return bal_val

In [4]:
# Functions representing each approach taken.

def call_everyone(test_df):
    """Baseline: call every customer of ``test_df`` in random order.

    Customers are visited in a random permutation. Before a customer is
    processed, if the running number of calls has reached the next check
    point, the current ``(successes, calls)`` pair is recorded and the check
    point advances by one (at most one check point per customer). The final
    totals are always appended at the end.

    Parameters
    ----------
    test_df : DataFrame with ``'campaign'`` (calls made) and ``'y'`` columns.

    Returns
    -------
    ``(result_ratios_p1, num_calls)``: the recorded ``(successes, calls)``
    pairs and the total number of calls.
    """
    print("Call all Customers Approach")
    # Check points: every 50 calls up to 500, then every 500 calls up to 139500.
    call_check_points = list(range(50, 501, 50)) + list(range(1000, 139501, 500))
    result_ratios_p1 = []
    cp_loc = 0
    num_succ = 0
    num_calls = 0
    res = test_df.reindex(np.random.permutation(test_df.index))
    for loc, row in res.iterrows():
        # Guard the index: once every check point has been passed there is
        # nothing more to record until the final totals.
        if cp_loc < len(call_check_points) and num_calls >= call_check_points[cp_loc]:
            cp_loc += 1
            result_ratios_p1.append((num_succ, num_calls))
        num_calls += row['campaign']
        if row['y'] == "yes":
            num_succ += 1
    result_ratios_p1.append((num_succ, num_calls))
    return result_ratios_p1, num_calls


def greedy_approach(combs_to_consider):
    """Call feature sets in descending order of their ``'overall_rate'``.

    Every customer in a feature set's ``'fs_customers'`` frame is called
    ``campaign`` times; after each feature set the cumulative
    ``(successes, calls)`` pair is recorded.

    Returns
    -------
    list of cumulative ``(successes, calls)`` tuples, one per feature set.
    """
    print("Greedy Approach")
    ranked_sets = sorted(combs_to_consider.values(),
                         key=lambda fs_info: fs_info['overall_rate'],
                         reverse = True)
    successes = 0
    calls = 0
    result_ratios_p2 = []
    for fs_info in ranked_sets:
        for _, customer in fs_info['fs_customers'].iterrows():
            calls += customer['campaign']
            successes += 1 if customer['y'] == "yes" else 0
        result_ratios_p2.append((successes, calls))
    return result_ratios_p2


def convex_hull(fs_pick, num_calls):
    """Gradient-ascent calling strategy over the convex-hull points of each feature set.

    Repeatedly picks the unfinished feature set with the largest gradient,
    simulates the calls between its previous and current hull point on its
    ``'fs_customers'`` rows, records the cumulative ``(successes, calls)``
    pair, then advances that feature set and re-sorts all feature sets.

    Parameters
    ----------
    fs_pick : dict of per-feature-set state (keys read here: ``'grad'``,
        ``'loc'``, ``'finished'``, ``'hull_points'``, ``'fs_customers'``).
        It is updated in place: ``'loc'`` is incremented and ``'grad'`` /
        ``'finished'`` are refreshed through ``gradient_update``.
    num_calls : call budget; the loop stops once the total number of calls
        exceeds it, or when every feature set is finished.

    Returns
    -------
    list of cumulative ``(successes, calls)`` tuples, one per step taken.
    """
    print("Gradient Ascent Approach")
    result_ratios_p4 = []
    total_s = 0
    total_c = 0
    print("Performing initial update .. ")
    for key in fs_pick.keys():
        gradient_update(key, fs_pick)
    print("Perforiming sort .. ")
    # Sort based on gradient.
    optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
    # Call best feature set, update gradient for this feature set and re-sort all feature sets.
    # Rinse and repeat!
    print("Finished sort .. in while loop ")
    while(total_c <= num_calls):
        # Skip over feature sets that have no hull points left.
        best_loc = 0
        while(best_loc < len(optimal_choices) and optimal_choices[best_loc][1]['finished'] == True):
            best_loc += 1
        if best_loc == len(optimal_choices):
            break
        fs_key = optimal_choices[best_loc][0]
        fs_data = optimal_choices[best_loc][1]
        if fs_data['finished'] == False:
            loc = fs_data['loc']
            # Hull points are 0-based indices, call numbers are 1-based:
            # the segment runs from the call after the previous hull point
            # up to the call number of the current hull point.
            if loc == 0:
                call_start = 1
                call_end = fs_data['hull_points'][loc] + 1
            else:
                call_start = fs_data['hull_points'][loc-1] + 2
                call_end = fs_data['hull_points'][loc] + 1
            for call in range(call_start, call_end + 1, 1):
                # Note: this inner loop rebinds ``loc``; the value read above
                # is not needed again after this point.
                for loc, row in fs_pick[fs_key]['fs_customers'].iterrows():
                    if row['campaign'] == call:
                        # This call is the customer's last one; it decides the outcome.
                        total_c += 1
                        if row['y'] == "yes":
                            total_s += 1
                    elif row['campaign'] > call:
                        # The customer needs more calls than this; count the call only.
                        total_c += 1
            result_ratios_p4.append((total_s, total_c))
            fs_pick[fs_key]['loc'] += 1
            gradient_update(fs_key, fs_pick)
            optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
    return result_ratios_p4


def upper_bound(test_df):
    """Best-case ordering: call every successful customer first, cheapest first.

    Successful customers (``y == 'yes'``) are processed by increasing number
    of calls, followed by the unsuccessful ones in the same order. A
    cumulative ``(successes, calls)`` pair is recorded for every call number
    from 1 to the notebook-level ``max_calls``, once per outcome.

    Returns
    -------
    list of ``2 * max_calls`` cumulative ``(successes, calls)`` tuples.
    """
    print("Upper Bound Approach")
    num_succ = 0
    num_calls = 0
    result_ratios_p5 = []
    # Two passes over the call numbers: first the successes, then the failures.
    for outcome_query, is_success in (("y == 'yes'", True), ("y == 'no'", False)):
        outcome_df = test_df.query(outcome_query)
        for x in range(1, max_calls + 1):
            num_cust = len(outcome_df.query("campaign == {0}".format(x)))
            if is_success:
                num_succ += num_cust
            num_calls = num_calls + (num_cust * x)
            result_ratios_p5.append((num_succ, num_calls))
    return result_ratios_p5

    
def compute_ratio_all_users(df, train_indicies):
    """Compute the individual success-per-call ratio for the selected rows of ``df``.

    Parameters
    ----------
    df : DataFrame with ``'campaign'`` and ``'y'`` columns.
    train_indicies : positional row indices (used with ``iloc``).

    Returns
    -------
    list of ``(position, ratio)`` tuples in the order of ``train_indicies``;
    the ratio is ``1 / campaign`` for a ``"yes"`` row and 0.0 otherwise.
    """
    ratio_values = []
    for val in train_indicies:
        customer = df.iloc[val]
        ratio = div(1, customer['campaign']) if customer['y'] == "yes" else 0.0
        ratio_values.append((val, ratio))
    return ratio_values


def compute_freq_percentage(mappings):
    """Fill in the ``'percentage'`` of every entry of ``mappings``, in place.

    Each entry's percentage is its ``'freq'`` divided by the sum of all
    ``'freq'`` values (0.0 when that sum is zero).
    """
    total = sum(entry['freq'] for entry in mappings.values())
    for entry in mappings.values():
        entry['percentage'] = div(entry['freq'], total)
    
    
def new_approach_ratio_grouping(mkt_df_filtered, train_indicies, test_indicies, group_size,
                                age_groupings, balance_groupings, max_calls_considered=20):
    """Group training customers by success ratio and call test customers group by group.

    Steps:
      1. Replace age and balance by the id of their range (on a copy).
      2. Sort the training customers by their individual success-per-call
         ratio and cut them into consecutive groups of ``group_size``.
      3. For every group, count how often each customer profile (job, marital,
         education, default, housing, loan, age group, balance group) occurs
         and compute the group's per-call results.
      4. Assign every test customer to the group in which their profile has
         the highest share.
      5. Call the groups in descending order of their overall rate and record
         the cumulative ``(successes, calls)`` after each group.

    Parameters
    ----------
    mkt_df_filtered : DataFrame with the profile columns plus ``'campaign'``
        and ``'y'``. It is no longer modified: the grouping is applied to a
        copy, so the function can be called repeatedly (e.g. once per fold).
    train_indicies, test_indicies : positional row indices (used with ``iloc``).
    group_size : number of training customers per group.
    age_groupings, balance_groupings : inclusive ``(low, high)`` ranges passed
        to ``group_age`` / ``group_balance``.
    max_calls_considered : largest call number evaluated per group (default 20,
        the value that was previously hard-coded).

    Returns
    -------
    ``(result_ratios, test_calls, groupings)``:
    the cumulative ``(successes, calls)`` tuples, the dict mapping a group key
    to ``{'locs_to_call': [...], 'overall_rate': float}``, and the dict mapping
    a group key to ``{'indicies', 'mappings', 'results'}``.
    """
    profile_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance']
    unmatched_key = 'unmatched'

    # Ensuring that we can encode any row in our dataset. We also group age and balance values
    # from each row into ranges.
    print("Step 1")
    age_ids = []
    balance_ids = []
    for _, row in mkt_df_filtered.iterrows():
        age_ids.append(group_age(row, age_groupings))
        balance_ids.append(group_balance(row, balance_groupings))
    grouped_df = mkt_df_filtered.copy()
    grouped_df['age'] = age_ids
    grouped_df['balance'] = balance_ids
    # One profile string per row position, built the same way for training and
    # test rows so that the two are guaranteed to be comparable.
    profiles = [str(profile) for profile in grouped_df[profile_cols].itertuples(index=False, name=None)]

    # Build the customers for each group.
    print("Step 2")
    ratio_arr = compute_ratio_all_users(grouped_df, train_indicies)
    ratio_arr_sorted = sorted(ratio_arr, key=lambda tup: tup[1], reverse = True)
    groupings = {}
    for rank, (position, _ratio) in enumerate(ratio_arr_sorted):
        group_key = str(int(rank/group_size))
        if group_key not in groupings:
            groupings[group_key] = {'indicies':[], 'mappings':{}, 'results':None}
        groupings[group_key]['indicies'].append(position)
    print(len(groupings.keys()))

    # For each group, we count the feature combinations it contains and store
    # the s/c results for call numbers 1..max_calls_considered.
    test_calls = {}
    print("Step 3")
    for group_key, group in groupings.items():
        users_df = grouped_df.iloc[group['indicies']]
        mappings = group['mappings']
        for position in group['indicies']:
            entry = mappings.setdefault(profiles[position], {'freq':0, 'percentage':0.0})
            # Count every occurrence, including the first one.
            entry['freq'] += 1
        group['results'] = compute_expected_succ_per_call_rate_feature_set(users_df, max_calls_considered)
        compute_freq_percentage(mappings)
        # Store the numeric rate (not the whole result dict) so groups can be sorted.
        test_calls[group_key] = {'locs_to_call':[],
                                 'overall_rate':group['results'][max_calls_considered - 1]['expected']}

    # For the test set, we need to map each user to the most appropriate group.
    print("Step 4")
    for loc in test_indicies:
        user_mapping = profiles[loc]
        best_group_key = None
        best_ratio = 0.0
        for group_key, group in groupings.items():
            match = group['mappings'].get(user_mapping)
            if match is not None and (best_group_key is None or match['percentage'] > best_ratio):
                best_group_key = group_key
                best_ratio = match['percentage']
        if best_group_key is None:
            # Profile never seen in training.
            # NOTE(review): such customers are called last (rate 0.0) so that every
            # test customer is still counted -- confirm this is preferable to dropping them.
            best_group_key = unmatched_key
            if unmatched_key not in test_calls:
                test_calls[unmatched_key] = {'locs_to_call':[], 'overall_rate':0.0}
        test_calls[best_group_key]['locs_to_call'].append(loc)

    # Call users ... groups with the highest overall rate are called first.
    print("Step 5")
    ranked_groups = sorted(test_calls.values(), key=lambda grp: grp['overall_rate'], reverse = True)
    num_succ = 0
    num_calls = 0
    result_ratios = []
    for grp in ranked_groups:
        for cust_loc in grp['locs_to_call']:
            # The stored locations are positional, so iloc (not loc) is required.
            row = grouped_df.iloc[cust_loc]
            if row['y'] == "yes":
                num_succ += 1
            num_calls += row['campaign']
        result_ratios.append((num_succ, num_calls))

    return result_ratios, test_calls, groupings
        
        
def clustering_age_balance_grouped_no_ratio(train_df, test_df, min_cluster_size, balance_groupings, age_groupings):
    """Cluster the one-hot encoded training rows with HDBSCAN and hand the result to ``abstraction_new_approach``.

    NOTE(review): several statements below look inconsistent with the helpers
    defined in this notebook; see the inline notes. ``abstraction_new_approach``
    is not defined in the part of the notebook reviewed here.
    """
    print("In P7")
    # NOTE(review): group_age / group_balance take a single row, convert
    # row['age'] / row['balance'] with int() and return a group id. Here they
    # receive a whole DataFrame and the return value is discarded.
    group_age(train_df, age_groupings)
    group_balance(train_df, balance_groupings)
    encoder = OneHotEncoder()
    # NOTE(review): the training frame is encoded with all of its columns,
    # whereas the test frame below drops 'y' and 'campaign' first.
    train_df_encoded = encoder.fit_transform(train_df).toarray()
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    clusterer.fit(train_df_encoded)
    predictions = clusterer.labels_
    group_age(test_df, age_groupings)
    group_balance(test_df, balance_groupings)
    # NOTE(review): fit_transform re-fits the encoder on the test frame, so the
    # test encoding is not tied to the categories learned from the training frame.
    test_df_encoded = encoder.fit_transform(test_df.drop(columns=['y', 'campaign'])).toarray()
    feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])
#     return None, None, (None, None)
    return abstraction_new_approach(predictions, feature_names, train_df, test_df, train_df_encoded, test_df_encoded)


def clustering_age_balance_grouped_ratio_gradient_ascent(train_df, test_df, min_cluster_size, balance_groupings, age_groupings, num_calls):
    """Cluster encoded training rows plus a ratio column with HDBSCAN, then run ``abstraction_new_approach_gradient_ascent``.

    NOTE(review): several statements below look inconsistent with the helpers
    defined in this notebook; see the inline notes.
    ``abstraction_new_approach_gradient_ascent`` is not defined in the part of
    the notebook reviewed here.
    """
    print("In P8")
    # NOTE(review): group_age / group_balance take a single row and return a
    # group id; here they receive a DataFrame and the result is discarded.
    group_age(train_df, age_groupings)
    group_balance(train_df, balance_groupings)
    # NOTE(review): compute_ratio_all_users as defined in this notebook takes
    # (df, train_indicies) and returns a list of (position, ratio) tuples; this
    # call passes one argument and unpacks two values.
    train_df, ratio_df = compute_ratio_all_users(train_df)
    encoder = OneHotEncoder()
    train_df_encoded = encoder.fit_transform(train_df.drop(columns=['y', 'campaign'])).toarray()
    # The per-customer ratio is appended as an extra column for clustering.
    train_df_encoded = np.column_stack((train_df_encoded, ratio_df['ratio'].to_numpy()))
    feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    clusterer.fit(train_df_encoded)
    predictions = clusterer.labels_
    # NOTE(review): the test_df argument is overwritten here from the
    # notebook-level globals mkt_df_filtered and test_index.
    test_df = mkt_df_filtered.iloc[test_index]
    group_age(test_df, age_groupings)
    group_balance(test_df, balance_groupings)
    # NOTE(review): fit_transform re-fits the encoder on the test frame.
    test_df_encoded = encoder.fit_transform(test_df.drop(columns=['y', 'campaign'])).toarray()
#     return None, None, (None, None)
    return abstraction_new_approach_gradient_ascent(predictions, feature_names, train_df, test_df, train_df_encoded, test_df_encoded, num_calls)

In [8]:
%%time
# Code that sets up values to construct all possible feature combinations.
# The names defined in this cell are notebook-level globals: construct_feature_combs
# reads age_query_strings, balance_query_strings and max_calls, and upper_bound reads
# max_calls. This cell therefore has to run before the main loop.

# Age query strings (pandas query expressions; earlier alternatives are kept commented out).
# age_query_strings = ['age < 26','age >= 26 & age <=60','age >60']
# age_query_strings = ['age >= 10 & age <= 32', 'age >= 33 & age <= 40', 'age >= 50 & age <= 59', 'age >= 60']
age_query_strings = ['age >= 10 & age <= 34', 'age >= 35 & age <= 45', 'age >= 46']

# Balance query strings (pandas query expressions).
balance_query_strings = ['balance <= 450',' balance > 450']

# Balance range config: inclusive (low, high) bounds as consumed by group_balance.
balance_ranges_1 = [(-10000, -1), (0, 550), (551, 105000)]
balance_ranges_2 = [(-10000, 450), (451, 105000)]

# Age range config: inclusive (low, high) bounds as consumed by group_age.
age_ranges_1 = [(18,30), (31,40), (41,50), (51,60), (61,100)]
age_ranges_2 = [(10, 34), (35, 45), (46, 100)]

# Max call number to consider.
max_calls = 20

# Load the data from the current working directory and keep only customers
# with between 1 and max_calls calls.
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls)]
# Column order matters: later cells address 'age' and 'balance' by position (6 and 7).
mkt_df_filtered = mkt_df_filtered[['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance', 'campaign', 'y']]
# Same rows without the 'campaign' and 'y' columns.
mkt_df_filtered_kmeans = mkt_df_filtered[['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance']]
# Shape of the unfiltered frame.
print(mkt_df.shape)

(45211, 17)
CPU times: user 41.9 ms, sys: 3.03 ms, total: 44.9 ms
Wall time: 44.2 ms


In [5]:
%%time

# Main code ... orchestrates everything!
# Requires the configuration cell (mkt_df_filtered, age_ranges_2, balance_ranges_2, ...)
# to have been run first; the NameError recorded below this cell comes from running
# it before mkt_df_filtered was defined.

# Splitting dataframe into data and result dataframes.
X = mkt_df_filtered.iloc[:,0:len(mkt_df_filtered.columns)-1]
y = mkt_df_filtered.iloc[:,-1]

# cut_points = [10, 20]

# cp is read as a global by construct_feature_combs (minimum rows per feature set).
cut_points = [20]

for cp in cut_points:

    for j in range(1,2):
        
        phase_batch = {}
        # No random_state is given, so the folds differ between runs.
        kf = KFold(n_splits=5, shuffle=True)
        i = 0
        for train_index, test_index in kf.split(X):
            i += 1
            
            print("At fold number: ", i)

            train_df = mkt_df_filtered.iloc[train_index]
            test_df = mkt_df_filtered.iloc[test_index]
            
#             combs_to_consider, fs_pick = construct_feature_combs(train_df)
            
            
#             # Testing Phase 1 -> Baseline test with shuffling of all customers and calling them ..
#             result_ratios_p1, num_calls = call_everyone(test_df)


#             # Testing Phase 2 -> Order how we call customers - based on the overall s/c ratio ..
#             result_ratios_p2 = greedy_approach(combs_to_consider)


#             # Testing Phase 4 -> Convex Hull - Gradient Ascent Approach ..
#             result_ratios_p4 = convex_hull(fs_pick, num_calls)
            
            
#             # Testing Phase 5 -> If we were godlike and knew all ..
#             result_ratios_p5 = upper_bound(test_df)
            print(len(train_index))
            a,b,c = new_approach_ratio_grouping(mkt_df_filtered, train_index, test_index, 500, age_ranges_2, balance_ranges_2)
            
            # Only the first fold is processed. Everything below this break,
            # inside the fold loop, is unreachable; it also refers to
            # result_ratios_p1 .. p8, which are not assigned while the phases
            # above are commented out.
            break

            # Add all results together for this fold.
            
            phase_batch_key = str(j) + "_" + str(i)
            phase_batch[phase_batch_key] = {'p1':result_ratios_p1, 'p2':result_ratios_p2, 
                                            'p4':result_ratios_p4, 'p5':result_ratios_p5,
                                            'p6':result_ratios_p6, 'p7':result_ratios_p7, 
                                            'p8':result_ratios_p8, 'p9':None}
            
            
# 'all_marital_unknown_res_cp_' + str(cp) +'.json'

        # Because of the break above, phase_batch is still empty when it is written.
        with open('ALL_FIX_Ratio_No_RatioGA' + str(cp) +'.json', 'w') as fp:
                json.dump(phase_batch, fp)

NameError: name 'mkt_df_filtered' is not defined

In [None]:
# Scratch: fit a one-hot encoder on the categorical columns of the training fold and
# encode the test rows at positions 1 and 20. Requires train_df / test_df from the main
# loop; res1 and res2 are used by the euclidean_distances cell further down.
enc = OneHotEncoder()
enc.fit(train_df.drop(columns=['y', 'campaign', 'age', 'balance']))
res1 = enc.transform([test_df.drop(columns=['y', 'campaign', 'age', 'balance']).iloc[1]]).toarray()
res2 = enc.transform([test_df.drop(columns=['y', 'campaign', 'age', 'balance']).iloc[20]]).toarray()
# all_rows_enc = encoder.fit_transform(train_df.drop(columns=['y', 'campaign', 'age', 'balance'])).toarray()

In [None]:
# Scratch: overwrite columns 6 ('age') and 7 ('balance') of mkt_df_filtered with their
# group ids, row by row and in place. Not idempotent: a second run would feed the group
# ids back into group_age / group_balance instead of the raw values.
num_rows = len(mkt_df_filtered)
for i in range(0, num_rows):
    row = mkt_df_filtered.iloc[i]
    mkt_df_filtered.iloc[i, 6] = group_age(row, age_ranges_2)
    mkt_df_filtered.iloc[i, 7] = group_balance(row, balance_ranges_2)

In [None]:
# Scratch: walk the test fold by position (test_index comes from the main loop).
print(len(test_index))
for loc in test_index:
    row = mkt_df_filtered.iloc[loc]

In [None]:
# Scratch: look every row up by index label. Note that this rebinds the global y
# that the main cell defines as the label column.
for row in mkt_df_filtered.itertuples():
    y = mkt_df_filtered.loc[row.Index]

In [None]:
# Scratch: iterate with iterrows; afterwards y holds the last row and x its index label.
# This also rebinds the global y defined by the main cell.
for loc, row in mkt_df_filtered.iterrows():
    y = row
    x = loc
print(y)

In [None]:
# Scratch: display the row whose index label is 45210 (a label lookup, not a position).
mkt_df_filtered.loc[45210]

In [None]:
# Scratch: x is only defined by the iterrows scratch cell above (last index label seen).
print(x)

In [None]:
# Scratch: parse a string holding a tuple literal. The input here is a hard-coded
# literal; NOTE(review): eval executes arbitrary code, so it must not be reused on
# external data -- ast.literal_eval is the safe choice for literals.
s="((1, u'Central Plant 1', u'http://egauge.com/'), (2, u'Central Plant 2', u'http://egauge2.com/'))"
eval(s)

In [None]:
# Euclidean distance between the two encoded test rows (res1, res2). The result is a
# matrix; the next cell reads its [0][0] entry.
# (euclidean_distances is already imported in the notebook's import cell.)
from sklearn.metrics.pairwise import euclidean_distances
results = euclidean_distances(res1, res2)

In [None]:
results[0][0]

In [None]:
train_df.columns

In [None]:
# USE THIS BODY OF CODE TO VERIFY RESULTS ... A KFOLD OBJECT MUST EXIST ALREADY.

%%time

# Main code ... orchestrates everything!

# Splitting dataframe into data and result dataframes.
X = mkt_df_filtered.iloc[:,0:len(mkt_df_filtered.columns)-1]
y = mkt_df_filtered.iloc[:,-1]

# cut_points = [10, 20]

cut_points = [20]

# For every cut point and every fold produced by the existing KFold object `kf`, run
# each evaluation phase on the fold and collect the per-phase results in `phase_batch`
# (keyed "<j>_<fold number>"). The batch is written to one JSON file per (cp, j).
# NOTE(review): inside this loop `cp` is only used to build the output file name.
for cp in cut_points:

    for j in range(1,2):
        
        phase_batch = {}
        i = 0
        for train_index, test_index in kf.split(X):
            i += 1
            
            print("At fold number: ", i)

            train_df = mkt_df_filtered.iloc[train_index]
            test_df = mkt_df_filtered.iloc[test_index]
            
            combs_to_consider, fs_pick = construct_feature_combs(train_df)
            
            
            # Testing Phase 1 -> Baseline test with shuffling of all customers and calling them ..
            result_ratios_p1, num_calls = call_everyone(test_df)


            # Testing Phase 2 -> Order how we call customers - based on the overall s/c ratio ..
            result_ratios_p2 = greedy_approach(combs_to_consider)


            # Testing Phase 4 -> Convex Hull - Gradient Ascent Approach ..
            result_ratios_p4 = convex_hull(fs_pick, num_calls)
            
            
            # Testing Phase 5 -> If we were godlike and knew all ..
            result_ratios_p5 = upper_bound(test_df)
            
            
            # --- Preparing Data For Clustering ---
            # We run any type of clustering and then pass the predictions to the clustering function
            # which will compute the results.
#             construct_data_clustering(mkt_df_filtered_kmeans)
            
#             train_df = mkt_df_filtered_kmeans.iloc[train_index]
#             test_df = mkt_df_filtered_kmeans.iloc[test_index]

#             encoder = OneHotEncoder()
#             mkt_df_filtered_kmeans_encoded = encoder.fit_transform(mkt_df_filtered_kmeans).toarray()

#             feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])
            
#             # Testing Phase 6 -> KMeans Clustering
            
#             num_centroids = 450
#             km = KMeans(n_clusters=num_centroids, max_iter = 300)
#             fit_params = km.fit(mkt_df_filtered_kmeans_encoded)
#             predictions = km.predict(mkt_df_filtered_kmeans_encoded)
            
#             result_ratios_p6 = clustering(predictions)
            
            
#             # Testing Phase 6 -> KMeans Clustering with Gradient Ascent
#             result_ratios_p65 = clustering_gradient_ascent(predictions, num_calls)
            
#             # Testing Phase 7 -> HDBScan
#             clusterer = hdbscan.HDBSCAN(min_cluster_size=50)
#             clusterer.fit(mkt_df_filtered_kmeans_encoded)
#             predictions = clusterer.labels_
            
#             result_ratios_p7 = clustering(predictions)
            
#             # Testing Phase 8 -> HDBScan with Gradient Ascent
#             result_ratios_p8 = clustering_gradient_ascent(predictions, num_calls)
            
#             # Testing Phase 9 -> HDBScan Corrected ...
#             construct_data_clustering_ratio(train_df)
#             encoder = OneHotEncoder()
#             train_df_encoded = encoder.fit_transform(train_df).toarray()
#             clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
#             clusterer.fit(train_df_encoded)
#             predictions = clusterer.labels_
#             feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])
              
            # Testing Phase 10 - New Approach
            # The fold frames are re-sliced before each call, presumably because the
            # called functions may modify the frames they receive — TODO confirm.
            print("P9")
            train_df = mkt_df_filtered.iloc[train_index]
            test_df = mkt_df_filtered.iloc[test_index]
            result_ratios_p9, groupings, hits = new_approach_train_set_cluster_only(train_df, test_df)
            
            
            print("P10")
            train_df = mkt_df_filtered.iloc[train_index]
            test_df = mkt_df_filtered.iloc[test_index]
            result_ratios_p10, groupings2, hits2 = new_approach_train_set_cluster_only_2(train_df, test_df)

#             result_ratios_p1 = None
#             result_ratios_p2 = None
#             result_ratios_p3 = None
#             result_ratios_p4 = None
#             result_ratios_p5 = None
            # The clustering phases above are commented out, so their slots are stored as None.
            result_ratios_p6 = None
            result_ratios_p65 = None
            result_ratios_p7 = None
            result_ratios_p8 = None
            # Add all results together for this fold.
            
            phase_batch_key = str(j) + "_" + str(i)
            phase_batch[phase_batch_key] = {'p1':result_ratios_p1, 'p2':result_ratios_p2, 
                                            'p4':result_ratios_p4, 'p5':result_ratios_p5,
                                            'p6':result_ratios_p6, 'p65':result_ratios_p65,
                                            'p7':result_ratios_p7, 'p8':result_ratios_p8,
                                            'p9':result_ratios_p9, 'p10':result_ratios_p10}
            
            
# 'all_marital_unknown_res_cp_' + str(cp) +'.json'

        # Persist all fold results of this (cp, j) run once every fold has been processed.
        with open('DB_Trial_Ratio_No_Ratio_NEW_VALIDATE' + str(cp) +'.json', 'w') as fp:
                json.dump(phase_batch, fp)

In [None]:
print(hits2)

In [None]:
print(type(result_ratios_p10[0][1]))

In [None]:
# Piecing together the new approach ....

In [None]:
# Build the clustering input for the current fold: one-hot encode the training rows
# (without 'y', 'campaign', 'age', 'balance') and append the 'ratio' column returned by
# construct_data_clustering_ratio as one extra numeric feature.
# Relies on train_index / test_index left over from the last executed K-fold loop.
train_df = mkt_df_filtered.iloc[train_index]
test_df = mkt_df_filtered.iloc[test_index]
print(train_df.columns)
train_df, ratio_df = construct_data_clustering_ratio(train_df)
print(train_df.columns)
encoder = OneHotEncoder()
train_df_encoded = encoder.fit_transform(train_df.drop(columns=['y', 'campaign', 'age', 'balance'])).toarray()
# column_stack puts the ratio values in the last column of the encoded matrix.
train_df_encoded = np.column_stack((train_df_encoded, ratio_df['ratio'].to_numpy()))
# clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
# clusterer.fit(train_df_encoded)
# predictions = clusterer.labels_
# feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])

In [None]:
# Cluster the encoded training rows with HDBSCAN; `predictions` holds one cluster label
# per row of train_df_encoded.
clusterer = hdbscan.HDBSCAN(min_cluster_size=25)
clusterer.fit(train_df_encoded)
predictions = clusterer.labels_

In [None]:
# Bucket the training row positions by cluster label. Every bucket also carries an
# (initially empty) 'unique_keys' dictionary and a 'results' slot filled in later.
groupings = {}
for row_pos, cluster_label in enumerate(predictions):
    bucket = groupings.setdefault(
        str(cluster_label), {'indicies': [], 'unique_keys': {}, 'results': None}
    )
    bucket['indicies'].append(row_pos)

In [None]:
# Names of the one-hot output columns, used by get_features() in the following cells.
# NOTE(review): OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replaced by get_feature_names_out) — confirm the pinned version. Also
# confirm that the eight names passed here match the columns the encoder was fitted on.
feature_names = encoder.get_feature_names(['job', 'marital', 'education', 'default', 'housing', 'loan', 'age', 'balance'])

In [None]:
# For every cluster: count how often each customer feature signature occurs among its
# members, then compute the cluster's success-per-call results over 20 calls.
for group_label, group_info in groupings.items():
    member_rows = group_info['indicies']
    signature_counts = group_info['unique_keys']
    for row_pos in member_rows:
        cust_info = train_df.iloc[row_pos]
        signature = str(get_features(train_df_encoded[row_pos], feature_names))
        if signature in signature_counts:
            signature_counts[signature]['#_ocurr'] += 1
        else:
            signature_counts[signature] = {'#_ocurr': 1}
    results = compute_expected_succ_per_call_rate_feature_set(train_df.iloc[member_rows], 20)
    group_info['results'] = results
    print(len(member_rows))

In [None]:
# Encode the test fold for matching against the training clusters.
# NOTE(review): fit_transform() re-fits the shared `encoder` on the test rows (and on a
# different column subset than the training fit) — confirm this is intended rather
# than encoder.transform().
# NOTE(review): the return value of construct_data_clustering is discarded, so it
# presumably modifies test_df in place — verify.
test_df = mkt_df_filtered.iloc[test_index]
construct_data_clustering(test_df)
test_df_encoded = encoder.fit_transform(test_df.drop(columns=['y', 'campaign'])).toarray()

In [None]:
cust_features = get_features(test_df_encoded[100], feature_names)

In [None]:
cust_features

In [None]:
cust_features

In [None]:
# Rank the test customers by the expected success-per-call rate of the training
# cluster whose feature signatures they match, then accumulate (successes, calls) in
# that order. Counter is already imported in the notebook's import cell.

final_call_arr = []
counts = []

# Iterate over every encoded test row instead of a hard-coded row count.
for index in range(len(test_df_encoded)):
    cust_features = get_features(test_df_encoded[index], feature_names)
    feature_key = str(cust_features)
    # One (expected rate at call 20, signature) entry per cluster containing this signature.
    key_hits_refs = [
        (groupings[group]['results'][19]['expected'], feature_key)
        for group in groupings.keys()
        if feature_key in groupings[group]['unique_keys']
    ]
    hits = len(key_hits_refs)
    if hits >= 1:
        # When several clusters match, use the one with the highest expected rate.
        # (Previously the list was sorted but the unsorted first entry was used.)
        best_hit = max(key_hits_refs, key=lambda tup: tup[0])
        final_call_arr.append((best_hit[0], best_hit[1], index))
    counts.append(hits)

sorted_final_call = sorted(final_call_arr, key=lambda tup: tup[0], reverse = True)

total_s = 0
total_c = 0
vals = []
prev_key = sorted_final_call[0][1]
for rec in sorted_final_call:
    index = rec[2]
    row = test_df.iloc[index]
    if row['y'] == "yes":
        total_s += 1
    total_c += row['campaign']
    # A checkpoint is recorded whenever the feature signature changes.
    # NOTE(review): the checkpoint already includes the first customer of the new
    # signature, and the final signature never gets a checkpoint — confirm intended.
    if rec[1] != prev_key:
        vals.append((total_s, total_c))
        prev_key = rec[1]

print(Counter(counts).keys()) # equals to list(set(words))
print(Counter(counts).values()) # counts the elements' frequency

In [None]:
sorted_final_call = sorted(final_call_arr, key=lambda tup: tup[0], reverse = True)

In [None]:
# Walk the ranked customers, keeping running totals of successes and calls, and record
# a (successes, calls) checkpoint every time the feature signature changes.
total_s = 0
total_c = 0
curr_key = None
vals = []
prev_key = sorted_final_call[0][1]
for _rate, signature, index in sorted_final_call:
    row = test_df.iloc[index]
    total_s += 1 if row['y'] == "yes" else 0
    total_c += row['campaign']
    if signature != prev_key:
        vals.append((total_s, total_c))
        prev_key = signature

In [None]:
# Print the cumulative success/call ratio at every 100th checkpoint.
# NOTE(review): the bounds 100..3900 are hard-coded; this raises IndexError when `vals`
# has fewer than 3801 entries.
for i in range(100, 3900, 100):
    print(vals[i][0]/vals[i][1])

In [None]:
vals

In [None]:
# Phase 4 (stand-alone copy): repeatedly pick the unfinished feature set with the
# highest gradient, "place" the calls of its next hull segment, record the running
# (successes, calls) totals and re-sort. Stops once total_c exceeds num_calls - 10 or
# every feature set is marked finished.
print("PHASE 4")
result_ratios_p4 = []
total_s = 0
total_c = 0

# Initialise the gradient of every feature set.
for key in fs_pick.keys():
    gradient_update(key, fs_pick, combs_to_consider)
# Sort based on gradient.
optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]
# Call best feature set, update gradient for this feature set and re-sort all feature sets.
# Rinse and repeat!
while(total_c <= num_calls - 10):
    # Skip over feature sets that are already finished.
    best_loc = 0
    while(best_loc < len(optimal_choices) and optimal_choices[best_loc][1]['finished'] == True):
        best_loc += 1
    if best_loc == len(optimal_choices):
        break
    fs_key = optimal_choices[best_loc][0]
    fs_data = optimal_choices[best_loc][1]
    if fs_data['finished'] == False:
        loc = fs_data['loc']
        # Translate the current hull segment into an inclusive range of call numbers.
        # Presumably hull_points holds 0-based call indices, hence the +1 / +2 — TODO confirm.
        if loc == 0:
            call_start = 1
            call_end = fs_data['hull_points'][loc] + 1
        else:
            call_start = fs_data['hull_points'][loc-1] + 2
            call_end = fs_data['hull_points'][loc] + 1
        for call in range(call_start, call_end + 1, 1):
            # NOTE: this inner loop rebinds `loc`; the segment position is advanced through
            # fs_pick[fs_key]['loc'] below, not through this variable.
            for loc, row in combs_to_consider[fs_key]['fs_customers'].iterrows():
                # A customer whose 'campaign' equals this call number costs one call and
                # may be a success; one with a larger 'campaign' costs one call only.
                # Customers with a smaller 'campaign' are not counted for this call.
                if row['campaign'] == call:
                    total_c += 1
                    if row['y'] == "yes":
                        total_s += 1
                elif row['campaign'] > call:
                    total_c += 1
        result_ratios_p4.append((total_s, total_c))
        fs_pick[fs_key]['loc'] += 1
        gradient_update(fs_key, fs_pick, combs_to_consider)
        optimal_choices = [(k,v) for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['grad'], reverse = True)]

In [None]:
mkt_df_filtered['default'].unique()

In [None]:
persons_to_call_overall = {k: v for k, v in sorted(combs_to_consider.items(), key=lambda fs: fs[1]['overall_rate'], reverse = True)}

In [None]:
# Print the rank and element 3 of every feature-set key, in descending overall-rate order.
# `key_indicies` is only referenced by the commented-out filter below.
key_indicies = [0, 9, 24, 49, 74 , 99]
i = 0
for key in persons_to_call_overall.keys():
    print(i, key[3])
#     if i in key_indicies:
#         print(i, persons_to_call_overall[key]['overall_rate'], key[3]) 
#         print("\n")
    i += 1

In [None]:
# Plot the expected-ratio curves of the segments ranked 113..115 (0-based) when ordered
# by overall rate. The counter must start at 0 here; without it the cell depends on
# whatever value `i` was left with by a previously executed cell.
i = 0
for key in persons_to_call_overall.keys():
    print(i)
    if i >= 113 and i<=115:
#         print(key[3], key[4], key[5])
        print(persons_to_call_overall[key]['results'][19])
        plot_graph_new(persons_to_call_overall[key]['results'], 20, False, "Customer Segment " + str(i+1), "worst_"+str(i))     
    i += 1

In [None]:
plot_graph_new(persons_to_call_overall[key]['results'], 20, False, "Customer Segments 116 - 118  ", "worst_agg")     

In [None]:
# Plot the expected-ratio curves of the three best-ranked segments.
# The prints happen before the break check, so four segments are printed but only
# three are plotted.
i = 0
for key in persons_to_call_overall.keys():
    print(persons_to_call_overall[key]['results'][19])
    print(key[3], key[4], key[5])
    if i > 2:
        break
    plot_graph_new(persons_to_call_overall[key]['results'], 20, False, "Customer Segment " + str(i+1), "best_"+str(i))
    i += 1

In [None]:
persons_to_call_overall

In [None]:
# Locate the feature set(s) with exactly 956 rows, print their per-call results and
# plot them (empty title, default output name). `thi_fs` keeps the last matching key.
# NOTE(review): 956 is a hard-coded row count from one particular run — confirm.
for k in persons_to_call_overall:
    if persons_to_call_overall[k]['n_rows'] == 956:
        thi_fs = k
        for res in persons_to_call_overall[k]['results']:
            print(res)
        plot_graph_new(persons_to_call_overall[k]['results'], 20, False, "")

In [None]:
res = construct_hull_points(combs_to_consider[thi_fs]['results'], max_calls)

In [None]:
thi_fs

In [None]:
res

In [None]:
# Metric per education value on the current training fold, followed by the total number
# of feature-set combinations (attribute combinations x age ranges x balance ranges).
# Relies on all_combs, age_query_strings and balance_query_strings from other cells.
all_ed = ['primary', 'secondary', 'tertiary', 'unknown']
metric_vals = compute_metric_for_each_attribute(all_ed, train_df, 'education')
print(metric_vals)
print(len(all_combs)* len(age_query_strings) * len(balance_query_strings))

In [None]:
# Phase 5: best possible ordering. All successful customers are handled first, in
# ascending number of calls, followed by all unsuccessful ones; a cumulative
# (successes, calls) pair is recorded after every call count from 1 to 20.
print("Phase 5")
num_succ = 0
num_calls = 0

result_ratios_p5 = []
for outcome, counts_as_success in (("yes", True), ("no", False)):
    res_df = test_df.query("y == '{0}'".format(outcome))
    print(len(res_df))
    for i in range(1, 21):
        num_cust = len(res_df.query("campaign == {0}".format(i)))
        if counts_as_success:
            num_succ += num_cust
        num_calls += num_cust * i
        result_ratios_p5.append((num_succ, num_calls))
print(result_ratios_p5)

In [None]:
job_cmbs

In [None]:
optimal_choices

In [None]:
result_ratios_p2

In [None]:
num_calls

In [None]:
# optimal_choices is a list of (feature-set key, data) tuples sorted by gradient, so
# the gradient is read from the second element of each entry (indexing the list with
# the entry itself raises a TypeError).
for ch in optimal_choices:
    print(ch[1]['grad'])

In [None]:
housing_cmbs

In [None]:
# Select one feature set by its full key and reset its progress ('loc', 'finished') and
# the running totals, so the next cell can step through its hull segments from the start.
key = (('secondary', 'primary'),
  ('student', 'retired'),
  ('married', 'single', 'divorced'),
  ('no',),
  ('no', 'yes'),
  ('no',),
  'balance <= 450',
  'age >= 10 & age <= 34')
fs_pick[key]['loc'] = 0
fs_pick[key]['finished'] = False
print(fs_pick[key]['hull_points'])
total_s = 0
total_c = 0


In [None]:
# Manual single step for the feature set `key`: first recompute its gradient for the
# current hull segment, then place that segment's calls and advance 'loc'.
# Re-running this cell advances one segment at a time; total_s / total_c accumulate.
fs = fs_pick[key]
fs_results = combs_to_consider[key]['results']
hull_pts = fs['hull_points']
loc = fs['loc']
max_loc = fs['max_num_pts']
grad = 0.0
if loc <= max_loc:
    # Gradient = successes gained / calls spent between the previous hull point and
    # this one (or from the origin for the first segment).
    if loc == 0:
        grad = div(fs_results[hull_pts[loc]]['succ'], fs_results[hull_pts[loc]]['total_calls'])
    else:
        grad = div(fs_results[hull_pts[loc]]['succ'] - fs_results[hull_pts[loc-1]]['succ'] , fs_results[hull_pts[loc]]['total_calls'] - fs_results[hull_pts[loc-1]]['total_calls'])
    fs_pick[key]['grad'] = grad
else:
    fs_pick[key]['finished'] = True
print("At Loc:", loc)
print(grad)


fs = fs_pick[key]
if fs['finished'] == False:
    loc = fs['loc']
    # Inclusive range of call numbers covered by the current hull segment.
    # Presumably hull_points holds 0-based call indices, hence the +1 / +2 — TODO confirm.
    if loc == 0:
        call_start = 1
        call_end = fs['hull_points'][loc] + 1
    else:
        call_start = fs['hull_points'][loc-1] + 2
        call_end = fs['hull_points'][loc] + 1
    print("Start and end:", call_start, call_end)
    for call in range(call_start, call_end + 1, 1):
        # NOTE: this inner loop rebinds `loc`; the segment position is advanced through
        # fs_pick[key]['loc'] below.
        for loc, row in combs_to_consider[key]['fs_customers'].iterrows():
            # 'campaign' == call: one call, possibly a success; 'campaign' > call: one call only.
            if row['campaign'] == call:
                total_c += 1
                if row['y'] == "yes":
                    total_s += 1
            elif row['campaign'] > call:
                total_c += 1
    fs_pick[key]['loc'] += 1
print(total_s, total_c)

In [None]:
# Remove one specific feature set from combs_to_consider.
# Raises KeyError if the key is absent, e.g. when this cell is run a second time.
a = (('tertiary', 'unknown'), ('student', 'retired'), ('married', 'single', 'divorced'), ('no', 'yes'), ('no', 'yes'), ('yes', 'unknown'), 'balance <= 450', 'age >= 10 & age <= 34')
del combs_to_consider[a]

In [None]:
# Inspect one feature set: list its test customers, show its selection state, plot its
# expected ratio per call and build its hull points.
# key = (('secondary', 'primary'), ('unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur'), ('married', 'single', 'divorced'), ('no',), ('no', 'yes'), ('yes', 'unknown'), 'balance <= 450', 'age >= 35 & age <= 45')
key = (('secondary', 'primary'),
  ('student', 'retired'),
  ('married', 'single', 'divorced'),
  ('no',),
  ('no', 'yes'),
  ('no',),
  'balance <= 450',
  'age >= 10 & age <= 34')
for index, row in combs_to_consider[key]['fs_customers'].iterrows():
    print(row['y'], row['campaign'])
print(fs_pick[key])
plot_graph_new(combs_to_consider[key]['results'], max_calls, False, "Expected Ratio per Call")
print(combs_to_consider[key]['results'])
# Use the feature set selected above; the previous code read the unrelated `fs_key`
# left over from another cell.
# NOTE(review): elsewhere in this notebook construct_hull_points is called with
# max_calls as a second argument — confirm which signature is current.
res = construct_hull_points(combs_to_consider[key]['results'])

In [None]:
# Total successes and total calls over the whole test fold.
num_s = sum(1 for outcome in test_df['y'].tolist() if outcome == "yes")
num_c = sum(test_df['campaign'].tolist())
print(num_s, num_c)

In [None]:
# Total successes and total calls summed over the test customers of every feature set
# (sanity check against the totals of the whole test fold).
num_s = 0
num_c = 0
for fs_info in combs_to_consider.values():
    fs_customers = fs_info['fs_customers']
    num_s += sum(1 for outcome in fs_customers['y'].tolist() if outcome == "yes")
    num_c += sum(fs_customers['campaign'].tolist())
print(num_s, num_c)

# The cells that follow are used for checking/fine tuning purposes.

In [None]:
# Report which age range(s) a given age falls into.
# The range strings are written in pandas query syntax ('age >= 10 & age <= 34').
# Passing them to Python's eval() is wrong: in plain Python `&` binds tighter than the
# comparisons, so the string is read as `age >= (10 & age) <= 34` and e.g. age 45
# "matches" the 10-34 range. Evaluating through DataFrame.query applies the intended
# meaning.
age = 45
age_probe = pd.DataFrame({'age': [age]})
for age_q in age_query_strings:
    if not age_probe.query(age_q).empty:
        print("Yes!")
        print(age_q)

## Age and balance computation.

In [None]:
# Age.
# Success-per-call ratio, frame shape and query string for each age bucket.
# all_age_query_strings = ['age >= 10 & age <= 32', 'age >= 33 & age <= 40', 'age >= 50 & age <= 59', 'age >= 60']
# all_age_query_strings = ['age >= 10 & age <= 34', 'age >= 35 & age <= 45', 'age >= 46']
# all_age_query_strings = ['age < 26','age >= 26 & age <=60','age >60']
all_age_query_strings = ['age >= 10 & age <= 34', 'age >= 35 & age <= 45', 'age >= 46']
for age_query in all_age_query_strings:
    df_filtered_final = mkt_df_filtered.query(age_query)
    # Column-wise totals instead of a row-by-row loop.
    s = int((df_filtered_final['y'] == "yes").sum())
    c = int(df_filtered_final['campaign'].sum())
    # div() returns 0.0 for an empty bucket instead of raising ZeroDivisionError.
    print(div(s, c))
    print(df_filtered_final.shape)
    print(age_query)

In [None]:
# Age.
# Row count and compute_metric() value for each age bucket.
# The per-decade alternative is kept as a comment; it used to be assigned and then
# immediately overwritten by the line below it.
# all_age_query_strings = ['age >= 10 & age <= 19', 'age >= 20 & age <= 29', 'age >= 30 & age <= 39', 'age >= 40 & age <= 49', 'age >= 50 & age <= 59','age >= 60 & age <= 69', 'age >= 70 & age <= 79', 'age >= 80 & age <= 89', 'age >= 90 & age <= 100']
all_age_query_strings = ['age >= 10 & age <= 34', 'age >= 35 & age <= 45', 'age >= 46']
# age_query_strings = ['age >= 10 & age <= 33', 'age >= 34 & age <= 45', 'age >= 46']
for age_query in all_age_query_strings:
    df_filtered_final = mkt_df_filtered.query(age_query)
    print(df_filtered_final.shape[0])
    print(age_query, len(df_filtered_final), compute_metric(df_filtered_final))

In [None]:
# Age
# Metric per age decade. The (40, 50) range was missing from the list, leaving a gap
# between (30, 40) and (50, 60).
all_age_query_tuples = [(10, 20), (20, 30), (30, 40), (40, 50), (50, 60), (60, 70), (70, 80), (80, 90), (90, 100)]
ratios, all_age_query_strings = compute_metric_for_each_attribute_range(all_age_query_tuples, mkt_df_filtered, 'age')
print(ratios)
# find_combinations(all_age_query_strings, ratios)

## ---------------------------------------------------------------------------------

In [None]:
# Balance
# Row count and compute_metric() value for each balance bucket.
# The two finer-grained alternatives are kept as comments; they used to be assigned
# and then immediately overwritten. Their boundary slips are corrected here: the
# 19000 bucket was empty ('< 19000') and 20000 was covered by two buckets.
# all_bal_query_strings = ['balance >= -100000 & balance <= -1', 'balance >= 0 & balance < 1000', 'balance >= 1000 & balance < 2000', 'balance >= 2000 & balance < 3000', 'balance >= 3000 & balance < 4000','balance >= 4000 & balance < 5000', 'balance >= 5000 & balance < 6000', 'balance >= 6000 & balance < 7000', 'balance >= 7000 & balance < 8000', 'balance >= 8000 & balance < 9000', 'balance >= 9000 & balance < 10000','balance >= 10000 & balance < 11000', 'balance >= 11000 & balance < 12000', 'balance >= 12000 & balance < 13000', 'balance >= 13000 & balance < 14000', 'balance >= 14000 & balance < 15000', 'balance >= 15000 & balance < 16000', 'balance >= 16000 & balance < 17000','balance >= 17000 & balance < 18000', 'balance >= 18000 & balance < 19000', 'balance >= 19000 & balance < 20000', 'balance >= 20000']
# all_bal_query_strings = ['balance >= -100000 & balance <= -1', 'balance >= 0 & balance <= 2000', 'balance > 2000 & balance <= 4000', 'balance > 4000 & balance <= 6000', 'balance > 6000 & balance <= 8000', 'balance > 8000 & balance <= 10000', 'balance > 10000 & balance <= 12000', 'balance > 12000 & balance <= 14000', 'balance > 14000 & balance <= 16000', 'balance > 16000 & balance <= 18000', 'balance > 18000 & balance <= 20000' , 'balance > 20000']
all_bal_query_strings = ['balance >= -100000 & balance <= -1', 'balance >= 0 & balance <= 5000', 'balance > 5000 & balance <= 10000', 'balance > 10000 & balance <= 15000','balance > 15000 & balance <= 20000', 'balance >20000']
for bal_query in all_bal_query_strings:
    df_filtered_final = mkt_df_filtered.query(bal_query)
    print(bal_query, len(df_filtered_final), compute_metric(df_filtered_final))

In [None]:
# Balance
# Metric per balance range on the training fold. The (7000, 8000) range was missing
# from the list, leaving a gap between (6000, 7000) and (8000, 100000).
all_bal_query_tuples = [(-10000, 0), (0, 250), (250, 500), (500, 750), (750,1000), (1000, 2000), (2000, 3000), (3000, 4000), (4000, 5000), (5000, 6000), (6000, 7000), (7000, 8000), (8000, 100000)]
ratios, all_bal_query_strings = compute_metric_for_each_attribute_range(all_bal_query_tuples, train_df, 'balance')
find_combinations(all_bal_query_strings, ratios)

## Determining where to stop regarding the number of calls.

In [None]:
# This is to determine the maximum number of calls we should stop at!
# For every exact call count 1..56, compute successes / calls among the customers who
# received exactly that many calls.
all_ratios_calls = []
for i in range(1,57):
    query_str = 'campaign == ' + str(i)
    call_query_data = mkt_df_filtered.query(query_str)
    succ = 0
    calls = 0
    for lc, rw in call_query_data.iterrows():
        if rw['y'] == "yes":
            succ += 1
        calls += rw['campaign']
    all_ratios_calls.append(div(succ, calls))
for index, value in enumerate(all_ratios_calls):
    print(index+1, value)
# plot_graph_new requires a title argument; the call previously omitted it and raised
# a TypeError.
plot_graph_new(all_ratios_calls, 56, True, "Success Per Call Ratio by Number of Calls")

## Improving the success rate by optimizing the maximum calls made.

In [None]:
mkt_df_filtered.head()

In [None]:
# Success-per-call ratio if every customer were called at most i times (i = 1..20):
#   successes = customers with y == "yes" reached within i calls,
#   calls     = sum over customers of min(i, calls actually made).
max_calls_considered = 20

current_dir = os.getcwd()
mkt_df = load_file(os.path.join(current_dir, 'bank-full.csv'))
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls_considered)]

# Pull the two columns out once; each cap is then a pair of vectorised sums instead of
# a full row-by-row pass over the frame.
campaign_calls = mkt_df_filtered['campaign'].to_numpy()
is_success = (mkt_df_filtered['y'] == "yes").to_numpy()

result_ratios = [0.0 for i in range (1,max_calls_considered+1)]

for i in range(1, max_calls_considered+1):
    total_succ = int((is_success & (campaign_calls <= i)).sum())
    total_calls = int(np.minimum(campaign_calls, i).sum())
    result_ratios[i-1] = div(total_succ , total_calls)
    print(i, result_ratios[i-1], total_succ, total_calls)

In [None]:
plot_graph_new(result_ratios, 20, True, "Ratio Per Call #")

In [None]:
# Reload the raw CSV from the working directory and keep customers with between 1 and
# `max_calls` calls.
# NOTE(review): this uses `max_calls`, which is defined in another cell, rather than
# the `max_calls_considered` used by the similar load a few cells earlier — confirm
# both hold the same value.
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
mkt_df_filtered = mkt_df[(mkt_df['campaign']>=1) & (mkt_df['campaign']<=max_calls)]
mkt_df_filtered.shape

In [None]:
mkt_df_filtered.head(n=10)

In [None]:
mkt_df_filtered['poutcome'].value_counts()

In [None]:
mkt_df_filtered_successes = mkt_df_filtered.query("poutcome == 'success'")
print(mkt_df_filtered_successes.shape)
mkt_df_filtered_successes['previous'].value_counts()

In [None]:
mkt_df_filtered_successes['poutcome'].value_counts()

In [None]:
# Frequency of each 'campaign' value among the rows of mkt_df_filtered_successes.
res = mkt_df_filtered_successes['campaign'].value_counts(normalize = False)
print(res)
print(res.values)
# Hard-coded counts for call numbers 1..20 (they sum to 5285), plotted in the next cell.
# NOTE(review): these are typed in by hand rather than derived from `res` — confirm
# which query they were copied from and regenerate them if the data changes.
num_succ = [2561, 1401 , 618, 317, 139, 92, 47, 32, 21, 14, 16, 4, 6, 4, 4, 2, 6, 0, 0 ,1]

In [None]:
plot_graph_new(num_succ, 20, True, "Frequency of Contacts Made per Call #")

In [None]:
# Share of customers with poutcome == 'success' who also have y == 'yes' (anb / b).
# `a` (number of y == 'yes' rows) is computed but not used in this cell.
a = mkt_df_filtered.query("y == 'yes'").shape[0]
b = mkt_df_filtered.query("poutcome == 'success'").shape[0]
anb = mkt_df_filtered.query("y == 'yes' and poutcome == 'success'").shape[0]
print(anb/b)

In [None]:
# Histogram of 'duration' in 60-unit buckets: for each bucket print its upper bound,
# the number of rows and the number of rows with y == 'yes'.
# Buckets are half-open [lower, upper), so a boundary value is counted exactly once.
# The first bucket starts at 0 (it used to be [-60, 0]), and the bucket count is
# derived from the data so that the maximum duration is included.
bucket_width = 60
num_buckets = int(mkt_df_filtered['duration'].max() / bucket_width) + 1
for i in range(bucket_width, num_buckets * bucket_width + 1, bucket_width):
    res = mkt_df_filtered.query("duration >= {0} and duration < {1}".format(str(i-bucket_width), str(i)))
    print(i, res.shape[0], res.query("y == 'yes'").shape[0])
# a = mkt_df_filtered.query("duration >= 0 and duration <= 180").shape[0]
# b = mkt_df_filtered.query("y == 'yes'").shape[0]
# anb = mkt_df_filtered.query("y == 'yes' and duration >= 0 and duration <= 1000").shape[0]
# print(anb/b)

In [None]:
# int(mkt_df_filtered['duration'].max()/60)+1
int(mkt_df_filtered['duration'].max()/60) + 1

In [None]:
# Number of successful customers reached by cellular (a) and by telephone (b), and the
# total number of successful customers (c).
a = mkt_df_filtered.query("y == 'yes' and contact == 'cellular'").shape[0]
b = mkt_df_filtered.query("y == 'yes' and contact == 'telephone'").shape[0]
c = mkt_df_filtered.query('y == "yes"').shape[0]
print(a, b, c)

## Irritability

In [None]:
# Success-per-call ratio restricted to customers who received at most i calls
# (i = 1..20). Computed with vectorised sums instead of twenty row-by-row passes.
campaign_calls = mkt_df_filtered['campaign'].to_numpy()
is_success = (mkt_df_filtered['y'] == "yes").to_numpy()

succ_ratios = []
for i in range(1,21):
    within_limit = campaign_calls <= i
    num_calls = int(campaign_calls[within_limit].sum())
    num_succ = int((is_success & within_limit).sum())
    succ_ratios.append(div(num_succ, num_calls))
succ_ratios

In [None]:
plt.plot([i for i in range (1,21)], succ_ratios)

In [None]:
# For every call limit i = 1..20: average duration per customer (over ALL customers)
# contributed by those with at most i calls, and the number of successes among them.
time_taken_all = []
successes_all = []
for i in range(1,21):
    time_taken = 0
    num_succ = 0
    num_ppl = 0
    for loc, row in mkt_df_filtered.iterrows():
        num_ppl += 1
        if row['campaign'] <= i:
            time_taken += int(row['duration'])
            if row['y'] == "yes":
                num_succ += 1
    time_taken_all.append(time_taken/num_ppl)
    # The following cells plot and difference successes_all, so it has to be filled
    # here; with the append commented out they fail on a fresh run.
    successes_all.append(num_succ)

In [None]:
plt.plot([i for i in range(1,21)], successes_all)
plt.show()

In [None]:
plt.plot([i for i in range(1,21)], time_taken_all)

In [None]:
time_taken_all

In [None]:
time_taken_all

In [None]:
# Turn the cumulative success counts into per-call increments: the first entry is
# kept, every other entry is the difference to its predecessor.
new_succ = [successes_all[0]] + [successes_all[k] - successes_all[k-1] for k in range(1, 20)]
new_succ

In [None]:
# Turn the cumulative durations into per-call increments, then divide each by 60.
duration_steps = [time_taken_all[0]] + [time_taken_all[k] - time_taken_all[k-1] for k in range(1, 20)]
new_dur = [step / 60 for step in duration_steps]
new_dur

In [None]:
# For every exact call count i = 1..20: total duration (each row's duration divided by
# 60 and truncated to an integer) and number of successes among the customers with
# exactly i calls.
time_taken_all = []
successes_all = []

for i in range(1,21):
    rows_with_i_calls = mkt_df_filtered[mkt_df_filtered['campaign'] == i]
    time_taken = sum(int(duration/60) for duration in rows_with_i_calls['duration'].tolist())
    num_succ = sum(1 for outcome in rows_with_i_calls['y'].tolist() if outcome == "yes")
    time_taken_all.append(time_taken)
    successes_all.append(num_succ)


In [None]:
time_taken_all

In [None]:
successes_all

In [None]:
# Successes per unit of time for each call number (0.0 where no time was recorded).
res = [div(successes_all[k], time_taken_all[k]) for k in range(0, 20)]
res

In [None]:
plt.plot([i for i in range(1,21)], res)

In [None]:
# BACKUP

%%time

# Main code ... orchestrates everything!

# Splitting dataframe into data and result dataframes.
X = mkt_df_filtered.iloc[:,0:len(mkt_df_filtered.columns)-1]
y = mkt_df_filtered.iloc[:,-1]

cut_points = [0]

for cp in cut_points:

    for j in range(1,2):
        
        phase_batch = {}
        kf = KFold(n_splits=5, shuffle=True)
        i = 0
        for train_index, test_index in kf.split(X):
            i += 1

            train_df = mkt_df_filtered.iloc[train_index]
            test_df = mkt_df_filtered.iloc[test_index]

            # At this point, we can run computations for the success rate of each sub attribute and join
            # the sub-attributes based on the output of k-means.
            poss = []

            # Education.
            all_ed = ['tertiary', 'secondary', 'primary', 'unknown']
            metric_vals = compute_metric_for_each_attribute(all_ed, train_df, 'education')
            education_cmbs = find_combinations(all_ed, metric_vals)

            # Occupation.
            all_jobs = ['student', 'retired', 'unemployed', 'admin.', 'management', 'self-employed', 'technician', 'unknown', 'services', 'housemaid', 'blue-collar', 'entrepreneur']
            metric_vals = compute_metric_for_each_attribute(all_jobs, train_df, 'job')
            job_cmbs = find_combinations(all_jobs, metric_vals)

            # Marital.
            all_ms = ['married', 'single', 'divorced', 'unknown']
            metric_vals = compute_metric_for_each_attribute(all_ms, train_df, 'marital')
            marital_cmbs = find_combinations(all_ms, metric_vals)

            # Default
            all_def = ['no', 'yes', 'unknown']
            metric_vals = compute_metric_for_each_attribute(all_def, train_df, 'default')
            default_cmbs = find_combinations(all_def, metric_vals)

            # Loan
            all_ln = ['no', 'yes', 'unknown']
            metric_vals = compute_metric_for_each_attribute(all_ln, train_df, 'loan')
            loan_cmbs = find_combinations(all_ln, metric_vals)

            # Housing
            all_hs = ['no', 'yes', 'unknown']
            metric_vals = compute_metric_for_each_attribute(all_hs, train_df, 'housing')
            housing_cmbs = find_combinations(all_hs, metric_vals)

            poss.append(education_cmbs)
            poss.append(marital_cmbs)
            poss.append(job_cmbs)
            poss.append(default_cmbs)
            poss.append(loan_cmbs)
            poss.append(housing_cmbs)
            all_combs = list(itertools.product(*poss))

            # print("Number of combinations: ", len(all_combs)* len(age_query_strings) * len(balance_query_strings))

            # We can now go ahead and genreate the feature sets based on what was done previously.
            num_iter = 0
            combs_to_consider = {}
            fs_pick = {}

            # Setting up looping structures to generate all possibilities.
            for age_query in age_query_strings:
                df_filtered_final = train_df.query(age_query)
                for bal_query in balance_query_strings:
                    df_filtered_final_2 = df_filtered_final.query(bal_query)
                    for comb in all_combs:
                        dict_final_query = construct_dict(comb)
                        num_iter += 1
                        extracted_df = extract_rows_feature_set(df_filtered_final_2, dict_final_query)
                        key = (dict_final_query['education'], dict_final_query['job'], 
                               dict_final_query['marital'], dict_final_query['default'], 
                               dict_final_query['loan'], dict_final_query['housing'], 
                               bal_query, age_query)
                        n_rows = extracted_df.shape[0]
                        if n_rows > cp:
                            results = compute_expected_succ_per_call_rate_feature_set(extracted_df, max_calls)
                            max_loc = compute_optimal_call_no(results)
                            if max_loc != -1:
                                combs_to_consider[key] = {
                                                            'max_loc':max_loc + 1,
                                                            'best_rate':results[max_loc]['expected'], 
                                                            'overall_rate':results[max_calls-1]['expected'], 
                                                            'n_rows':n_rows, 
                                                            'results':results,
                                                            'fs_customers':None,
                                                            'valid': True
                                                         }
                                fs_pick[key] = {'current_ratio': 0.0, 'call_num':0, 'finished':False}
                            else:
                                print("Invalid FS ! -> ", n_rows)
            
            for fs_key in combs_to_consider.keys():
                fs_customers = find_all_cust_feature_set(fs_key, test_df)
                combs_to_consider[fs_key]['fs_customers'] = fs_customers

            
            # Testing Phase 1: Baseline test with shuffling of customers.
            # Customers are visited in a random order. Before each customer is processed,
            # a (successes, calls) snapshot is recorded if the running call total has
            # reached the current checkpoint; at most one checkpoint is consumed per customer.

            # Checkpoints: every 50 calls from 50 to 450, then every 500 calls from 500 to 139500.
            call_check_points = list(range(50, 500, 50)) + list(range(500, 140000, 500))
            result_ratios_p1 = []
            cp_loc = 0
            num_succ = 0
            num_calls = 0
            res = test_df.reindex(np.random.permutation(test_df.index))
            for loc, row in res.iterrows():
                # The bounds check keeps cp_loc from indexing past the final checkpoint
                # (which would raise IndexError) when the call total exceeds it.
                if cp_loc < len(call_check_points) and num_calls >= call_check_points[cp_loc]:
                    cp_loc += 1
                    result_ratios_p1.append((num_succ, num_calls))
                num_calls += row['campaign']
                if row['y'] == "yes":
                    num_succ += 1
            # Always record the final totals once every customer has been processed.
            result_ratios_p1.append((num_succ, num_calls))


            # Testing Phase 2: Order how we call customers based on the overall s/c ratio.
            # In this instance we call everyone.

            # Feature sets sorted by 'overall_rate', highest first (dicts keep insertion order).
            persons_to_call_overall = dict(
                sorted(
                    combs_to_consider.items(),
                    key=lambda fs: fs[1]['overall_rate'],
                    reverse=True,
                )
            )
            result_ratios_p2 = []
            result_ratios = []  # not read in this phase; kept in case other code relies on it
            num_succ, num_calls = 0, 0
            for key, fs_entry in persons_to_call_overall.items():
                for loc, cust in fs_entry['fs_customers'].iterrows():
                    num_calls += cust['campaign']
                    if cust['y'] == "yes":
                        num_succ += 1
                # One cumulative (successes, calls) snapshot per feature set.
                result_ratios_p2.append((num_succ, num_calls))


            # Testing Phase 3: Order how we call customers based on the maximum s/c ratio.
            # The maximum s/c ratio is linked to a call number. Use this to determine number of successes.

#             persons_to_call_max = {k: v for k, v in sorted(combs_to_consider.items(), key=lambda fs: fs[1]['best_rate'], reverse = True)}
#             num_succ = 0
#             num_calls = 0
#             result_ratios_p3 = []
#             # print(type(persons_to_call_max))
#             result_ratios_p3 = []
#             for key in persons_to_call_max.keys():
#                 max_calls_fs = persons_to_call_max[key]['max_loc']
#                 for loc, cust in persons_to_call_max[key]['fs_customers'].iterrows():
#                     cust_calls = cust['campaign']
#                     if cust_calls <= max_calls_fs:
#                         num_calls += cust_calls
#                         if cust['y'] == "yes":
#                             num_succ +=1
#                     else:
#                         num_calls += max_calls_fs
#                 result_ratios_p3.append((num_succ, num_calls))


            # Testing Phase 4: Order how we call customers from feature sets.
            # Use the approach of calculating gradient at each call point and use this to order
            # how the calls are made.

            result_ratios_p4 = []
            total_s = 0
            total_c = 0

            for k in range(1, 21):
                # Step 1: advance every feature set by one call position and refresh its
                # marginal success/call ratio (difference between consecutive result entries).
                for key, fs_entry in combs_to_consider.items():
                    fs_ref = fs_entry['results']
                    pick_state = fs_pick[key]
                    pos = pick_state['call_num']
                    if pos >= 20:
                        pick_state['finished'] = True
                        continue
                    pick_state['call_num'] += 1
                    if pos == 0:
                        # First position: nothing earlier to subtract.
                        pick_state['current_ratio'] = div(fs_ref[pos]['succ'], fs_ref[pos]['total_calls'])
                    else:
                        pick_state['current_ratio'] = div(
                            fs_ref[pos]['succ'] - fs_ref[pos-1]['succ'],
                            fs_ref[pos]['total_calls'] - fs_ref[pos-1]['total_calls'],
                        )

                # Step 2: visit feature sets from the highest current ratio to the lowest.
                optimal_choices = dict(
                    sorted(fs_pick.items(), key=lambda val: val[1]['current_ratio'], reverse=True)
                )

                for key, choice in optimal_choices.items():
                    if choice['finished'] == False:
                        target_call = choice['call_num']
                        for loc, row in combs_to_consider[key]['fs_customers'].iterrows():
                            hit_exactly = row['campaign'] == target_call
                            # A call is counted for every customer whose 'campaign' value is
                            # at or beyond the current call number.
                            if hit_exactly or row['campaign'] > target_call:
                                total_c += 1
                            # A success is counted only when 'campaign' equals the call number.
                            if hit_exactly and row['y'] == "yes":
                                total_s += 1
                        result_ratios_p4.append((total_s, total_c, k))

#             result_ratios_p4 = []
#             total_s = 0
#             total_c = 0

#             for k in range(1,21):
#                 for key in combs_to_consider.keys():
#                     fs_ref = combs_to_consider[key]['results']
#                     # Sort of a patch to the results so subtracting would be easier.
#                     fs_ref.insert(0, {'succ':0, 'total_calls':0, 'expected':0.0})
#                     e_pos = fs_pick[key]['end'] # This should initially be 1
#                     s_pos = fs_pick[key]['end'] - 1 # This should initially be 0
#                     curr_ratio = 0.0
#                     if e_pos < 21:
#                         curr_ratio = div((fs_ref[e_pos]['succ'] - fs_ref[s_pos]['succ']), (fs_ref[e_pos]['total_calls'] - fs_ref[s_pos]['total_calls']))
#                         if curr_ratio != 0.0:
#                             e_pos += 1
#                             s_pos += 1
#                         else:
#                             while e_pos < 21 and curr_ratio == 0.0:
#                                 e_pos += 1
#                                 curr_ratio = div((fs_ref[e_pos]['succ'] - fs_ref[s_pos]['succ']), (fs_ref[e_pos]['total_calls'] - fs_ref[s_pos]['total_calls']))
#                         fs_pick[key]['end'] = e_pos
#                         fs_pick[key]['start'] = s_pos
#                         fs_pick[key]['current_ratio'] = curr_ratio
#                         if e_pos >= 21:
#                             fs_pick[key]['finished'] = True
#                     else:
#                         fs_pick[key]['finished'] = True

#                 optimal_choices = {k: v for k, v in sorted(fs_pick.items(), key=lambda val: val[1]['current_ratio'], reverse = True)}
# #                 print(optimal_choices)
#                 for key in optimal_choices.keys():
#                     if optimal_choices[key]['finished'] == False:
#                         start = optimal_choices[key]['start']
#                         end = optimal_choices[key]['end']
#                         for s in range(start, end, 1):
#                             for loc, row in combs_to_consider[key]['fs_customers'].iterrows():
#                                 if row['campaign'] == s:
#                                     total_c += 1
#                                     if row['y'] == "yes":
#                                         total_s += 1
#                                 elif row['campaign'] > s:
#                                     total_c += 1
#                         result_ratios_p4.append((total_s, total_c, k))
                    
            
            # Testing Phase 5: Using a Decision Tree.
            # Use the copy of the mkt dataframe for this portion.
            
#             result_ratios_p5 = []
#             total_succ = 0
#             total_calls = 0
#             cp_loc = 0
            
#             train_x = train_df_cpy.iloc[:,0:len(train_df_cpy.columns)-1]
#             train_y = train_df_cpy.iloc[:,-1]

#             test_x = test_df_cpy.iloc[:,0:len(test_df_cpy.columns)-1]
#             test_y = test_df_cpy.iloc[:,-1]
            
#             clf = DecisionTreeClassifier()
#             clf = clf.fit(train_x, train_y)
#             test_y_pred = clf.predict(test_x)
            
#             for index, outcome in enumerate(test_y_pred):
#                 num_calls_made = int(test_x.iloc[index]['campaign'])
#                 actual_result = int(test_y.iloc[index]) == 1
#                 if total_calls >= call_check_points[cp_loc]:
#                     cp_loc += 1
#                     result_ratios_p5.append((total_succ, total_calls))
#                 if outcome == 1:
#                     total_calls += num_calls_made
#                     if actual_result:
#                         total_succ += 1
#             result_ratios_p5.append((total_succ, total_calls))
            
            # Final part: store this run's per-phase results in phase_batch,
            # keyed by str(j) + "_" + str(i).

            phase_batch_key = "_".join((str(j), str(i)))
            phase_batch[phase_batch_key] = dict(
                p1=result_ratios_p1,
                p2=result_ratios_p2,
                p4=result_ratios_p4,
            )
            
    # Write everything collected in phase_batch to a JSON file whose name embeds cp.
    results_path = 'idk_results_cp_' + str(cp) + '.json'
    with open(results_path, 'w') as fp:
        json.dump(phase_batch, fp)

In [None]:
# Display the current value of job_cmbs as this cell's output.
# NOTE(review): job_cmbs is presumably defined in an earlier cell — confirm it exists on a fresh kernel.
job_cmbs