In [1]:
import os
import math
import random
import operator
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import math

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from operator import itemgetter

In [2]:
def load_file(data_file_path):
    """Read a semicolon-separated CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_file_path : location of the CSV file, forwarded as-is to ``pd.read_csv``.

    Returns
    -------
    The DataFrame that ``pd.read_csv`` builds from the file.
    """
    return pd.read_csv(data_file_path, sep=";")

In [3]:
def compute_ratio(pos, neg):
    """Return ``pos / neg``, falling back to ``0.0`` when ``neg`` equals zero."""
    return pos / neg if neg != 0 else 0.0

In [4]:
def visualize_value_calls_graph(mapping, key):
    """Plot the per-call value of one feature set and print its summary.

    Parameters
    ----------
    mapping : dict with the entries
        'probs'            - list of per-call dicts, each holding a 'value',
        'k_optimal'        - call number separating the green and red regions,
        'total'            - printed as the number of people in the feature set,
        'expected_optimal' - printed as the expected value per customer.
    key : identifier of the feature set; only used in the plot title.

    The x axis runs from 1 to ``len(mapping['probs'])`` so the plot matches
    however many calls were tracked, instead of assuming exactly ten.
    """
    num_calls = len(mapping['probs'])

    # x and y values to plot.
    x = np.arange(1, num_calls + 1, 1)
    y = [item['value'] for item in mapping['probs']]
    fig, ax = plt.subplots()
    ax.plot(x, y)

    # Setting title and labels.
    title = "Potential Value Against the Number of Calls for: " + str(key)
    ax.set(xlabel='Number of Calls', ylabel='Value (P(k)*v) - (k*c)', title=title)

    # Separating the profitable and non profitable call numbers.
    ax.axvspan(0, mapping['k_optimal'], color='green', alpha=0.5)
    ax.axvspan(mapping['k_optimal'], num_calls, color='red', alpha=0.5)
    ax.grid()

    print("Total People in Feature Set: ", mapping['total'])
    print("Optimal k: ", mapping['k_optimal'])
    print("Expected Value Per Customer: ", mapping['expected_optimal'])
    print("Probabilities: ")
    for item in mapping['probs']:
        print(item)
        print("----------- \n")
    plt.show()

In [5]:
def build_compute_probs2(mkt_df, calls_end):
    """Count outcomes per feature set and call number, then derive success rates.

    Rows whose 'job', 'marital' or 'education' equals "unknown" are dropped.
    The remaining rows are bucketed by age decade (10-19 is group 1, ...,
    90-99 is group 9) and collected under the key
    ``(marital, job, education, age_group)``. Only rows with
    ``1 <= campaign <= calls_end`` are counted.

    Parameters
    ----------
    mkt_df : DataFrame with the columns 'age', 'job', 'marital', 'education',
        'campaign' and 'y'. A row counts as a success when its 'y' equals 1.
    calls_end : largest call number tracked for each feature set.

    Returns
    -------
    dict mapping every key to a dict with
        'probs' - list of ``calls_end`` dicts ('y_count', 'n_count',
                  'no_calls', 'prob', 'value'); 'value' is left at 0.0 here,
        'total' - number of rows counted for the key.
    The 'prob' of call k is the 'y_count' at k divided by the number of counted
    rows whose call number is k or higher (0.0 when no such rows remain).
    """
    age_st = 10
    age_end = 100
    age_inc = 10

    # Before anything can begin, we need to remove the rows with undefined entries.
    known_df = mkt_df[(mkt_df['job'] != "unknown")
                      & (mkt_df['marital'] != "unknown")
                      & (mkt_df['education'] != "unknown")]

    # Dictionary to store all the results.
    all_fs = {}

    # The group number advances for every age bucket, even an empty one.
    for age_range_group, age_lo in enumerate(range(age_st, age_end, age_inc), start=1):
        # Subset of rows where age_lo <= age < age_lo + age_inc.
        bucket = known_df[(known_df['age'] >= age_lo) & (known_df['age'] < age_lo + age_inc)]
        rows = zip(bucket['job'], bucket['marital'], bucket['education'],
                   bucket['campaign'], bucket['y'])
        for job, marital, education, no_calls, outcome in rows:
            if not 1 <= no_calls <= calls_end:
                continue
            key = (str(marital), str(job), str(education), age_range_group)
            if key not in all_fs:
                all_fs[key] = {'probs': [
                    {'y_count': 0, 'n_count': 0, 'no_calls': call_no, 'prob': 0.0, 'value': 0.0}
                    for call_no in range(1, calls_end + 1)
                ]}
            slot = all_fs[key]['probs'][no_calls - 1]
            if outcome == 1:
                slot['y_count'] += 1
            else:
                slot['n_count'] += 1

    # Compute probabilities.
    for entry in all_fs.values():
        probs = entry['probs']
        remaining = sum(slot['y_count'] + slot['n_count'] for slot in probs)
        entry['total'] = remaining
        for slot in probs:
            # Successes at this call relative to everyone who reached this call.
            slot['prob'] = compute_ratio(slot['y_count'], remaining)
            remaining -= slot['y_count'] + slot['n_count']
    return all_fs

In [26]:
def compute_optimal_call(train_data, no_calls_accepted, value, cost):
    """Choose the best maximum number of calls for every feature set.

    For call number k (1-based) the marginal gain is
    ``prob_k * (value / cost) - k``. The optimal call count is the k whose
    cumulative gain over calls 1..k is largest (the first such k on ties).

    Parameters
    ----------
    train_data : dict of feature-set entries; each entry holds a 'probs' list
        whose items carry 'y_count', 'n_count' and 'prob'. Updated in place.
    no_calls_accepted : number of leading 'probs' items to consider.
    value : amount credited per successful row.
    cost : amount charged per call; a zero cost raises ZeroDivisionError.

    Returns
    -------
    The same ``train_data`` object, where every entry additionally has
        'k_optimal'        - the chosen call count,
        'expected_optimal' - sum of ``value_k * prob_k`` over calls up to
                             'k_optimal', divided by the number of rows in
                             those calls (0.0 when there are none),
    and every considered 'probs' item has 'value' set to
    ``y_count * value - (y_count + n_count) * cost * k``.
    """
    # Computing the value to cost ratio.
    value_cost_ratio = value / cost

    # Compute the optimal value of k for every feature set in the dataset.
    for entry in train_data.values():
        probs = entry['probs']

        # Running total of the marginal gains; one pass instead of re-summing per call.
        cumulative_gain = []
        running_gain = 0.0
        for call_no in range(1, no_calls_accepted + 1):
            ref = probs[call_no - 1]
            ref['value'] = ((ref['y_count'] * value)
                            - ((ref['n_count'] + ref['y_count']) * cost * call_no))
            running_gain += (ref['prob'] * value_cost_ratio) - call_no
            cumulative_gain.append(running_gain)
        entry['k_optimal'] = cumulative_gain.index(max(cumulative_gain)) + 1

        # Compute the expected value from this feature set using our optimal call number.
        sum_expected_optimal = 0.0
        total = 0
        for ref in probs[:entry['k_optimal']]:
            total += ref['y_count'] + ref['n_count']
            sum_expected_optimal += ref['value'] * ref['prob']
        entry['expected_optimal'] = sum_expected_optimal / total if total else 0.0
    return train_data

In [27]:
def compute_overall_gain(mkt_df_new, mkt_df_old, value, cost, no_calls_accepted=10):
    """Print the total value of the optimised strategy and of the recorded calls.

    Three lines are printed:
      1. the sum over all feature sets of ``total * expected_optimal``,
      2. the value of the recorded data: for each row with
         ``1 <= campaign <= no_calls_accepted`` add ``value - campaign * cost``
         when 'y' equals 1, otherwise ``-campaign * cost``,
      3. the number of such rows whose 'y' equals 1.

    Parameters
    ----------
    mkt_df_new : dict of feature-set entries carrying 'total' and
        'expected_optimal'.
    mkt_df_old : DataFrame with the columns 'campaign' and 'y'.
    value : amount credited per successful row.
    cost : amount charged per call.
    no_calls_accepted : upper bound on 'campaign' for rows included in the
        recorded-data total; defaults to 10.
    """
    # Using our optimized method.
    total_value_new_method = 0.0
    for ref in mkt_df_new.values():
        total_value_new_method += (ref['total'] * ref['expected_optimal'])
    print(total_value_new_method)

    # Using the given data.
    in_window = (mkt_df_old['campaign'] >= 1) & (mkt_df_old['campaign'] <= no_calls_accepted)
    data = mkt_df_old[in_window]
    num_yes = 0
    total_value_old_method = 0.0
    # Accumulate row by row, in row order, so the printed total is reproducible.
    for no_calls, outcome in zip(data['campaign'], data['y']):
        if outcome == 1:
            num_yes += 1
            total_value_old_method += (value - (no_calls * cost))
        else:
            total_value_old_method += -(no_calls * cost)
    print(total_value_old_method)
    print("Total yes is : ", num_yes)

In [28]:
def driver(mkt_df_old, no_calls_accepted, value, cost):
    """Run the whole analysis on one marketing DataFrame and print the results.

    Steps: encode the 'y' column, build per-feature-set call statistics,
    pick the optimal call count for each feature set, then print the overall
    comparison via ``compute_overall_gain``.

    Parameters
    ----------
    mkt_df_old : DataFrame with the raw marketing data. Its 'y' column is
        replaced in place by the ``LabelEncoder`` codes.
    no_calls_accepted : largest call number tracked per feature set.
    value : amount credited per successful row.
    cost : amount charged per call.

    Note: as called here, ``compute_overall_gain`` evaluates the recorded data
    over rows with 1 to 10 calls, independent of ``no_calls_accepted``.
    """
    # Encoding the y values.
    # Downstream code tests ``y == 1``; presumably 'no'/'yes' encode to 0/1 - verify against the data.
    enc = LabelEncoder()
    mkt_df_old['y'] = enc.fit_transform(mkt_df_old['y'])

    # Construct dictionary of keys and compute probabilities.
    # Use the frame passed in, not a notebook-level global.
    mkt_df_new = build_compute_probs2(mkt_df_old, no_calls_accepted)

    # For each key, compute the gain for each call and the optimal call number.
    mkt_df_new = compute_optimal_call(mkt_df_new, no_calls_accepted, value, cost)

    # Print the total value of the optimised strategy next to that of the recorded data.
    compute_overall_gain(mkt_df_new, mkt_df_old, value, cost)

In [29]:
# Run the analysis on bank-full.csv, expected in the current working directory.
NO_CALLS_ACCEPTED = 10   # largest number of calls considered per customer
VALUE_PER_SUCCESS = 10   # amount credited for each successful customer
COST_PER_CALL = 0.1      # amount charged for every call made

current_dir = os.getcwd()
mkt_df = load_file(os.path.join(current_dir, 'bank-full.csv'))
driver(mkt_df, NO_CALLS_ACCEPTED, VALUE_PER_SUCCESS, COST_PER_CALL)

5141.818456798122
41936.1000000028
Total yes is :  5242
