In [4]:
import os
import math
import random
import operator
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from operator import itemgetter

In [5]:
def load_file(data_file_path):
    """Read a semicolon-separated CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_file_path : str or path-like
        Location of the CSV file; fields must be separated by ``;``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, one row per CSV record.
    """
    return pd.read_csv(data_file_path, sep=";")

def compute_ratio(num_yes, num_no):
    """Return the share of "yes" outcomes among all observed outcomes.

    Parameters
    ----------
    num_yes : int
        Count of positive outcomes.
    num_no : int
        Count of negative outcomes.

    Returns
    -------
    float
        ``num_yes / (num_yes + num_no)``, or ``0.0`` when both counts are
        zero (nothing was observed, so there is no ratio to compute).
    """
    if num_yes != 0 or num_no != 0:
        return num_yes / (num_yes + num_no)
    # No observations at all: report 0.0 rather than dividing by zero.
    return 0.0

In [6]:
def find_feature_set_calls(train_data, lookup_key, no_calls):
    """Collect the entries of one feature set for call numbers 1..no_calls.

    Parameters
    ----------
    train_data : dict
        Mapping keyed by 5-tuples whose fourth element (index 3) is a call
        number.
    lookup_key : tuple
        Key that must be present in ``train_data`` for anything to be
        returned. Its elements at positions 0, 1, 2 and 4 identify the
        feature set; position 3 is replaced by each call number in turn.
    no_calls : int
        Highest call number to look up (inclusive).

    Returns
    -------
    list or None
        ``None`` when ``lookup_key`` is not in ``train_data``; otherwise the
        values stored for call numbers ``1`` through ``no_calls`` in order.

    Raises
    ------
    KeyError
        If ``lookup_key`` is present but one of the derived per-call keys is
        not.
    """
    if lookup_key not in train_data:
        return None
    return [
        train_data[(lookup_key[0], lookup_key[1], lookup_key[2], call_no, lookup_key[4])]
        for call_no in range(1, no_calls + 1)
    ]

In [7]:
def compute_probs2(indicies, mkt_df, calls_end):
    """Estimate, per feature set and call number, the empirical success ratio.

    Rows are selected positionally from ``mkt_df``, rows whose ``job``,
    ``marital`` or ``education`` equals ``"unknown"`` are dropped, and the
    remaining rows are grouped by ``(marital, job, education, age group)``.
    For every group the function counts, for each call number
    ``1..calls_end`` (column ``campaign``), how many rows have ``y == 1`` and
    how many do not, then stores ``compute_ratio`` of the two counts.

    Parameters
    ----------
    indicies : array-like of int
        Positional row indices (as accepted by ``DataFrame.iloc``) to use.
    mkt_df : pandas.DataFrame
        Must provide the columns ``age``, ``job``, ``marital``,
        ``education``, ``campaign`` and ``y``. ``y`` is compared against the
        integer ``1``, so it is expected to be numerically encoded already.
    calls_end : int
        Largest call number to track; rows with ``campaign`` outside
        ``1..calls_end`` are ignored.

    Returns
    -------
    dict
        ``{(marital, job, education, age_group): {'probs': [slot, ...]}}``
        where each ``slot`` is a dict with the keys ``y_count``, ``n_count``,
        ``no_calls``, ``prob`` and ``value`` (``value`` is left at ``0.0``
        here). ``age_group`` is 1 for ages 10-19, 2 for ages 20-29, and so on
        up to 9 for ages 90-99; ages outside 10-99 are not counted.
    """
    age_st = 10
    age_end = 100
    age_inc = 10

    data_df = mkt_df.iloc[indicies, :]

    # Before anything can begin, remove the rows with undefined entries.
    has_known_features = (
        (data_df['job'] != "unknown")
        & (data_df['marital'] != "unknown")
        & (data_df['education'] != "unknown")
    )
    cleaned_mkt_df = data_df[has_known_features]

    all_fs = {}
    for age_range_group, age_lo in enumerate(range(age_st, age_end, age_inc), start=1):
        # Rows whose age lies in [age_lo, age_lo + age_inc).
        in_age_bucket = (
            (cleaned_mkt_df['age'] >= age_lo)
            & (cleaned_mkt_df['age'] < age_lo + age_inc)
        )
        for _, row in cleaned_mkt_df[in_age_bucket].iterrows():
            no_calls = row['campaign']
            if not (1 <= no_calls <= calls_end):
                continue
            key = (
                str(row['marital']),
                str(row['job']),
                str(row['education']),
                age_range_group,
            )
            if key not in all_fs:
                all_fs[key] = {
                    'probs': [
                        {'y_count': 0, 'n_count': 0, 'no_calls': call_no, 'prob': 0.0, 'value': 0.0}
                        for call_no in range(1, calls_end + 1)
                    ]
                }
            # Cast after the range check so a float-typed 'campaign' column can
            # still be used as a list index (slot 0 holds call number 1).
            slot = all_fs[key]['probs'][int(no_calls) - 1]
            if row['y'] == 1:
                slot['y_count'] += 1
            else:
                slot['n_count'] += 1

    # Turn the raw counts into success ratios.
    for entry in all_fs.values():
        for slot in entry['probs']:
            slot['prob'] = compute_ratio(slot['y_count'], slot['n_count'])
    return all_fs

In [24]:
def visualize_value_calls_graph(mapping, i = 0):
    """Plot the stored per-call value of one feature set against the call number.

    Parameters
    ----------
    mapping : dict
        Must contain a ``'probs'`` list of dicts, each with a ``'value'``
        entry. Entry ``n`` (0-based) is plotted at call number ``n + 1``.
    i : int, optional
        Currently unused; kept so existing callers keep working (it was only
        referenced by a disabled ``savefig`` call that numbered output files).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    y = [item['value'] for item in mapping['probs']]
    # Size the x axis from the data instead of assuming exactly 10 calls, so
    # the plot also works when a different number of calls was analysed.
    x = np.arange(1, len(y) + 1)

    fig, ax = plt.subplots()
    ax.plot(x, y)

    ax.set(xlabel='Number of Calls', ylabel='Value (P(k)*v) - (k*c)', title='Potential Value Against the Number of Calls')
    ax.grid()
    plt.show()

In [52]:
def driver(mkt_df, no_calls_accepted, value_cost_ratio):
    """Run a 5-fold cross-validated evaluation of the call-value model.

    For every fold, success ratios are estimated on the training rows with
    ``compute_probs2``, each feature set is annotated with per-call values
    and an optimal call index, and the held-out rows are then scored. A
    summary of each fold is printed.

    Parameters
    ----------
    mkt_df : pandas.DataFrame
        Marketing data with the columns ``age``, ``job``, ``marital``,
        ``education``, ``campaign`` and ``y``.
    no_calls_accepted : int
        Largest number of calls (``campaign``) that is taken into account.
    value_cost_ratio : float
        Factor applied to a success ratio when converting it into a value.

    Returns
    -------
    None

    Notes
    -----
    Side effect: the ``y`` column of ``mkt_df`` is replaced in place by its
    label-encoded integers. Training and evaluation both test ``y == 1``, so
    the positive class is assumed to be encoded as 1.
    """
    print("Running driver code!\n")
    # Encode the y value as integers (in place, see Notes).
    mkt_df['y'] = LabelEncoder().fit_transform(mkt_df['y'])

    # Feature matrix without the trailing outcome column; KFold only needs it
    # to produce positional train/test indices.
    arr = mkt_df.values
    X = arr[:, :-1]
    kf = KFold(n_splits = 5)

    # This is for every iteration using cross validation (with n splits).
    for times_ran, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # Train phase: empirical success ratios per feature set and call number.
        train_data = compute_probs2(train_index, mkt_df, no_calls_accepted)
        _score_feature_sets(train_data, no_calls_accepted, value_cost_ratio)

        # Testing phase.
        summary = _evaluate_fold(
            mkt_df.iloc[test_index, :], train_data, no_calls_accepted, value_cost_ratio
        )

        print("Iteration: ", times_ran)
        print("Misses: ", summary['num_not_found'])
        print("Positive Outcomes: ", summary['pos_outcomes'])
        print("Actual Gain:", summary['actual_gain'])
        print("Theoretical Gain:", summary['theory_gain'])
        print('-----------------------------------')
        print("Negative Outcomes: ", summary['neg_outcomes'])
        print("\n\n")


def _score_feature_sets(train_data, no_calls_accepted, value_cost_ratio):
    """Store per-call values, ``k_optimal`` and ``value_overall`` on every feature set."""
    for entry in train_data.values():
        value = 0.0
        # 0-based index of the last call slot whose value was still
        # non-negative; -1 means no slot qualified. Tracking it explicitly
        # (instead of deriving it from the loop counter) keeps it correct when
        # the loop runs to completion without hitting a negative value.
        last_non_negative = -1
        # Walk the call slots in order and stop at the first negative value.
        for call_idx in range(no_calls_accepted):
            slot = entry['probs'][call_idx]
            # NOTE(review): the cost term is the 0-based slot index, so the
            # first call is costed at 0, while the evaluation's actual gain
            # subtracts the 1-based call count. Confirm which is intended.
            result = slot['prob'] * value_cost_ratio - call_idx
            slot['value'] = result
            if result < 0.0:
                break
            value += result
            last_non_negative = call_idx
        entry['k_optimal'] = last_non_negative
        entry['value_overall'] = value


def _evaluate_fold(test, train_data, no_calls_accepted, value_cost_ratio):
    """Score the held-out rows with ``y == 1`` against the trained feature sets."""
    num_not_found = 0
    pos_outcomes = 0
    neg_outcomes = 0
    actual_gain = 0.0
    theory_gain = 0.0
    for _, row in test.iterrows():
        no_calls = int(row['campaign'])
        if not (1 <= no_calls <= no_calls_accepted) or row['y'] != 1:
            continue
        # Same key layout as compute_probs2: ages 10-19 -> 1, 20-29 -> 2, ...
        key_ref = (
            str(row['marital']),
            str(row['job']),
            str(row['education']),
            int(row['age'] / 10),
        )
        dict_ref = train_data.get(key_ref)
        if dict_ref is None:
            # Feature set never seen during training.
            num_not_found += 1
            continue
        optimal_calls = dict_ref['k_optimal']
        # NOTE(review): optimal_calls is a 0-based slot index, no_calls a
        # 1-based call count; the comparison is kept as written.
        if optimal_calls <= no_calls:
            actual_gain += (dict_ref['probs'][no_calls - 1]['prob'] * value_cost_ratio) - no_calls
            # A negative index means no call slot had a non-negative value;
            # skip it rather than letting it wrap around to the last slot.
            if optimal_calls >= 0:
                theory_gain += (dict_ref['probs'][optimal_calls]['prob'] * value_cost_ratio) - optimal_calls
            pos_outcomes += 1
        else:
            neg_outcomes += 1
    return {
        'num_not_found': num_not_found,
        'pos_outcomes': pos_outcomes,
        'neg_outcomes': neg_outcomes,
        'actual_gain': actual_gain,
        'theory_gain': theory_gain,
    }

In [53]:
# Load the semicolon-delimited marketing data set that sits next to the
# notebook (current working directory) and run the cross-validated evaluation
# for up to 10 calls with a value/cost ratio of 100.
current_dir = os.getcwd()
mkt_df = load_file(current_dir + '/bank-full.csv')
driver(mkt_df, no_calls_accepted=10, value_cost_ratio=100)

Running driver code!

Iteration:  1
Misses:  12
Positive Outcomes:  100
Actual Gain: 621.1969563752689
Theoretical Gain: 487.8644636050126
-----------------------------------
Negative Outcomes:  185



Iteration:  2
Misses:  26
Positive Outcomes:  257
Actual Gain: 1440.320369535971
Theoretical Gain: 1985.63324985727
-----------------------------------
Negative Outcomes:  210



Iteration:  3
Misses:  11
Positive Outcomes:  240
Actual Gain: 1275.230842628669
Theoretical Gain: 1299.8306782360796
-----------------------------------
Negative Outcomes:  288



Iteration:  4
Misses:  46
Positive Outcomes:  287
Actual Gain: 3126.7477078074044
Theoretical Gain: 2169.9574751623777
-----------------------------------
Negative Outcomes:  727



Iteration:  5
Misses:  187
Positive Outcomes:  1210
Actual Gain: 14266.584611965585
Theoretical Gain: 8199.729327119921
-----------------------------------
Negative Outcomes:  1456



