In [3]:
import os
import math
import random
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from operator import itemgetter

In [5]:
def load_file(data_file_path):
    """Read a semicolon-separated CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_file_path : str or file-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first line used as the header.
    """
    # The source file separates fields with ';' rather than ','.
    return pd.read_csv(data_file_path, sep=";")

def compute_ratio(num_yes, num_no):
    """Return the share of "yes" outcomes among all recorded outcomes.

    Parameters
    ----------
    num_yes : int
        Number of positive outcomes.
    num_no : int
        Number of negative outcomes.

    Returns
    -------
    float
        ``num_yes / (num_yes + num_no)``, or ``0.0`` when both counts are
        zero (nothing was observed, so no rate can be estimated).
    """
    # Guard the empty case first so the division below never sees 0 / 0.
    if num_yes == 0 and num_no == 0:
        return 0.0
    return num_yes / (num_yes + num_no)

In [6]:
def find_feature_set_calls(train_data, lookup_key, no_calls):
    """Collect the entries for call numbers 1..``no_calls`` of one feature set.

    Parameters
    ----------
    train_data : dict
        Mapping from 5-element keys to stored entries. The fourth key element
        (index 3) is the call number.
    lookup_key : tuple
        A key identifying the feature set. Its element at index 3 is ignored
        and replaced by each call number in turn.
    no_calls : int
        Highest call number to collect (inclusive).

    Returns
    -------
    list or None
        ``None`` when ``lookup_key`` itself is not in ``train_data``;
        otherwise the entries for call numbers 1..``no_calls`` in ascending
        order.

    Raises
    ------
    KeyError
        If ``lookup_key`` is present but one of the derived per-call keys is
        missing from ``train_data``.
    """
    if lookup_key not in train_data:
        return None
    # Swap only the call-number slot; every other key element stays fixed.
    return [
        train_data[(lookup_key[0], lookup_key[1], lookup_key[2], call_no, lookup_key[4])]
        for call_no in range(1, no_calls + 1)
    ]

In [7]:
def compute_probs2(indicies, mkt_df, calls_end):
    """Build a success-rate table from the selected rows of ``mkt_df``.

    Rows are grouped by the key
    ``(marital, job, education, number_of_calls, age_range_group)`` and the
    positive / negative outcomes are counted per key.

    Parameters
    ----------
    indicies : sequence of int
        Positional row indices (used with ``.iloc``) selecting the rows of
        ``mkt_df`` to learn from.
    mkt_df : pandas.DataFrame
        Table with at least the columns ``age``, ``job``, ``marital``,
        ``education``, ``campaign`` and ``y``. A row counts as a success when
        its ``y`` value equals ``1``.
    calls_end : int
        Largest ``campaign`` value (inclusive) that is taken into account;
        rows outside ``1..calls_end`` are ignored.

    Returns
    -------
    dict
        Maps each key to ``{'y_count': int, 'n_count': int, 'prob': float}``.
        As soon as one call number of a (marital, job, education, age group)
        combination is seen, entries for *all* call numbers ``1..calls_end``
        of that combination are created, so unseen call numbers are present
        with zero counts and ``prob == 0.0``.

    Notes
    -----
    Ages are bucketed into ``[10, 20), [20, 30), ..., [90, 100)`` and numbered
    from 1. Rows with an age outside ``[10, 100)`` are dropped.
    """
    age_st = 10
    age_end = 100
    age_inc = 10

    data_df = mkt_df.iloc[indicies, :]

    # Rows with an undefined job, marital status or education are discarded
    # before any counting takes place.
    has_known_features = (
        (data_df['job'] != "unknown")
        & (data_df['marital'] != "unknown")
        & (data_df['education'] != "unknown")
    )
    cleaned_mkt_df = data_df[has_known_features]

    all_fs = {}

    # Count yes / no outcomes per key, one age bucket at a time.
    bucket_starts = range(age_st, age_end, age_inc)
    for age_range_group, age_lo in enumerate(bucket_starts, start=1):
        # Bucket is [age_lo, age_lo + age_inc); the width follows age_inc
        # instead of a separately hard-coded 10.
        in_bucket = (cleaned_mkt_df['age'] >= age_lo) & (cleaned_mkt_df['age'] < age_lo + age_inc)
        bucket = cleaned_mkt_df[in_bucket]

        # Iterate only over the columns that are needed; this avoids building
        # a full Series per row as iterrows() does.
        rows = zip(bucket['marital'], bucket['job'], bucket['education'],
                   bucket['campaign'], bucket['y'])
        for marital, job, education, no_calls, outcome in rows:
            if not (1 <= no_calls <= calls_end):
                continue
            marital, job, education = str(marital), str(job), str(education)
            key = (marital, job, education, no_calls, age_range_group)
            if key not in all_fs:
                # First sighting of this combination: create an entry for
                # every call number so later lookups never miss a key.
                for call_no in range(1, calls_end + 1):
                    all_fs[(marital, job, education, call_no, age_range_group)] = {
                        'y_count': 0, 'n_count': 0, 'prob': 0.0}
            if outcome == 1:
                all_fs[key]['y_count'] += 1
            else:
                all_fs[key]['n_count'] += 1

    # Turn the raw counts into success probabilities.
    for stats in all_fs.values():
        stats['prob'] = compute_ratio(stats['y_count'], stats['n_count'])
    return all_fs

In [14]:
import operator

def driver(mkt_df, no_calls_accepted, queue_size):
    """Run a 5-fold evaluation of the success-rate table and print a summary.

    For every fold, a probability table is built from the training rows with
    ``compute_probs2``. Each successful test row (``y == 1``) is then looked
    up in that table, the matched keys are ranked by their stored
    probability, and the mean probability of the top ``queue_size`` keys is
    printed.

    Parameters
    ----------
    mkt_df : pandas.DataFrame
        Marketing table with the columns ``age``, ``job``, ``marital``,
        ``education``, ``campaign`` and ``y``. NOTE: the ``y`` column is
        label-encoded **in place**, so the caller's frame is modified.
    no_calls_accepted : int
        Largest ``campaign`` value considered when building the table
        (forwarded to ``compute_probs2`` as ``calls_end``).
    queue_size : int
        Number of top-ranked keys to average over. If fewer keys matched,
        all of them are used.

    Returns
    -------
    None
        Results are reported with ``print`` only.
    """
    # Encode the target column as integers; the label that sorts last maps
    # to the highest code, which is what the ``== 1`` checks rely on.
    enc = LabelEncoder()
    enc.fit(mkt_df['y'])
    mkt_df['y'] = enc.transform(mkt_df['y'])

    # KFold only needs the number of rows, so the frame can be split directly.
    kf = KFold(n_splits=5)

    for train_index, test_index in kf.split(mkt_df):
        # Build the lookup table from the training part of this fold.
        train_data = compute_probs2(train_index, mkt_df, no_calls_accepted)
        test = mkt_df.iloc[test_index, :]

        q_valid_dict = {}
        num_not_found = 0
        print("Number of test entries: ", len(test_index))

        # Only successful test rows are scored.
        for index, row in test[test['y'] == 1].iterrows():
            age = row['age']
            job = str(row['job'])
            marital = str(row['marital'])
            education = str(row['education'])
            no_calls = int(row['campaign'])
            # NOTE(review): the lookup uses ``campaign + 1`` and ``int(age/10)``
            # as the age group; presumably this scores the *next* call for the
            # customer -- confirm the intent.
            key_ref_new = (marital, job, education, no_calls + 1, int(age/10))
            if key_ref_new in train_data:
                if key_ref_new in q_valid_dict:
                    q_valid_dict[key_ref_new]['count'] += 1
                else:
                    q_valid_dict[key_ref_new] = {'count': 1, 'prob': train_data[key_ref_new]['prob']}
            else:
                num_not_found += 1

        print("Number of test entries not found: ", num_not_found)

        # Rank matched keys by probability (descending). Sorting an empty
        # dict simply yields an empty list, so no special-casing is needed
        # and no stale ranking from a previous fold can leak in.
        sorted_queue = sorted(q_valid_dict.items(), key=lambda kv: kv[1]['prob'], reverse=True)

        # Honour queue_size and never read past the end of the ranking.
        top_entries = sorted_queue[:max(queue_size, 0)]
        if top_entries:
            avg = 0.0
            for _, stats in top_entries:
                avg += stats['prob']
            print("Average success rate for first", len(top_entries), "users is: ", avg/len(top_entries))
        else:
            print("No test entries matched the training table; average success rate unavailable.")
        print("-----------------------------------\n")

In [15]:
if __name__ == '__main__':
    # The data set is expected next to the notebook, in the working directory.
    current_dir = os.getcwd()
    mkt_df = load_file(current_dir + '/bank-full.csv')
    # Consider campaigns of up to 10 calls and average over the top 100 keys.
    driver(mkt_df, no_calls_accepted=10, queue_size=100)

Number of test entries:  9043
Number of test entries not found:  20
Average success rate for first 100 users is:  0.16954129133698634
-----------------------------------

Number of test entries:  9042
Number of test entries not found:  44
Average success rate for first 100 users is:  0.23738241992856485
-----------------------------------

Number of test entries:  9042
Number of test entries not found:  32
Average success rate for first 100 users is:  0.22180585938056402
-----------------------------------

Number of test entries:  9042
Number of test entries not found:  54
Average success rate for first 100 users is:  0.3557918288826646
-----------------------------------

Number of test entries:  9042
Number of test entries not found:  192
Average success rate for first 100 users is:  0.36318082508986776
-----------------------------------

