In [4]:
import os
import math
import random
import operator
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math, itertools
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from operator import itemgetter
from statistics import mean
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter

In [5]:
# Helper Functions

def load_file(data_file_path):
    """Read a semicolon-separated CSV file and return it as a DataFrame.

    data_file_path: anything ``pd.read_csv`` accepts as its first argument.
    """
    return pd.read_csv(data_file_path, sep=";")
  
    
def plot_graph_new(results, max_calls, list_passed, title):
    """Draw a line chart with one point per call number 1..max_calls.

    results: the y-values themselves when ``list_passed`` is truthy; otherwise
        an indexable whose first ``max_calls`` entries each expose an
        'expected' key that supplies the y-value.
    max_calls: number of points on the x-axis (also used for the tick marks).
    list_passed: selects how ``results`` is interpreted (see above).
    title: text shown above the chart.
    """
    call_numbers = list(range(1, max_calls + 1))
    values = (
        results
        if list_passed
        else [results[k]['expected'] for k in range(max_calls)]
    )

    plt.title(title)
    plt.plot(call_numbers, values)
    # Reference lines through the origin on both axes.
    plt.axvline(x=0, color="black")
    plt.axhline(y=0, color="black")
    plt.xticks(np.arange(1, max_calls + 1, 1))
    plt.show()
    

# def plot_graph_both_axes(ratios, no_calls):
    

def div(a, b):
    """Divide ``a`` by ``b``, returning 0.0 instead of failing when ``b`` is zero.

    The divisor is compared to zero directly rather than through ``int(b)``:
    truncating first made every fractional divisor with |b| < 1 count as zero
    (so ``div(1, 0.5)`` gave 0.0) and raised on NaN / infinite divisors.
    """
    if b == 0:
        return 0.0
    return a / b
    

# Used for creating all possible combinations of the features.
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    # Materialise the input so len() works for generators and other one-shot
    # iterables, not just sequences.
    s = list(iterable)
    # `combinations` must be reached through the `itertools` module: the bare
    # name is not imported anywhere visible in this file and raised NameError.
    return itertools.chain.from_iterable(
        itertools.combinations(s, r) for r in range(len(s) + 1)
    )


def convert(list):
    """Return the elements of the given iterable as a tuple.

    The parameter keeps its original name (which shadows the builtin ``list``
    inside this function) so keyword callers are unaffected.
    """
    return (*list,)


def construct_dict(feature_comb):
    """Map a positional feature combination onto named attributes.

    feature_comb: indexable with at least six entries; each entry is passed
        through ``convert`` (i.e. turned into a tuple).

    Returns a dict with the keys 'education', 'job', 'marital', 'default',
    'loan' and 'housing', inserted in that order.
    """
    # (attribute name, position inside feature_comb), in output key order.
    # NOTE(review): 'job' reads position 2 and 'marital' position 1 -- this
    # presumably mirrors the order in which the caller builds feature_comb;
    # confirm against the call site.
    layout = (
        ('education', 0),
        ('job', 2),
        ('marital', 1),
        ('default', 3),
        ('loan', 4),
        ('housing', 5),
    )
    return {name: convert(feature_comb[position]) for name, position in layout}


# This was the old metric (reward per call rate).
# def compute_expected_reward_feature_set_new(fs_df, no_calls_considered):
#     expected_values_calls = []
#     len_df = len(fs_df)
#     for i in range(1, no_calls_considered + 1):
#         expected_values_calls.append({'neg_value':0.0, 'pos_value':0.0, 'count':0, 'expected':0.0})
#         for index, row in fs_df.iterrows():
#             no_calls = row['campaign']
#             if no_calls <= i:
#                 if row['y'] == "yes":
#                     expected_values_calls[i-1]['pos_value'] += ((no_calls_considered+1) - no_calls)
#                 else:
#                     expected_values_calls[i-1]['neg_value'] += (-no_calls)
#             else:
#                 expected_values_calls[i-1]['neg_value'] += (-i)
#             expected_values_calls[i-1]['count'] += 1
#     for loc, item in enumerate(expected_values_calls):
#         expected_values_calls[loc]['expected'] = (expected_values_calls[loc]['pos_value'] + expected_values_calls[loc]['neg_value'])/len_df
#     return expected_values_calls


# This is the new metric (success per call rate).
def compute_expected_succ_per_call_rate_feature_set(fs_df, no_calls_considered):
    """Compute the success-per-call rate for every call cap 1..no_calls_considered.

    For a cap ``c`` each row of ``fs_df`` contributes:
      * ``campaign`` calls (and one success if ``y == "yes"``) when
        ``campaign <= c``;
      * ``c`` calls and no success otherwise.

    Returns a list with one dict per cap (index 0 is cap 1) holding 'succ',
    'total_calls' and 'expected' (= succ / total_calls via ``div``).
    """
    # Walk the frame once and reuse the (calls, outcome) pairs for every cap.
    observations = [(row['campaign'], row['y']) for _, row in fs_df.iterrows()]

    per_cap_stats = []
    for cap in range(1, no_calls_considered + 1):
        stats = {'succ': 0, 'total_calls': 0, 'expected': 0.0}
        for no_calls, outcome in observations:
            if no_calls <= cap:
                if outcome == "yes":
                    stats['succ'] += 1
                stats['total_calls'] += no_calls
            else:
                # Contact was still unresolved at the cap: charge the full cap.
                stats['total_calls'] += cap
        per_cap_stats.append(stats)

    for stats in per_cap_stats:
        stats['expected'] = div(stats['succ'], stats['total_calls'])
    return per_cap_stats


def compute_optimal_call_no(results):
    """Return the 0-based index of the entry with the largest 'expected' value.

    Ties resolve to the earliest index. No minimum-value threshold is applied.
    Raises ValueError when ``results`` is empty.
    """
    best_index, _ = max(enumerate(results), key=lambda pair: pair[1]['expected'])
    return best_index


# Given a dictionary of what attributes comprise a feature set, we can get all rows corresponding to this feature set.
def extract_rows_feature_set(fs_df, feature_labels = {'education':['tertiary', 'unknown'], 
                                                      'job':['management', 'technician', 'blue-collar'], 
                                                      'marital':['single'], 'default':['no'], 
                                                      'housing':['no'], 'loan':['no']}):
    """Return the rows of ``fs_df`` that belong to the given feature set.

    fs_df: DataFrame containing a column for every key of ``feature_labels``.
    feature_labels: dict mapping a column name to the collection of accepted
        values. A row is kept when, for every column listed, its value is one
        of that column's accepted values (OR within a column, AND across
        columns). The default dict is only read, never mutated.

    Membership is tested with ``Series.isin`` instead of a hand-assembled
    ``query`` string, so labels containing quotes and non-string labels work,
    and an empty label list selects no rows rather than producing an empty
    (invalid) query expression.
    """
    for column, accepted in feature_labels.items():
        fs_df = fs_df[fs_df[column].isin(list(accepted))]
    return fs_df


def find_matching_attribute_comb(row_value, all_combs):
    """Return the combination in ``all_combs`` that contains ``row_value``.

    Every combination is scanned; when several contain the value the LAST one
    wins. Returns None when nothing matches.
    """
    match = None
    for group in all_combs:
        if any(member == row_value for member in group):
            match = group
    return match


def compute_metric(df):
    """Return the success-per-call rate of ``df``.

    The rate is (number of rows with ``y == "yes"``) divided by the sum of the
    ``campaign`` column, computed through ``div``.
    """
    records = [record for _, record in df.iterrows()]
    successes = sum(1 for record in records if record['y'] == "yes")
    calls_made = sum(record['campaign'] for record in records)
    return div(successes, calls_made)

def compute_metric_2(df, max_calls=None):
    """Return the success-per-call rate of ``df`` with an optional per-row call cap.

    df: DataFrame with a 'y' column (success when equal to "yes") and a
        'campaign' column (number of calls made for that row).
    max_calls: when given, each row contributes at most ``max_calls`` calls to
        the denominator; when None (default) the raw 'campaign' value is used,
        which makes the result identical to ``compute_metric``.

    The previous body called ``min(row['campaign'], )`` with a single scalar
    argument, which raises TypeError on every row; the cap is now an explicit
    optional parameter.
    """
    total_calls = 0
    total_successes = 0
    for loc, row in df.iterrows():
        if row['y'] == "yes":
            # NOTE(review): successes are counted even when the row needed more
            # than max_calls calls, as in the original body -- confirm whether
            # such rows should be excluded when a cap is supplied.
            total_successes += 1
        calls = row['campaign']
        total_calls += calls if max_calls is None else min(calls, max_calls)
    return div(total_successes, total_calls)


def compute_metric_for_each_attribute(all_values, df, attrib):
    """Evaluate ``compute_metric`` on the rows matching each value of one column.

    all_values: values of column ``attrib`` to evaluate, one result row each.
    df: DataFrame to filter.
    attrib: column name used in the equality filter.

    Returns a float array of shape (len(all_values), 1) in the order of
    ``all_values``.
    """
    scores = np.zeros(shape=(len(all_values), 1))
    for position, value in enumerate(all_values):
        subset = df.query("{0} == '{1}'".format(attrib, value))
        scores[position] = compute_metric(subset)
    return scores


def compute_metric_for_each_attribute_range(all_values, df, attrib):
    """Evaluate ``compute_metric`` on the rows falling in each half-open range.

    all_values: sequence of bounds where ``value[0]`` is the inclusive lower
        bound and ``value[1]`` the exclusive upper bound for column ``attrib``.
    df: DataFrame to filter.
    attrib: column name the bounds apply to.

    Returns ``(scores, queries)``: a float array of shape (len(all_values), 1)
    and the list of query strings that were used, both in input order.
    """
    scores = np.zeros(shape=(len(all_values), 1))
    queries = []
    for position, bounds in enumerate(all_values):
        range_query = "{0} >= {1} & {2} < {3}".format(attrib, bounds[0], attrib, bounds[1])
        queries.append(range_query)
        scores[position] = compute_metric(df.query(range_query))
    return scores, queries

In [6]:
def find_combinations(sub_attributes, ratios, random_state=None):
    """Group sub-attributes whose metric values are similar, using 1-D K-Means.

    sub_attributes: labels, positionally aligned with ``ratios``.
    ratios: array-like of one metric value per sub-attribute (any shape that
        flattens to one value per label).
    random_state: optional seed forwarded to ``KMeans`` so the grouping is
        reproducible; None (default) keeps the unseeded behaviour.

    K-Means is run for every cluster count from 2 to n-1 and the clustering
    with the highest silhouette score is kept (the smallest cluster count wins
    ties). Returns a list of groups, each a list of ``str(label)``; the group
    index is the K-Means cluster id.

    With fewer than three values no cluster count in 2..n-1 exists, so nothing
    can be scored; every sub-attribute is then returned as its own group
    instead of failing with IndexError.
    """
    values = np.asarray(ratios)
    n_points = values.size
    if n_points < 3:
        return [[str(item)] for item in sub_attributes]
    points = values.reshape(-1, 1)

    # Track the best-scoring clustering; strict '>' keeps the earliest
    # (smallest) cluster count on ties.
    best_score = None
    best_labels = None
    best_clust_num = None
    for clust_num in range(2, n_points):
        kmeans = KMeans(n_clusters=clust_num, random_state=random_state)
        kmeans.fit(points)
        labels = kmeans.labels_
        score = silhouette_score(points, labels, metric='euclidean')
        if best_score is None or score > best_score:
            best_score, best_labels, best_clust_num = score, labels, clust_num

    # One bucket per cluster id; a bucket can stay empty if K-Means assigned
    # no point to that cluster.
    joined_sub_attributes = [[] for _ in range(best_clust_num)]
    for index, cluster_id in enumerate(best_labels):
        joined_sub_attributes[cluster_id].append(str(sub_attributes[index]))
    return joined_sub_attributes

# Example call: pass the sub-attribute labels and a NumPy array holding one metric value per label.
# find_combinations(['a', 'b', 'c', 'd'], np.array([1, 4, 7, 90]))