# Predicting Telephonic Marketing Outcome for Bank Term Deposit Final Notebook

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Utility-Functions-for-ML-Pipeline" data-toc-modified-id="Utility-Functions-for-ML-Pipeline-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Utility Functions for ML Pipeline</a></span></li><li><span><a href="#Final-call-to-ML-pipeline" data-toc-modified-id="Final-call-to-ML-pipeline-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Final call to ML pipeline</a></span></li><li><span><a href="#Conclusions" data-toc-modified-id="Conclusions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Conclusions</a></span></li><li><span><a href="#References" data-toc-modified-id="References-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>References</a></span></li></ul></div>

## Utility Functions for ML Pipeline

In [1]:
#function that will return the sine of the feature
def return_sin(x):
    """Return the element-wise sine of ``x``.

    ``x`` is handed straight to ``np.sin``, so a scalar yields a scalar and
    an array yields an array of the same shape.
    """
    sine_values = np.sin(x)
    return sine_values

In [2]:
#These functions will convert the numerical data to categorical data
#pdays, cons.price.idx, cons.conf.idx, nr.employed


#convert pdays to pdays_cat
def convert_pdays_cat(x):
    """Add a two-level ``pdays_cat`` column to ``x`` in place.

    Rows whose ``pdays`` value is <= 513 are labelled ``'pdays_cat_yes'``;
    every other row is labelled ``'pdays_cat_no'``. Nothing is returned.
    """
    # NOTE(review): the original note quotes 512.5 as the threshold while the
    # code compares against 513 - equivalent only if pdays holds whole
    # numbers; confirm against the data.
    at_or_below_threshold = x['pdays'] <= 513
    labels = np.where(at_or_below_threshold, 'pdays_cat_yes', 'pdays_cat_no')
    x['pdays_cat'] = labels


#convert cons.price.idx to cons.price.idx_cat
def convert_cons_price_idx_cat(x):
    """Add a two-level ``cons.price.idx_cat`` column to ``x`` in place.

    Rows whose ``cons.price.idx`` value is <= 92.868 are labelled
    ``'cons_price_idx_yes'``; every other row is labelled
    ``'cons_price_idx_no'``. Nothing is returned.
    """
    # NOTE(review): unlike the sibling converters these labels carry no
    # "_cat" infix. They are kept exactly as-is; presumably downstream
    # encoders were fitted on these tokens - confirm before renaming.
    at_or_below_threshold = x['cons.price.idx'] <= 92.868
    labels = np.where(at_or_below_threshold,
                      'cons_price_idx_yes',
                      'cons_price_idx_no')
    x['cons.price.idx_cat'] = labels


#convert cons.conf.idx to cons.conf.idx_cat
def convert_cons_conf_idx_cat(x):
    """Add a two-level ``cons.conf.idx_cat`` column to ``x`` in place.

    Rows whose ``cons.conf.idx`` value is <= -35.45 are labelled
    ``'cons_conf_idx_cat_no'``; every other row is labelled
    ``'cons_conf_idx_cat_yes'``. Note the polarity: here the low side maps
    to "no", the reverse of the other converters. Nothing is returned.
    """
    at_or_below_threshold = x['cons.conf.idx'] <= -35.45
    labels = np.where(at_or_below_threshold,
                      'cons_conf_idx_cat_no',
                      'cons_conf_idx_cat_yes')
    x['cons.conf.idx_cat'] = labels


#convert nr.employed to nr.employed_cat
def convert_nr_employed_cat(x):
    """Add a two-level ``nr.employed_cat`` column to ``x`` in place.

    Rows whose ``nr.employed`` value is <= 5087.65 are labelled
    ``'nr_employed_cat_yes'``; every other row is labelled
    ``'nr_employed_cat_no'``. Nothing is returned.
    """
    at_or_below_threshold = x['nr.employed'] <= 5087.65
    labels = np.where(at_or_below_threshold,
                      'nr_employed_cat_yes',
                      'nr_employed_cat_no')
    x['nr.employed_cat'] = labels

In [3]:
#given a dataframe this function will perform data preprocessing
def data_preprocessing(data):
    """Rename category levels so every level is a unique, single-token name.

    For each categorical column the generic levels ('unknown', 'yes', 'no',
    ...) get a column-specific suffix or prefix, and hyphens/dots inside
    level names are replaced by underscores.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns job, marital, education, default, housing,
        loan, contact, month, day_of_week and poutcome. A missing column
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the columns rewritten in place.
    """
    # One mapping per column. Values are matched exactly (no regex), and no
    # replacement value is itself a key, so applying a mapping in one pass
    # gives the same result as applying the replacements one by one.
    level_renames = {
        'job': {
            'unknown': 'unknown_job',
            'blue-collar': 'blue_collar',
            'self-employed': 'self_employed',
        },
        'marital': {
            'unknown': 'unknown_marital',
        },
        'education': {
            'unknown': 'unknown_education',
            'university.degree': 'university_degree',
            'high.school': 'high_school',
            'basic.9y': 'basic_9y',
            'professional.course': 'professional_course',
            'basic.4y': 'basic_4y',
            'basic.6y': 'basic_6y',
        },
        'default': {
            'unknown': 'unknown_default',
            'yes': 'yes_default',
            'no': 'no_default',
        },
        'housing': {
            'unknown': 'unknown_housing',
            'yes': 'yes_housing',
            'no': 'no_housing',
        },
        'loan': {
            'unknown': 'unknown_loan',
            'yes': 'yes_loan',
            'no': 'no_loan',
        },
        'contact': {
            'unknown': 'unknown_contact',
        },
        'month': {
            'unknown': 'unknown_month',
        },
        'day_of_week': {
            'unknown': 'unknown_day_of_week',
        },
        'poutcome': {
            'unknown': 'unknown_poutcome',
            'nonexistent': 'nonexistent_poutcome',
            'failure': 'failure_poutcome',
            'success': 'success_poutcome',
        },
    }

    # Assign the result back to the frame instead of calling
    # ``data[col].replace(..., inplace=True)``: the latter is chained
    # assignment, which pandas deprecates and which does not update the
    # frame when Copy-on-Write is active.
    for column, mapping in level_renames.items():
        data[column] = data[column].replace(mapping)
    return data

In [4]:
#this function will add the engineered features to the data
def add_engineered_featurs(data):
    """Append the engineered columns to ``data`` in place and return it.

    Two groups of columns are added:

    * the threshold-based categorical flags for pdays, cons.price.idx,
      cons.conf.idx and nr.employed (via the ``convert_*_cat`` helpers);
    * a ``<column>_sin`` column holding the sine of each numerical column.
    """
    # threshold-based categorical flags; each helper mutates `data`
    flag_builders = (
        convert_pdays_cat,
        convert_cons_price_idx_cat,
        convert_cons_conf_idx_cat,
        convert_nr_employed_cat,
    )
    for build_flag in flag_builders:
        build_flag(data)

    # numerical columns that receive a sine-transformed companion
    numeric_columns = (
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
    )
    for name in numeric_columns:
        data[name + '_sin'] = return_sin(data[name].values)
    return data

In [5]:
#This function standardizes the data in the given dataframe
#For train data, call the function with tag = 1
#For test/cv data, call the function with tag = 0
#The function returns the standardized data (a scipy sparse matrix) and the list of feature names
#standard_dict_cat is a dictionary of categorical feature names and their corresponding CountVectorizer objects
#standard_dict_num is a dictionary of numerical feature names and their corresponding StandardScaler objects
#When training data is provided, the function fills in the values of the dictionaries standard_dict_cat and standard_dict_num
# Initially both dictionaries are initialized as <'feature_name': 0>
# The values are filled in during the train process
# The same dictionaries are then used with the test/cv data to transform it


def _vectorizer_feature_names(vectorizer):
    """Return the vocabulary learned by ``vectorizer`` as a list of str."""
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever the installed version provides.
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return [str(name) for name in getter()]


def standardize_data(dataframe, tag, standard_dict_cat, standard_dict_num):
    """Encode categorical columns and scale numerical columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column named by the two dictionaries.
    tag : int
        1 -> training data: a new CountVectorizer / StandardScaler is fitted
        per column and stored in the dictionaries (overwriting the values).
        0 -> test/cv data: the transformers already stored in the
        dictionaries are applied without refitting.
    standard_dict_cat : dict
        Categorical column name -> CountVectorizer (any value when tag=1).
    standard_dict_num : dict
        Numerical column name -> StandardScaler (any value when tag=1).

    Returns
    -------
    (data_stand, features) : tuple
        ``data_stand`` is the horizontal stack (``scipy.sparse.hstack``) of
        all encoded categorical blocks followed by all scaled numerical
        columns, in dictionary order. ``features`` lists the matching
        feature names.
        For any other ``tag`` the string ``'Please input a valid tag'`` is
        returned instead (kept for backward compatibility).
    """
    # NOTE(review): callers unpack two values, so the string returned here
    # surfaces as an unpacking error; consider raising ValueError instead.
    if tag not in (0, 1):
        return 'Please input a valid tag'
    fit = tag == 1

    features = []  # feature names, in column order of the stacked matrix
    blocks = []    # per-column matrices to be stacked side by side

    # one-hot style encoding of the categorical columns
    for cat in standard_dict_cat:
        if fit:
            vectorizer = CountVectorizer()
            encoded = vectorizer.fit_transform(dataframe[cat])
            # remember the fitted vectorizer for later test/cv calls
            standard_dict_cat[cat] = vectorizer
        else:
            vectorizer = standard_dict_cat[cat]
            encoded = vectorizer.transform(dataframe[cat])
        features.extend(_vectorizer_feature_names(vectorizer))
        blocks.append(encoded)

    # scaling of the numerical columns
    for num in standard_dict_num:
        # the scalers are fed a single-column 2-D array
        column = dataframe[num].values.reshape(-1, 1)
        if fit:
            scaler = StandardScaler()
            scaled = scaler.fit_transform(column)
            # remember the fitted scaler for later test/cv calls
            standard_dict_num[num] = scaler
        else:
            scaled = standard_dict_num[num].transform(column)
        features.append(num)
        blocks.append(scaled)

    # Stack whatever number of blocks was produced; the previous hard-coded
    # array[0]..array[33] failed for fewer than 34 columns and silently
    # dropped any beyond the 34th.
    data_stand = sparse.hstack(blocks)
    return data_stand, features

In [6]:
#This function will return the standardization dict 
def get_standardization_dict():
    """Re-fit the per-column encoders/scalers on the training split.

    Reads ``bank-additional-full.csv``, removes duplicate rows, applies the
    preprocessing and feature-engineering steps, reproduces the
    train/cv/test split (stratified, ``random_state=4``) and fits the
    transformers on the training portion only.

    Returns
    -------
    (standard_dict_cat, standard_dict_num) : tuple of dict
        Column name -> fitted transformer, as filled in by
        ``standardize_data`` called with ``tag=1``.
    """
    #read data and drop duplicates
    frame = pd.read_csv('bank-additional-full.csv', sep=';').drop_duplicates()

    #encode the output variable to 0/1 form; it is only used for stratifying
    target = LabelEncoder().fit_transform(frame['y']).reshape(-1, 1)
    frame = frame.drop(columns=['y'])

    #preprocessing and engineered features (both work in place)
    data_preprocessing(frame)
    add_engineered_featurs(frame)

    #categorical features
    categorical = [
        'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
        'month', 'day_of_week', 'poutcome', 'pdays_cat', 'cons.price.idx_cat',
        'cons.conf.idx_cat', 'nr.employed_cat'
    ]

    #numerical features: the raw columns followed by their sine companions
    raw_numeric = [
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'
    ]
    numeric = raw_numeric + [name + '_sin' for name in raw_numeric]

    #placeholders; standardize_data replaces each 0 with a fitted transformer
    standard_dict_cat = dict.fromkeys(categorical, 0)
    standard_dict_num = dict.fromkeys(numeric, 0)

    #random stratified split: first carve out test, then cv from the remainder
    train_cv_frame, _, train_cv_target, _ = train_test_split(
        frame, target, stratify=target, test_size=0.3, random_state=4)
    train_frame, _, _, _ = train_test_split(
        train_cv_frame, train_cv_target, stratify=train_cv_target,
        test_size=0.3, random_state=4)

    #fit on the training data; only the filled dictionaries are needed
    standardize_data(train_frame, 1, standard_dict_cat, standard_dict_num)
    return standard_dict_cat, standard_dict_num

In [7]:
#this function will read the ML Model from the disk 
def return_ml_model():
    """Load and return the model pickled in ``finalized_model.sav``.

    The file is looked up relative to the current working directory.
    Errors from ``open`` (e.g. ``FileNotFoundError``) and from
    ``pickle.load`` are propagated unchanged.
    """
    filename = 'finalized_model.sav'
    # SECURITY: unpickling can execute arbitrary code - only load model
    # files that come from a trusted source.
    # The context manager guarantees the file handle is closed, which the
    # bare open() call did not.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    return model

In [8]:
#this is the final function 1
def final_function_1(data):
    """Predict the 0/1 outcome for raw input rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows in the layout of the bank-marketing CSV. The label columns
        ``y`` / ``output`` may be present; they are ignored. The caller's
        DataFrame is not modified.

    Returns
    -------
    numpy.ndarray
        One int (0 or 1) per row that remains after duplicate rows are
        dropped: 1 where the model's probability for its last class
        exceeds 0.5.
    """
    # NOTE(review): duplicates are dropped as before, so the result can be
    # shorter than the input when the input contains repeated rows.
    # Work on a copy: the original dropped rows/columns and added columns
    # directly on the caller's frame.
    frame = data.drop_duplicates()
    # The labels are never used for prediction, so do not require them
    # (the original raised KeyError when 'y' was absent).
    frame = frame.drop(columns=['y', 'output'], errors='ignore').copy()
    #preprocessing data
    data_preprocessing(frame)
    #add engineered features
    add_engineered_featurs(frame)
    #get the transformers fitted on the training split
    standard_dict_cat, standard_dict_num = get_standardization_dict()
    #standardize with tag=0, i.e. transform only
    x_data, _ = standardize_data(frame, 0, standard_dict_cat,
                                 standard_dict_num)
    #get model and threshold the probability of the last class at 0.5
    model = return_ml_model()
    positive_proba = model.predict_proba(x_data)[:, -1]
    y_Predicted = np.array(positive_proba > 0.5, dtype=int)
    return y_Predicted

In [9]:
#this is the final function 2
def final_function_2(data, y_data):
    """Predict the 0/1 outcome for raw rows and score it against ``y_data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows in the layout of the bank-marketing CSV. The caller's
        DataFrame is not modified.
    y_data : array-like
        True 0/1 labels, one per row of ``data``.

    Returns
    -------
    (f1, y_Predicted) : tuple
        ``f1`` is the weighted F1 score; ``y_Predicted`` holds one int
        (0 or 1) per row remaining after duplicate rows are dropped.
    """
    #import metric locally so a rebound module-level name cannot break it
    from sklearn.metrics import f1_score

    # Same rows drop_duplicates() would keep, but with the mask retained so
    # the labels can be filtered identically.
    keep_mask = ~data.duplicated().to_numpy()
    frame = data.loc[keep_mask].copy()

    #preprocess data
    data_preprocessing(frame)
    #add engineered features
    add_engineered_featurs(frame)
    #get the transformers fitted on the training split
    standard_dict_cat, standard_dict_num = get_standardization_dict()
    #standardize with tag=0, i.e. transform only
    x_data, _ = standardize_data(frame, 0, standard_dict_cat,
                                 standard_dict_num)
    #get model and threshold the probability of the last class at 0.5
    model = return_ml_model()
    positive_proba = model.predict_proba(x_data)[:, -1]
    y_Predicted = np.array(positive_proba > 0.5, dtype=int)

    # Keep labels aligned with the surviving rows; previously any dropped
    # duplicate left y_data longer than the predictions and scoring failed.
    y_true = np.asarray(y_data)
    if y_true.ndim > 0 and y_true.shape[0] == keep_mask.shape[0]:
        y_true = y_true[keep_mask]

    #get f1-score (stored under a name that does not shadow the function)
    score = f1_score(y_true, y_Predicted, average='weighted')
    return score, y_Predicted

## Final call to ML pipeline

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from itertools import combinations
import random
from scipy import sparse
from scipy.sparse import vstack

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import pickle

#read data
dff = pd.read_csv('bank-additional-full.csv', sep=';')
#drop duplicates
dff.drop_duplicates(inplace=True)
#sample data from the df (no random_state: a different sample on every run)
data = dff.sample(n=100)
#get output variable
y_label = data['y']
le = LabelEncoder()
#transform the output variable to 0/1 form
output = le.fit_transform(y_label)
data['output'] = output
y_data = data['output'].values.reshape(-1, 1)

#test final_fun_1
#each test gets its own copy so neither depends on side effects of the other
y_Predicted = final_function_1(data.copy())
#store the score under its own name: assigning to `f1_score` would replace
#the imported metric function with a float for the rest of the session
f1_fun_1 = f1_score(y_data, y_Predicted, average='weighted')
print('The f1-score is : ', f1_fun_1)

#test final_fun_2
f1_fun_2, y_Predicted = final_function_2(data.copy(), y_data)
print('The f1-score is : ', f1_fun_2)

The f1-score is :  0.9623920265780731
The f1-score is :  0.9623920265780731


## Conclusions

- In this notebook we implemented the ML pipeline for the Bank Marketing dataset
- The model used is the best-performing one among the models we experimented with

## References

- Applied AI Course
- https://www.kaggle.com/nextbigwhat/eda-for-categorical-variables-a-beginner-s-way
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
- https://seaborn.pydata.org/generated/seaborn.pairplot.html
- https://www.geeksforgeeks.org/permutation-and-combination-in-python/
- https://seaborn.pydata.org/examples/distplot_options.html
- https://stackoverflow.com/questions/14770735/how-do-i-change-the-figure-size-with-subplots
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
- https://towardsdatascience.com/how-to-visualize-a-decision-tree-in-5-steps-19781b28ffe2
- https://towardsdatascience.com/visualizing-decision-trees-with-python-scikit-learn-graphviz-matplotlib-1c50b4aa68dc
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html
- https://stackoverflow.com/questions/46659073/change-numerical-data-to-categorical-data-pandas
- https://thispointer.com/python-how-to-use-if-else-elif-in-lambda-functions/
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html
- https://stackoverflow.com/questions/13331698/how-to-apply-a-function-to-two-columns-of-pandas-dataframe
- https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
- https://towardsdatascience.com/normalization-vs-standardization-quantitative-analysis-a91e8a79cebf
- https://www.kaggle.com/discdiver/guide-to-scaling-and-standardizing
- https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
- https://sebastianraschka.com/Articles/2014_about_feature_scaling.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
- https://stackoverflow.com/questions/52410880/is-there-a-way-i-can-initialize-dictionary-values-to-0-in-python-taking-keys-fro
- https://stackoverflow.com/questions/22257836/numpy-hstack-valueerror-all-the-input-arrays-must-have-same-number-of-dimens
- https://stackoverflow.com/questions/26576524/how-do-i-transform-a-scipy-sparse-matrix-to-a-numpy-matrix
- https://docs.scipy.org/doc/numpy/reference/generated/numpy.hstack.html
- https://www.geeksforgeeks.org/append-extend-python/
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
- https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
- https://scikit-learn.org/stable/modules/naive_bayes.html#naive-bayes
- https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/keras/Input
- https://machinelearningmastery.com/keras-functional-api-deep-learning/
- https://github.com/pranaysawant/Zomato-Restaurant-Rate-Prediction/blob/master/Zomato%20Restaurant%20Rating%20Prediction.ipynb
- https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras
- https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
- https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
- http://rasbt.github.io/mlxtend/api_subpackages/mlxtend.classifier/