In [185]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# scikit-learn 0.23 removed the vendored `sklearn.externals.joblib`; prefer the
# standalone package and fall back to the vendored copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [186]:
import warnings
warnings.filterwarnings('ignore')

In [211]:
def cv_evaluator(model, x, y, folds, scoring='accuracy', n_jobs=1):
    """Return the mean cross-validated score of ``model`` on ``(x, y)``.

    Parameters
    ----------
    model
        Estimator handed to ``cross_val_score``.
    x, y
        Features and targets handed to ``cross_val_score``.
    folds
        Forwarded to ``cross_val_score`` as its ``cv`` argument.
    scoring : default 'accuracy'
        Forwarded as ``scoring``; the default keeps the original behaviour
        of reporting accuracy.
    n_jobs : default 1
        Forwarded as ``n_jobs``; the default keeps the original behaviour.

    Returns
    -------
    The ``.mean()`` of the per-fold scores returned by ``cross_val_score``.
    """
    fold_scores = cross_val_score(
        model,
        x,
        y,
        cv=folds,
        n_jobs=n_jobs,
        scoring=scoring,
    )
    return fold_scores.mean()

In [188]:
# Load the semicolon-delimited CSV, append `y_vals` (1 where `y` == 'yes',
# otherwise 0) and discard the `duration` column.
# NOTE(review): `duration` is presumably dropped because it is not known
# before a call is made -- confirm against the dataset description.
df = (
    pd.read_csv('../data/bank-additional-full.csv', delimiter=';')
    .assign(y_vals=lambda frame: (frame['y'] == 'yes').astype('int64'))
    .drop(columns='duration')
)

In [189]:
# Swap every '.' in the column labels for '_'.
df.columns = df.columns.map(lambda label: label.replace('.', '_'))

In [190]:
# Columns that are one-hot encoded further down.
categorical = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]

In [191]:
# Display the current column labels of df.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
       'euribor3m', 'nr_employed', 'y', 'y_vals'],
      dtype='object')

In [192]:
# Target as a NumPy array: 1 where column `y` equals 'yes', otherwise 0.
y_vals = (df['y'] == 'yes').astype('int64').values

In [193]:
# One-hot encode all categorical columns in a single call instead of growing
# the frame with concat inside a loop. get_dummies(columns=...) removes the
# encoded source columns itself, prefixes each dummy with its column name and
# appends the dummies after the untouched columns, so the resulting column
# set and order match the previous loop + drop.
df = pd.get_dummies(df, columns=categorical, drop_first=True)

In [194]:
# Labels of the model inputs: every column of df except the two target columns.
cols = df.columns.drop(['y', 'y_vals'])

In [195]:
# Model inputs are all columns except the raw and encoded targets.
feature_frame = df.drop(columns=['y', 'y_vals'])

# Seeded train/test split using train_test_split's default proportions.
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame, y_vals, random_state=20191102
)

# Shuffled, seeded 5-fold splitter reused by every cross-validation call below.
folds = KFold(n_splits=5, shuffle=True, random_state=20191102)

In [196]:
# Display the feature column labels of the training split (x_train is still a
# DataFrame here; the scaling cell below replaces it with the scaler's output).
x_train.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp_var_rate',
       'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'marital_unknown',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_unknown',
       'default_yes', 'housing_unknown', 'housing_yes', 'loan_unknown',
       'loan_yes', 'contact_telephone', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'poutcome_nonexistent', 'poutcome_success'],
  

In [197]:
# Fit the scaler on the training split only, then apply that fitted transform
# to both splits so the test split never influences the scaling parameters.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [198]:
# Persist the fitted scaler to disk alongside the exported models.
joblib.dump(value=scaler, filename='../MLModels/scaler.joblib')

['../MLModels/scaler.joblib']

## Logistic Regression

In [199]:
# Fit a logistic regression with default settings on the scaled training data
# and predict labels for the held-out test split.
log_model = LogisticRegression().fit(x_train, y_train)
prediction = log_model.predict(x_test)

In [200]:
# Persist the fitted logistic regression model to disk.
joblib.dump(value=log_model, filename='../MLModels/logistic_regression.joblib')

['../MLModels/logistic_regression.joblib']

In [201]:
# Confusion matrix on the test split, as a percentage of all test rows.
# Predictions come from log_model itself rather than the shared `prediction`
# variable, which the other model sections overwrite; re-running this cell out
# of order would otherwise print another model's matrix.
log_predictions = log_model.predict(x_test)
print(confusion_matrix(y_test, log_predictions) / len(log_predictions) * 100)

[[87.89938817  1.31106147]
 [ 8.3131009   2.47644945]]


In [213]:
# Mean cross-validated accuracy of the logistic regression on the training split.
# NOTE(review): x_train was scaled with statistics from the whole training
# split before this call, so validation folds have influenced the scaling.
log_cv_accuracy = cv_evaluator(log_model, x_train, y_train, folds=folds)
print(log_cv_accuracy)

0.8979313594207472


## Random Forests

In [202]:
# Seed the forest with the same seed used for the split and the folds, so the
# fitted model, the exported file and the reported metrics are reproducible
# across kernel restarts.
rf_model = RandomForestClassifier(n_estimators=100, random_state=20191102)
rf_model.fit(x_train, y_train)
prediction = rf_model.predict(x_test)

In [203]:
# Persist the fitted random forest model to disk.
joblib.dump(value=rf_model, filename='../MLModels/random_forests.joblib')

['../MLModels/random_forests.joblib']

In [204]:
# Confusion matrix on the test split, as a percentage of all test rows.
# Predictions come from rf_model itself rather than the shared `prediction`
# variable, which the other model sections overwrite; re-running this cell out
# of order would otherwise print another model's matrix.
rf_predictions = rf_model.predict(x_test)
print(confusion_matrix(y_test, rf_predictions) / len(rf_predictions) * 100)

[[86.49121103  2.71923861]
 [ 7.60415655  3.1853938 ]]


In [214]:
# Mean cross-validated accuracy of the random forest on the training split.
# NOTE(review): x_train was scaled with statistics from the whole training
# split before this call, so validation folds have influenced the scaling.
rf_cv_accuracy = cv_evaluator(rf_model, x_train, y_train, folds=folds)
print(rf_cv_accuracy)

0.8901944896222446


### Feature Importance using Random Forest

In [215]:
# Pair each fitted importance with its feature name and rank by the raw
# importance, highest first. Rounding happens only after sorting so that
# features which tie at 4 decimals keep their true relative order instead of
# being ordered by name.
ranked_features = sorted(
    zip(rf_model.feature_importances_, cols),
    key=lambda pair: pair[0],
    reverse=True,
)

# Same shape as before: a list of (importance rounded to 4 decimals, name).
feature_importance = [(round(score, 4), name) for score, name in ranked_features]

In [216]:
# Print the ten highest-ranked (importance, feature) pairs, one per line.
top_n = 10
for importance_and_name in feature_importance[:top_n]:
    print(importance_and_name)

(0.1739, 'age')
(0.1307, 'euribor3m')
(0.0868, 'campaign')
(0.0442, 'nr_employed')
(0.039, 'housing_yes')
(0.0344, 'pdays')
(0.0268, 'cons_conf_idx')
(0.0251, 'loan_yes')
(0.025, 'emp_var_rate')
(0.0241, 'poutcome_success')


## XGBoost

In [217]:
# Fit an XGBoost classifier with default settings on the scaled training data
# and predict labels for the held-out test split.
xgb_model = XGBClassifier().fit(x_train, y_train)
prediction = xgb_model.predict(x_test)

In [218]:
# Persist the fitted XGBoost model to disk.
joblib.dump(value=xgb_model, filename='../MLModels/xgb.joblib')

['../MLModels/xgb.joblib']

In [219]:
# Confusion matrix on the test split, as a percentage of all test rows.
# Predictions come from xgb_model itself rather than the shared `prediction`
# variable, which the other model sections overwrite; re-running this cell out
# of order would otherwise print another model's matrix.
xgb_predictions = xgb_model.predict(x_test)
print(confusion_matrix(y_test, xgb_predictions) / len(xgb_predictions) * 100)

[[87.89938817  1.31106147]
 [ 8.24511994  2.54443042]]


In [220]:
# Mean cross-validated accuracy of the XGBoost classifier on the training split.
# NOTE(review): x_train was scaled with statistics from the whole training
# split before this call, so validation folds have influenced the scaling.
xgb_cv_accuracy = cv_evaluator(xgb_model, x_train, y_train, folds=folds)
print(xgb_cv_accuracy)

0.8992586288492372
