In [76]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib

import matplotlib.pyplot as plt

In [77]:
import warnings
warnings.filterwarnings('ignore')

In [144]:
def cv_evaluator(model, x, y, folds):
    # Returns mean accuracy of n_folds cross validation
    mean_accuracy = cross_val_score(
        model, 
        x, 
        y, 
        cv=folds,
        n_jobs=1,
        scoring = 'accuracy'
    ).mean()
    
    return mean_accuracy

In [145]:
df = pd.read_csv('../data/bank-additional-full.csv', delimiter = ';')
df['y_vals'] = df['y'].apply(lambda x: 1 if x=='yes' else 0)
df.drop(columns='duration', inplace=True)

In [146]:
df.columns = [ c.replace('.','_') for c in df.columns]

In [147]:
categorical = ['job','marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

In [148]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
       'euribor3m', 'nr_employed', 'y', 'y_vals'],
      dtype='object')

In [149]:
y_vals = df['y'].apply(lambda x: 1 if x=='yes' else 0).values

In [150]:
for cat in categorical:
    df = pd.concat([df, pd.get_dummies(df[cat], prefix=cat, drop_first=True)], axis=1) 

df.drop(columns=categorical, inplace=True)

In [143]:
cols = df.drop(columns=['y']).columns

In [151]:
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['y']), 
    y_vals, 
    random_state=20191102
)
folds = KFold(n_splits=5, shuffle=True, random_state=20191102)

In [152]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [153]:
# Export Scaler
joblib.dump(scaler,  '../MLModels/scaler.joblib')

['../MLModels/scaler.joblib']

## Logistic Regression

In [138]:
log_model = LogisticRegression() 
log_model.fit(x_train, y_train)
prediction = log_model.predict(x_test)

In [139]:
joblib.dump(log_model, '../MLModels/logistic_regression.joblib')

['../MLModels/logistic_regression.joblib']

In [91]:
print(confusion_matrix(y_test, prediction)/len(prediction)*100)

[[89.21044965  0.        ]
 [ 0.         10.78955035]]


## Random Forests

In [92]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(x_train, y_train)
prediction = rf_model.predict(x_test)

In [93]:
joblib.dump(rf_model, '../MLModels/random_forests.joblib')

['../MLModels/random_forests.joblib']

In [95]:
print(confusion_matrix(y_test, prediction)/len(prediction)*100)

[[89.21044965  0.        ]
 [ 0.         10.78955035]]


#### Feature Importance using RF

In [96]:
feature_importance = sorted(
    zip(
        map(lambda x: round(x, 4), 
            rf_model.feature_importances_
           ), cols), 
             reverse=True
)

In [97]:
# Top 10 important features
for f in feature_importance[:10]: print(f)

(0.7764, 'y_vals')
(0.0375, 'euribor3m')
(0.0353, 'nr.employed')
(0.0201, 'pdays')
(0.0174, 'age')
(0.0146, 'poutcome_success')
(0.0137, 'emp.var.rate')
(0.0124, 'cons.conf.idx')
(0.008, 'cons.price.idx')
(0.0071, 'campaign')


### XGBoost

In [98]:
xgb_model = XGBClassifier()
xgb_model.fit(x_train,y_train)
prediction = xgb_model.predict(x_test)

In [99]:
joblib.dump(xgb_model, '../MLModels/xgb.joblib')

['../MLModels/xgb.joblib']

In [102]:
print(confusion_matrix(y_test, prediction)/len(prediction)*100)

[[89.21044965  0.        ]
 [ 0.         10.78955035]]
