In [1]:
# saves you having to use print as all exposed variables are printed in the cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# suppress warning message
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning)

## Business Objective
### Churn prediction

Predict if a bank's customers is likely to leave or not

### Setup

In [3]:
import os
from ds_capability import FeatureSelect, FeatureEngineer, FeatureTransform, AutoML, Controller

In [4]:
os.environ['HADRON_PM_PATH'] = './hadron/churn'

os.environ['HADRON_CHURN_SOURCE_PATH'] = 'https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/datasets/toy_sample/churn.csv'
os.environ['HADRON_CHURN_PERSIST_PATH'] = './hadron/data/hadron_docs_churn_predict.parquet'

## Exploratory Data Analysis

In [5]:
fs = FeatureSelect.from_memory()

tbl = fs.set_source_uri('${HADRON_CHURN_SOURCE_PATH}').load_source_canonical()

In [6]:
fs.canonical_report(tbl)

Unnamed: 0,Attributes,DataType,Nulls,Dominate,Valid,Unique,Observations
0,RowNumber,int64,0.0%,0.0%,10000,10000,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15..."
1,CustomerId,int64,0.0%,0.0%,10000,10000,"[15634602, 15647311, 15619304, 15701354, 15737888,..."
2,Surname,string,0.0%,0.3%,10000,2932,"['Smith', 'Scott', 'Martin', 'Walker', 'Brown', 'S..."
3,CreditScore,int64,0.0%,2.3%,10000,460,"[850, 678, 655, 667, 705, 684, 670, 651, 660, 652,..."
4,Geography,string,0.0%,50.1%,10000,3,"['France', 'Germany', 'Spain']"
5,Gender,string,0.0%,54.6%,10000,2,"['Male', 'Female']"
6,Age,int64,0.0%,4.8%,10000,70,"[37, 38, 35, 36, 34, 33, 40, 39, 32, 31, 41, 29, 3..."
7,Tenure,int64,0.0%,10.5%,10000,11,"[2, 1, 7, 8, 5, 3, 4, 9, 6, 10, 0]"
8,Balance,double,0.0%,36.2%,10000,6382,"[0.0, 130170.82, 105473.74, 83807.86, 159660.8, 12..."
9,NumOfProducts,int64,0.0%,50.8%,10000,4,"[1, 2, 3, 4]"


## Preprocessing

### Feature Selection

In [7]:
fs = FeatureSelect.from_env('churn', has_contract=False)

fs.set_source_uri('${HADRON_CHURN_SOURCE_PATH}')
fs.set_persist_uri('event://select')

# extract label
fs.add_connector_uri('label', uri='event://label')

tbl = fs.load_source_canonical()

<ds_capability.components.feature_select.FeatureSelect at 0x7fd4e291b790>

<ds_capability.components.feature_select.FeatureSelect at 0x7fd4e291b790>

<ds_capability.components.feature_select.FeatureSelect at 0x7fd4e291b790>

In [8]:
# label
label = fs.tools.auto_drop_columns(tbl, headers=['Exited'], drop=True)
fs.save_canonical('label', label)

In [9]:
tbl = fs.tools.auto_drop_columns(tbl, headers=['Surname', 'RowNumber', 'Exited'])

In [10]:
fs.run_component_pipeline()

### Feature Transformation

In [11]:
ft = FeatureTransform.from_env('churn', has_contract=False)

ft.set_source_uri('event://select')
ft.set_persist_uri('event://transform')

tbl = ft.load_source_canonical()

<ds_capability.components.feature_transform.FeatureTransform at 0x7fd4e492ebc0>

<ds_capability.components.feature_transform.FeatureTransform at 0x7fd4e492ebc0>

#### Encode

In [12]:
# robust encode to negate outliers
tbl = ft.tools.encode_category_one_hot(tbl, headers=['Gender', 'Geography'], drop_first=True)

#### Discretize

In [13]:
tbl = ft.tools.discrete_quantiles(tbl, header='CreditScore', interval=6, categories=False, to_header='DiscreteCredit')
tbl = ft.tools.discrete_quantiles(tbl, header='Age', interval=8, categories=False, to_header='DiscreteAge')
tbl = ft.tools.discrete_quantiles(tbl, header='EstimatedSalary', interval=10, categories=False, to_header='DiscreteSalary')

# sparse data so rank values to negate predominance of zero's
tbl = ft.tools.discrete_quantiles(tbl, header='Balance', interval=5, categories=False, duplicates='rank', to_header='DiscreteBalance')

#### Scale

In [14]:
# hypothesis: customers that are older have better tenure
tbl = ft.tools.scale_mapping(tbl, 'Tenure', 'Age', to_header='TenureAge')

tbl = ft.tools.scale_normalize(tbl, scalar='robust', headers=['CreditScore','Age','Balance','EstimatedSalary','TenureAge'])

In [15]:
ft.run_component_pipeline()

## Model Discovery

In [16]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score,recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBClassifier

In [17]:
aml = AutoML.from_env('churn', has_contract=False)

aml.set_source_uri('event://transform')
aml.set_persist_uri('event://automl')

tbl = aml.load_source_canonical()
aml.save_persist_canonical(tbl)

<ds_capability.components.automl.AutoML at 0x7fd4e4993100>

<ds_capability.components.automl.AutoML at 0x7fd4e4993100>

In [18]:
X = np.asarray(tbl.drop_columns('CustomerId'))
y = np.asarray(label)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [19]:
models = [('LR', LogisticRegression(random_state=123456)),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier(random_state=123456)),
          ('RF', RandomForestClassifier(random_state=123456)),
          ('SVR', SVC(gamma='auto',random_state=123456)),
          ('XGB', XGBClassifier(random_state=123456)),
          ('GB', GradientBoostingClassifier(random_state=123456))]
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X, y, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.811500 (0.009405)
KNN: 0.831800 (0.010352)
CART: 0.785900 (0.008904)
RF: 0.863800 (0.009867)
SVR: 0.851200 (0.008085)
XGB: 0.852600 (0.008345)
GB: 0.864600 (0.009457)


In [20]:
# GB Confusion Matrix
model_GB = GradientBoostingClassifier(random_state=12345)
model_GB.fit(X_train, y_train)
y_pred = model_GB.predict(X_test)
conf_mat = confusion_matrix(y_pred,y_test)
conf_mat

array([[769, 100],
       [ 30, 101]])

In [21]:
print("True Positive : ", conf_mat[1, 1])
print("True Negative : ", conf_mat[0, 0])
print("False Positive: ", conf_mat[0, 1])
print("False Negative: ", conf_mat[1, 0])

True Positive :  101
True Negative :  769
False Positive:  100
False Negative:  30


In [22]:
# Classification Report for XGB Model
print(classification_report(model_GB.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       869
           1       0.50      0.77      0.61       131

    accuracy                           0.87      1000
   macro avg       0.73      0.83      0.77      1000
weighted avg       0.90      0.87      0.88      1000



## Classifier Predict

In [23]:
# reset the connectors
aml.set_source_uri('event://transform')
aml.set_persist_uri('${HADRON_CHURN_PERSIST_PATH}')

tbl = aml.load_source_canonical()

<ds_capability.components.automl.AutoML at 0x7fd4e4993100>

<ds_capability.components.automl.AutoML at 0x7fd4e4993100>

In [24]:
aml.add_trained_model(model_name='GradientBoost', trained_model=model_GB)

In [25]:
predict = aml.tools.label_predict(tbl, model_name='GradientBoost', id_header='CustomerId')

In [26]:
aml.run_component_pipeline()

## Controller

In [27]:
ctrl = Controller.from_env(has_contract=False)

In [28]:
ctrl.register.feature_select('churn')
ctrl.register.feature_transform('churn')
ctrl.register.automl('churn')

(10000, 11)

(10000, 14)

(10000, 2)

In [29]:
ctrl.run_controller()

### Review Run

In [30]:
AutoML.from_env('churn').load_persist_canonical()

pyarrow.Table
CustomerId: int64
predict: int64
----
CustomerId: [[15634602,15647311,15619304,15701354,15737888,...,15606229,15569892,15584532,15682355,15628319]]
predict: [[0,0,1,0,0,...,0,0,0,0,0]]