In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import auc, roc_auc_score, roc_curve
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline

# Data

In [None]:
# Load the raw churn dataset from a CSV file located in the working directory.
df = pd.read_csv('Churn Modeling.csv')

### Data types to consider for encoding methods and which columns are unnecessary

In [None]:
# Column dtypes: shows which columns are non-numeric and therefore need encoding.
df.dtypes

In [None]:
# Preview the first rows to see what each column contains.
df.head()

Based on the available columns:
- RowNumber is not needed
- CustomerId not needed for prediction 
- Surname should not matter for prediction
- Unsure whether Gender and Age can be used for prediction due to compliance constraints (assuming that at least Gender cannot be used)

### Check counts with describe for nulls of numerical values

In [None]:
# Summary statistics for the numeric columns; a `count` lower than the number
# of rows indicates missing values in that column.
df.describe()
# Also check whether the value ranges make sense from a business perspective.

In [None]:
# Per-column non-null counts and dtypes (covers the non-numeric columns too).
df.info()

In [None]:
# Number of non-null Geography entries, followed by the frequency of each value.
geography = df['Geography']
print(geography.count())
geography.value_counts()

There are only 3 missing values, all in CreditScore, so basic data quality is satisfactory.

### Check correlations for multicollinearity and data leakage issues

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# The frame still contains string columns (Surname, Geography, Gender) at this
# point. Recent pandas versions no longer drop those silently in `corr()` and
# raise instead, so restrict to numeric/bool columns explicitly; this yields
# the same matrix on older versions as well.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Build the response/design matrices for the VIF computation from a patsy
# formula of the form 'Exited ~ a+b+c' assembled from the predictor list.
vif_predictors = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
]
vif_formula = 'Exited ~ ' + '+'.join(vif_predictors)
y, X = dmatrices(vif_formula, data=df, return_type='dataframe')


In [None]:
# Variance inflation factor for every column of the design matrix, tabulated
# next to the column name.
design = X.values
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(design, col) for col in range(design.shape[1])],
    'variable': X.columns,
})
vif

There doesn't seem to be any leakage from a feature that is highly correlated with the Exited label. In particular IsActiveMember, which I expected to be highly correlated, is not, though that may only be due to the nature of the mock data. The VIF values do not indicate a multicollinearity issue.

### Check for potential imbalanced data issue

In [None]:
# Ratio of churned (Exited == 1) to retained (Exited == 0) customers, followed
# by the raw class counts.
exited_counts = df['Exited'].value_counts()
print(exited_counts[1] / exited_counts[0])
exited_counts

Ideally the classes would be balanced 1:1, but 1:4 is still a relatively acceptable ratio, and resampling techniques such as SMOTE and ADASYN can be used.

# Features
There are no time components so no time variant features

In [None]:
# Mean account balance for every (Geography, Exited) combination.
df.groupby(['Geography', 'Exited'])['Balance'].mean()

In [None]:
# Integer-encode the two categorical columns in place:
#   Geography: 'France' -> 0, 'Spain' -> 1, anything else -> 2
#   Gender:    'Male'   -> 0, anything else -> 1
# (Alternative considered earlier: one-hot encoding Geography with pd.get_dummies.)
#
# Each encoding is guarded by a dtype check so the cell is safe to re-run:
# without the guard, a second execution would compare the already-encoded
# integers with the string labels and collapse every row to the fallback code.
if not pd.api.types.is_numeric_dtype(df['Geography']):
    df['Geography'] = (
        df['Geography'].map({'France': 0, 'Spain': 1}).fillna(2).astype(int)
    )
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'Male': 0}).fillna(1).astype(int)

# Show a preview rather than the whole frame.
df.head()

In [None]:
# Engineered ratio features.
# Bal_Prod: balance per product held.
df['Bal_Prod'] = df['Balance'].div(df['NumOfProducts'])
# Ten2Age: tenure relative to age.
df['Ten2Age'] = df['Tenure'].div(df['Age'])
# Sal2Bal: note that, despite the name, this is Balance divided by EstimatedSalary.
df['Sal2Bal'] = df['Balance'].div(df['EstimatedSalary'])

### Preprocessing

Drop columns, and split dataset

In [None]:
# Separate the target from the features, dropping identifier-like columns,
# then hold out 30% of the rows as a test set (fixed seed for repeatability).
id_columns = ['RowNumber', 'CustomerId', 'Surname']
y = df['Exited']
X = df.drop(columns=id_columns + ['Exited'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)


In [None]:
# Class counts in the training split, then in the test split.
for labels in (y_train, y_test):
    print(labels.value_counts())

Define preprocessing functions to be put into pipeline

In [None]:
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTEENN 
from imblearn.under_sampling import ClusterCentroids 
# NOTE(review): the variable is called `adasyn`, but it holds a SMOTEENN
# resampler, not ADASYN. The name is kept because the pipeline cell refers to it.
# sampling_strategy=0.5 and random_state=0 are passed to SMOTEENN.
adasyn = SMOTEENN(sampling_strategy = 0.5, random_state=0)
# Alternative resampler tried earlier (cluster-centroid under-sampling):
# adasyn = ClusterCentroids(random_state=0)

# KNN-based imputer with default settings, used as a pipeline step for missing values.
imputer = KNNImputer()

# Models
Start with the most basic models (Logistic Regression and Random Forest); the data is low-dimensional, so dimensionality reduction is not really needed.

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn import svm

# Hyper-parameter grid for the pipeline's 'classifier' step (keys use the
# `<step>__<param>` convention). Every list holds a single value, so the grid
# search evaluates exactly one candidate.
# NOTE(review): the `rf_` prefix suggests a random forest, but the estimator
# defined below is an XGBoost classifier.
rf_param_grid = {
    # Resampler parameters that were explored earlier; currently disabled.
#     "adasyn__sampling_strategy": [0.4, 0.5],
#     "adasyn__n_neighbors": [4, 5, 6],
    "classifier__max_depth": [6],
    "classifier__n_estimators": [80],
    "classifier__learning_rate": [0.05],
    "classifier__gamma": [0.1]
    }
# XGBoost classifier configured with the same values as the grid above.
# NOTE(review): `use_label_encoder` is presumably deprecated or ignored in newer
# XGBoost releases — confirm against the installed version.
rf_classifier = xgb.XGBClassifier(max_depth= 6, n_estimators = 80, eval_metric=["error", 'logloss'],
                                           learning_rate = 0.05, gamma=0.1, random_state=0, 
                                  use_label_encoder = False)


Define pipeline 

In [None]:
from sklearn.preprocessing import RobustScaler


In [None]:
# Modelling pipeline: scale -> impute -> resample -> classify.
# The step names are what the grid-search parameter keys refer to.
pipeline_steps = [
    ('scaler', RobustScaler()),
    ('impute', imputer),
    ('adasyn', adasyn),
    ('classifier', rf_classifier),
]
pipe2 = Pipeline(steps=pipeline_steps)

In [None]:
# 5-fold grid search over the pipeline, scored by (and refit on) recall.
mod2 = GridSearchCV(
    estimator=pipe2,
    param_grid=rf_param_grid,
    scoring='recall',
    refit='recall',
    cv=5,
)


In [None]:
# Run the grid search on the training split and print the fitted search object.
print(mod2.fit(X_train, y_train))

In [None]:
# Look at the resampler and classifier steps of the refit best pipeline.
best_pipeline = mod2.best_estimator_
print(best_pipeline.named_steps['adasyn'])
best_pipeline.named_steps['classifier']

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Predict once per split and reuse the predictions for every metric below.
train_pred = mod2.predict(X_train)
test_pred = mod2.predict(X_test)

# `mod2.score` uses the scorer the search was configured with (recall).
print('recall')
print(mod2.score(X_test, y_test))

print('accuracy')
print(accuracy_score(y_test, test_pred))

# F1 on the training split, then on the test split.
print(f1_score(y_train, train_pred))
print(f1_score(y_test, test_pred))
#0.6265060240963856

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the training split first, then for the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(confusion_matrix(labels, mod2.predict(features)))

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 on the test split; `output_dict=True` makes the
# report a dict, so what gets printed is the dict representation.
print(classification_report(y_test, mod2.predict(X_test), output_dict = True))


In [None]:
# Mean cross-validated score (recall, per the search's `scoring`) for each grid candidate.
mod2.cv_results_['mean_test_score']

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve on the held-out test split.
# The whole fitted pipeline is passed, not just its 'classifier' step: the
# classifier was trained on scaled and imputed features, so feeding it the raw
# X_test directly would score it on inputs it was never fitted for and give a
# misleading curve.
display = PrecisionRecallDisplay.from_estimator(
    mod2.best_estimator_, X_test, y_test, name="classifier"
)
_ = display.ax_.set_title("2-class Precision-Recall curve")


In [None]:
# Precision-recall curve on the training split (for comparison with the test curve).
# The whole fitted pipeline is passed, not just its 'classifier' step: the
# classifier was trained on scaled and imputed features, so feeding it the raw
# X_train directly would score it on inputs it was never fitted for and give a
# misleading curve.
display = PrecisionRecallDisplay.from_estimator(
    mod2.best_estimator_, X_train, y_train, name="classifier"
)
_ = display.ax_.set_title("2-class Precision-Recall curve")


In [None]:
# One-row table of the fitted classifier's feature importances, with one
# column per training feature.
fitted_classifier = mod2.best_estimator_.named_steps['classifier']
feat_importance = pd.DataFrame(
    [fitted_classifier.feature_importances_], columns=X_train.columns
)
feat_importance

In [None]:
# train a model with single tree
import numpy as np
Xd = xgb.DMatrix(X_train, label=y_train)
Xt = xgb.DMatrix(X_test, label=y_test)

params = {
    "eta": 0.5,
    "max_depth": 7,
    "objective": "binary:logistic",
    "silent": 1,
    "base_score": np.mean(y_train),
    "eval_metric": ["logloss", "error"],
    "learning_rate": 0.3,
    "sampling_method": 'gradient_based'
}

model = xgb.train(params, Xd, 1, 
                  [(Xd, "train"), (Xt, "valid")], early_stopping_rounds=5
                 )
print("Model error =", np.linalg.norm(y_train-model.predict(Xd)))
print(model.get_dump(with_stats=True)[0])

In [None]:
import shap
# Predictions requested with output_margin=True (presumably the raw, untransformed
# margin rather than probabilities — confirm in the XGBoost docs).
pred = model.predict(Xd, output_margin=True)
# Tree-based SHAP explainer for the single-tree model trained above.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(Xd)
# Additivity check: per row, the SHAP values summed over features plus the
# explainer's expected value should match the margin prediction; this evaluates
# to the largest absolute discrepancy.
np.abs(shap_values.sum(1) + explainer.expected_value - pred).max()

In [None]:
# SHAP summary plot for the training rows; X_train supplies the feature values and names.
shap.summary_plot(shap_values, X_train)
