In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Columns removed right after loading: the client identifier and the two
# Naive_Bayes_Classifier_* columns present in the CSV.
discarded_columns = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
]
data = pd.read_csv("../input/credit-card-customers/BankChurners.csv").drop(columns=discarded_columns)

# Let's go through the insights of the DataFrame

In [None]:
### Share of each target class, as a percentage of all rows

data.Attrition_Flag.value_counts().div(data.shape[0]).mul(100)

In [None]:
## Count missing values per column, then show the dtype / non-null summary
null_counts = data.isnull().sum()
print(null_counts)
data.info()

In [None]:
# Separate the target column (y) from the remaining columns (X).
y = data["Attrition_Flag"]
X = data.drop(columns="Attrition_Flag")

In [None]:
# Horizontal bar chart of how many rows share each Customer_Age value.
X["Customer_Age"].value_counts().plot.barh()

In [None]:
# Split the predictor names by dtype: "object" columns vs. everything else.
# Note that float_columns holds every non-object column, not only floats.
is_object_dtype = (X.dtypes == "object").to_numpy()
object_columns = X.columns[is_object_dtype]
float_columns = X.columns[~is_object_dtype]

In [None]:
# Display the non-object columns of X.
X.loc[:, float_columns]

# Multicollinearity (VIF)

Although correlation matrix and scatter plots can also be used to find multicollinearity, their findings only show the bivariate relationship between the independent variables. VIF is preferred as it can show the correlation of a variable with a group of other variables.


Multicollinearity can be detected via various methods. In this article, we will focus on the most common one – VIF (Variance Inflation Factor).


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X, add_intercept=False):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Explanatory variables, one per column. The values are handed to
        statsmodels' ``variance_inflation_factor``.
    add_intercept : bool, default False
        When True, a column of ones is prepended to the design matrix before
        the factors are computed; no VIF is reported for that extra column.
        The default (False) passes ``X.values`` through unchanged, exactly as
        this function did before the option existed.
        NOTE(review): statsmodels does not appear to add an intercept on its
        own, so without one the values may not be the usual centered VIFs --
        confirm against the statsmodels documentation before relying on them.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"variables"`` (the column names of ``X``) and ``"VIF"``
        (the matching factor), in the column order of ``X``.
    """
    exog = X.values
    first_feature = 0
    if add_intercept:
        # Prepend the constant so feature i sits at position i + 1.
        exog = np.column_stack([np.ones(exog.shape[0]), exog])
        first_feature = 1

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [
        variance_inflation_factor(exog, first_feature + i)
        for i in range(X.shape[1])
    ]

    return vif

In [None]:
# VIF table for the non-object columns, ordered from lowest to highest VIF.
vif = calc_vif(X[float_columns]).sort_values("VIF")
vif

We can see here that 'Avg_Open_To_Buy', 'Total_Revolving_Bal', 'Credit_Limit', 'Customer_Age' and 'Months_on_book' have high VIF values, meaning they can be predicted from the other independent variables in the dataset. So we can drop those columns from our dataset.


In [None]:
# Names in the last five rows of the VIF table (the highest values, provided
# `vif` is still sorted ascending by VIF).
remove_column = vif["variables"].tail(5).tolist()
remove_column

In [None]:
# Remove the columns selected in remove_column from the predictors.
X = X.drop(columns=remove_column)

In [None]:
# Show the predictor and target shapes side by side.
print(f"{X.shape} {y.shape}")

# One Hot encoding for the object or categorical columns

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the object columns, dropping the first category of each one.
enc = OneHotEncoder(drop='first')
d = enc.fit(X[object_columns]).transform(X[object_columns])

Create a DataFrame from the one-hot encoder output and merge it with our original DataFrame.

In [None]:
# scikit-learn renamed `get_feature_names` to `get_feature_names_out`, and the
# old name is no longer available in recent releases. Prefer the new accessor
# and fall back to the old one only on versions that predate it.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Build the encoded frame on X's index so the later column-wise concat lines
# the rows up by label rather than relying on both frames having a default index.
OHE = pd.DataFrame(
    d.toarray(),
    columns=feature_namer(object_columns),
    index=X.index,
)

# The raw object columns are now represented by OHE, so drop them from X.
X = X.drop(object_columns, axis=1)

In [None]:
# Place the remaining predictors and the one-hot columns side by side.
X_feature = pd.concat((X, OHE), axis="columns")

# Label Encoding for our target columns

In [None]:
import category_encoders as ce

# Explicit label -> integer codes for the target column.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}

encoder = ce.OrdinalEncoder(
    cols=['Attrition_Flag'],
    mapping=[{'col': 'Attrition_Flag', 'mapping': attrition_codes}],
    return_df=True,
)

y = data.Attrition_Flag

# Encode the target with the mapping above.
y_target = encoder.fit_transform(y)


In [None]:
print(y_target.shape, X_feature.shape)

# Building the model and hyperparameter tuning (XGBoost algorithm)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_target, test_size=0.20, random_state=42)

# Baseline: XGBoost with default hyperparameters.
xgb_model = XGBClassifier().fit(X_train, y_train)

# predict
xgb_y_predict = xgb_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix.
# accuracy score
xgb_score = accuracy_score(y_test, xgb_y_predict)

print('Accuracy score is:', xgb_score)
target_names = ['class 0', 'class 1']
print(classification_report(y_test, xgb_y_predict, target_names=target_names))
print(confusion_matrix(y_test, xgb_y_predict))

# ROC AUC is computed from the true labels and the predicted probability of
# the positive class (column 1), not from hard 0/1 predictions.
roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

## Hyperparameter tuning for the XGBoost algorithm

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Candidate values sampled by the randomized search.
# Only values XGBoost accepts are listed: gamma must be >= 0 and
# colsample_bytree must lie in (0, 1]. Out-of-range candidates make the
# corresponding fits fail and waste search iterations.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25],
    "max_depth": [3, 4, 5, 6, 7, 8, 9, 12, 15],
    "min_child_weight": [1, 2, 3, 4, 5, 6, 7],
    "gamma": [0, 0.5, 1, 1.5, 2],
    "colsample_bytree": [0.3, 0.5, 0.7, 0.8, 1],
}

In [None]:
# Same 80/20 split as the baseline (identical seed), repeated so this cell
# can run on its own.
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_target, test_size=0.20, random_state=42)

xgb_model = XGBClassifier()

# Sample 5 parameter combinations and score each with 5-fold cross-validated
# ROC AUC. random_state fixes which combinations are drawn, so re-running the
# notebook evaluates the same candidates.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=params,
    n_iter=5,
    scoring="roc_auc",
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)

random_search.fit(X_train, y_train)

In [None]:
# Display the best estimator found by the randomized search.
random_search.best_estimator_

In [None]:
# Build the tuned classifier from the hyperparameters the search actually
# selected, instead of a hand-copied estimator repr from an earlier run. The
# copied repr goes stale whenever the search is re-run, and it pins
# constructor arguments that are specific to one XGBoost version.
# random_state=0 is kept from the previous hard-coded configuration.
xgb = XGBClassifier(random_state=0, **random_search.best_params_)

In [None]:
# Fit the tuned classifier on the training split.
xgb_model = xgb.fit(X_train, y_train)

# predict
xgb_y_predict = xgb_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix.
# accuracy score
xgb_score = accuracy_score(y_test, xgb_y_predict)

print('Accuracy score is:', xgb_score)
target_names = ['class 0', 'class 1']
print(classification_report(y_test, xgb_y_predict, target_names=target_names))
print(confusion_matrix(y_test, xgb_y_predict))

# ROC AUC is computed from the true labels and the predicted probability of
# the positive class (column 1), not from hard 0/1 predictions.
roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

# Imbalanced Data 
When one class has many more observations than the other classes, there is a class imbalance.

### Oversampling technique to address class imbalance

In [None]:
# Same 80/20 split as before (identical seed); only the training part is resampled.
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_target, test_size=0.20, random_state=42)

from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ros = RandomOverSampler(random_state=42)

# fit predictor and target
# The target is flattened to 1-D explicitly, so the sampler receives a plain
# label vector rather than an (n, 1) column and Counter sees scalar labels.
variablex_ros, y_ros = ros.fit_resample(X_train.to_numpy(), y_train.to_numpy().ravel())

print('Resample dataset shape', Counter(y_ros))

In [None]:
# Fit on the randomly oversampled training data (NumPy arrays).
xgb_model = xgb.fit(variablex_ros, y_ros)

# The model was trained on arrays, so the test features are passed as arrays too.
X_test_values = X_test.to_numpy()
y_true = y_test.to_numpy().reshape(-1,)

# predict
xgb_y_predict = xgb_model.predict(X_test_values)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix.
# accuracy score
xgb_score = accuracy_score(y_true, xgb_y_predict)

print('Accuracy score is:', xgb_score)
target_names = ['class 0', 'class 1']
print(classification_report(y_true, xgb_y_predict, target_names=target_names))
print(confusion_matrix(y_true, xgb_y_predict))

# ROC AUC is computed from the true labels and the predicted probability of
# the positive class (column 1), not from hard 0/1 predictions.
print(roc_auc_score(y_true, xgb_model.predict_proba(X_test_values)[:, 1]))

### SMOTE technique to address class imbalance

In [None]:
# Uses X_train, X_test, y_train, y_test from the earlier split.
# import library
from imblearn.over_sampling import SMOTE

# Seeded like the RandomOverSampler above so the synthetic samples, and the
# scores that depend on them, are reproducible across runs.
smote = SMOTE(random_state=42)

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X_train, y_train)

print('Original dataset shape', y_train.value_counts())
print('Resample dataset shape', y_smote.value_counts())

In [None]:
# Fit on the SMOTE-resampled training data.
xgb_model = xgb.fit(x_smote, y_smote)

# predict
xgb_y_predict = xgb_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix.
# accuracy score
xgb_score = accuracy_score(y_test, xgb_y_predict)

print('Accuracy score is:', xgb_score)
target_names = ['class 0', 'class 1']
print(classification_report(y_test, xgb_y_predict, target_names=target_names))
print(confusion_matrix(y_test, xgb_y_predict))

# ROC AUC is computed from the true labels and the predicted probability of
# the positive class (column 1), not from hard 0/1 predictions.
print(roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1]))