## Import packages

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc

## Read data

In [None]:
# Path of the raw CSV, named once so it is easy to spot and change.
DATA_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'

df_original = pd.read_csv(DATA_PATH)
df_original.head()

The last two columns are dropped because they were added by the author of this dataset.

The 'CLIENTNUM' column, which holds the client number, is also dropped.

In [None]:
# Build the working frame in a single chained expression. The previous version
# sliced `df_original` and then called `drop(..., inplace=True)` on that slice,
# which modifies a derived frame in place and can raise SettingWithCopyWarning.
# `drop` without `inplace` returns a new frame, so `df_original` is untouched
# and re-running this cell always gives the same `df`.
df = (
    df_original
    .iloc[:, :-2]                 # everything except the last two columns
    .drop(columns=['CLIENTNUM'])  # remove the client-number column
)
df.head(4)

There are no missing values:

In [None]:
# Count NaN cells per column, then add the per-column counts into one total.
missing_per_column = df.isna().sum()
missing_per_column.sum()

Inspecting each column.

In [None]:
# Plain Python list of every column label in the working frame.
col_names = list(df.columns)
print(col_names)

In [None]:
# Frequency of each distinct value in the target column 'Attrition_Flag'.
df.loc[:, 'Attrition_Flag'].value_counts()

In [None]:
# Summary statistics for 'Customer_Age'.
df.loc[:, 'Customer_Age'].describe()

In [None]:
# Frequency of each distinct value in 'Gender'.
df.loc[:, 'Gender'].value_counts()

In [None]:
# Frequency of each distinct value in 'Dependent_count'.
df.loc[:, 'Dependent_count'].value_counts()

In [None]:
# Frequency of each distinct value in 'Education_Level'.
df.loc[:, 'Education_Level'].value_counts()

In [None]:
# Frequency of each distinct value in 'Marital_Status'.
df.loc[:, 'Marital_Status'].value_counts()

In [None]:
# Frequency of each distinct value in 'Income_Category'.
df.loc[:, 'Income_Category'].value_counts()

In [None]:
# Frequency of each distinct value in 'Card_Category'.
df.loc[:, 'Card_Category'].value_counts()

In [None]:
# Summary statistics for 'Months_on_book'.
df.loc[:, 'Months_on_book'].describe()

In [None]:
# Frequency of each distinct value in 'Total_Relationship_Count'.
df.loc[:, 'Total_Relationship_Count'].value_counts()

In [None]:
# Frequency of each distinct value in 'Months_Inactive_12_mon'.
df.loc[:, 'Months_Inactive_12_mon'].value_counts()

In [None]:
# Frequency of each distinct value in 'Contacts_Count_12_mon'.
df.loc[:, 'Contacts_Count_12_mon'].value_counts()

In [None]:
# Summary statistics for 'Credit_Limit'.
df.loc[:, 'Credit_Limit'].describe()

In [None]:
# Summary statistics for 'Total_Revolving_Bal'.
df.loc[:, 'Total_Revolving_Bal'].describe()

In [None]:
# Summary statistics for 'Avg_Open_To_Buy'.
df.loc[:, 'Avg_Open_To_Buy'].describe()

In [None]:
# Summary statistics for 'Total_Amt_Chng_Q4_Q1'.
df.loc[:, 'Total_Amt_Chng_Q4_Q1'].describe()

In [None]:
# Summary statistics for 'Total_Trans_Amt'.
df.loc[:, 'Total_Trans_Amt'].describe()

In [None]:
# Summary statistics for 'Total_Trans_Ct'.
df.loc[:, 'Total_Trans_Ct'].describe()

In [None]:
# Summary statistics for 'Total_Ct_Chng_Q4_Q1'.
df.loc[:, 'Total_Ct_Chng_Q4_Q1'].describe()

In [None]:
# Summary statistics for 'Avg_Utilization_Ratio'.
df.loc[:, 'Avg_Utilization_Ratio'].describe()

## Decision Tree Classifier

Columns that contain categorical data.

In [None]:
# Treat every non-numeric column as categorical. Comparing `df.dtypes` to
# `object` only matches the legacy object dtype and misses columns stored with
# a dedicated string or category dtype, which would then reach the classifier
# unencoded. Excluding numeric dtypes selects the same columns for an
# all-object/numeric frame and stays correct for the other text dtypes.
cat_col_names = df.select_dtypes(exclude='number').columns.tolist()
cat_col_names

Encoding categorical data

In [None]:
# Fit one LabelEncoder per categorical column and keep it, keyed by column
# name. Re-fitting a single shared encoder (as before) overwrites its
# `classes_` on every column, so only the last column's label-to-integer
# mapping survives and codes cannot be translated back to labels.
# With the encoders retained, `label_encoders[col].classes_` shows the mapping
# and `label_encoders[col].inverse_transform(...)` recovers the labels.
# NOTE(review): the target 'Attrition_Flag' is encoded here as well; check its
# `classes_` to confirm which label becomes 1 (the class treated as positive
# by the ROC computations further down).
label_encoders = {}
for cat_col in cat_col_names:
    label_encoders[cat_col] = LabelEncoder()
    df[cat_col] = label_encoders[cat_col].fit_transform(df[cat_col])

In [None]:
# Separate the target column from the feature matrix.
target_col = 'Attrition_Flag'
y = df[target_col]
x = df.drop(columns=[target_col])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split_parts

Decision tree classifier with default parameters.

In [None]:
# Baseline: a decision tree with default hyperparameters.
model_DecTreeClass = DecisionTreeClassifier(random_state=0)
model_DecTreeClass.fit(x_train, y_train)
y_pred_DecTreeClass = model_DecTreeClass.predict(x_test)

# roc_curve expects a continuous score for the positive class so it can sweep
# decision thresholds. Hard class predictions collapse the curve to a single
# operating point, so the AUC is computed from the predicted probability of
# the second class (column 1 of predict_proba) instead.
y_score_DecTreeClass = model_DecTreeClass.predict_proba(x_test)[:, 1]

false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_DecTreeClass)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC: %.2f' % (roc_auc*100), "%")

List of features sorted by their importance.

In [None]:
# Pair each feature name with the importance the fitted tree assigned to it.
feat_imp_dic = {
    feature: importance
    for feature, importance in zip(x.columns, model_DecTreeClass.feature_importances_)
}
# Order the (feature, importance) pairs from most to least important.
feat_imp_sorted = sorted(
    feat_imp_dic.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
pd.DataFrame(feat_imp_sorted, columns=['Feature', 'Feature importance'])

## Hyperparameter Tuning

In [None]:
def train_test_scores(x_train, x_test, y_train, y_test, model):
    """Fit ``model`` and return its ROC AUC on the training and test sets.

    Parameters
    ----------
    x_train, x_test : feature matrices passed to ``model.fit`` and to the
        model's prediction method.
    y_train, y_test : true labels passed to ``roc_curve``.
    model : estimator exposing ``fit`` and either ``predict_proba`` or
        ``predict``. It is fitted in place on the training data.

    Returns
    -------
    tuple of (train_auc, test_auc)
    """
    model.fit(x_train, y_train)

    def _positive_scores(features):
        # roc_curve needs a continuous score to sweep thresholds. Hard class
        # labels give a single operating point and misstate the AUC for any
        # model whose leaves are not pure, so use the probability of the
        # second class when available and fall back to predict otherwise.
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(features)[:, 1]
        return model.predict(features)

    fp_train, tp_train, _ = roc_curve(y_train, _positive_scores(x_train))
    fp_pred, tp_pred, _ = roc_curve(y_test, _positive_scores(x_test))
    return auc(fp_train, tp_train), auc(fp_pred, tp_pred)

In [None]:
# Sweep the tree depth from 1 to 30 using the default (Gini) criterion.
max_depths = range(1, 31)

train_results, test_results = [], []
for max_depth in max_depths:
    model = DecisionTreeClassifier(random_state=0, max_depth=max_depth)
    auc_train, auc_pred = train_test_scores(
        x_train, x_test, y_train, y_test, model)
    train_results += [auc_train]
    test_results += [auc_pred]

# Train vs. test AUC as a function of depth, drawn on an explicit Axes.
fig, ax = plt.subplots()
line1, = ax.plot(max_depths, train_results, 'b', label='Train AUC')
line2, = ax.plot(max_depths, test_results, 'r', label='Test AUC')
ax.legend()
ax.set_xlabel('Tree depth')
ax.set_ylabel('AUC score')
ax.set_title('Impurity measure - Gini')
plt.show()

In [None]:
# Sweep the tree depth from 1 to 30 using the entropy criterion.
max_depths = range(1, 31)

train_results, test_results = [], []
for max_depth in max_depths:
    model = DecisionTreeClassifier(
        random_state=0, max_depth=max_depth, criterion='entropy')
    auc_train, auc_pred = train_test_scores(
        x_train, x_test, y_train, y_test, model)
    train_results += [auc_train]
    test_results += [auc_pred]

# Train vs. test AUC as a function of depth, drawn on an explicit Axes.
fig, ax = plt.subplots()
line1, = ax.plot(max_depths, train_results, 'b', label='Train AUC')
line2, = ax.plot(max_depths, test_results, 'r', label='Test AUC')
ax.legend()
ax.set_xlabel('Tree depth')
ax.set_ylabel('AUC score')
ax.set_title('Impurity measure - Entropy')
plt.show()

In [None]:
# Sweep max_features from 1 up to the number of feature columns. The upper
# bound is read from the training matrix rather than hard-coded, so the sweep
# stays correct if columns are added to or removed from the feature set.
n_features = x_train.shape[1]
max_features_list = range(1, n_features + 1)

train_results = []
test_results = []
for max_features in max_features_list:
    model = DecisionTreeClassifier(max_features=max_features, random_state=0)
    auc_train, auc_pred = train_test_scores(x_train, x_test, y_train, y_test, model)
    train_results.append(auc_train)
    test_results.append(auc_pred)

# Train vs. test AUC for each number of features considered per split.
line1, = plt.plot(max_features_list, train_results, 'b', label='Train AUC')
line2, = plt.plot(max_features_list, test_results, 'r', label='Test AUC')
plt.legend()
plt.xlabel('Number of used features')
plt.xticks(max_features_list)
plt.ylabel('AUC score')
plt.show()


In [None]:
# Sweep min_samples_split over 100 evenly spaced float values from 0.05 to 1.0.
min_samples_splits = np.linspace(0.05, 1.0, num=100, endpoint=True)

train_results, test_results = [], []
for min_samples_split in min_samples_splits:
    model = DecisionTreeClassifier(
        random_state=0, min_samples_split=min_samples_split)
    auc_train, auc_pred = train_test_scores(
        x_train, x_test, y_train, y_test, model)
    train_results += [auc_train]
    test_results += [auc_pred]

# Train vs. test AUC for each min_samples_split value.
fig, ax = plt.subplots()
line1, = ax.plot(min_samples_splits, train_results, 'b', label='Train AUC')
line2, = ax.plot(min_samples_splits, test_results, 'r', label='Test AUC')
ax.legend()
ax.set_xlabel('min samples splits')
ax.set_ylabel('AUC score')
plt.show()

In [None]:
# Sweep min_samples_leaf over 100 evenly spaced float values from 0.05 to 0.5.
min_samples_leafs = np.linspace(0.05, 0.5, num=100, endpoint=True)

train_results, test_results = [], []
for min_samples_leaf in min_samples_leafs:
    model = DecisionTreeClassifier(
        random_state=0, min_samples_leaf=min_samples_leaf)
    auc_train, auc_pred = train_test_scores(
        x_train, x_test, y_train, y_test, model)
    train_results += [auc_train]
    test_results += [auc_pred]

# Train vs. test AUC for each min_samples_leaf value.
fig, ax = plt.subplots()
line1, = ax.plot(min_samples_leafs, train_results, 'b', label='Train AUC')
line2, = ax.plot(min_samples_leafs, test_results, 'r', label='Test AUC')
ax.legend()
ax.set_xlabel('min samples leaf')
ax.set_ylabel('AUC score')
plt.show()

In [None]:
# Sweep min_impurity_decrease over 100 evenly spaced values from 0.0 to 0.5.
min_impurity_decrease_list = np.linspace(0.0, 0.5, num=100, endpoint=True)

train_results, test_results = [], []
for min_impurity_decrease in min_impurity_decrease_list:
    model = DecisionTreeClassifier(
        random_state=0, min_impurity_decrease=min_impurity_decrease)
    auc_train, auc_pred = train_test_scores(
        x_train, x_test, y_train, y_test, model)
    train_results += [auc_train]
    test_results += [auc_pred]

# Train vs. test AUC for each min_impurity_decrease value.
fig, ax = plt.subplots()
line1, = ax.plot(min_impurity_decrease_list, train_results, 'b', label='Train AUC')
line2, = ax.plot(min_impurity_decrease_list, test_results, 'r', label='Test AUC')
ax.legend()
ax.set_xlabel('min impurity decrease')
ax.set_ylabel('AUC score')
plt.show()

In [None]:
# Grid over split criterion, depth and the two minimum-sample constraints.
parameters = {'criterion': ('gini', 'entropy'),
              'max_depth': range(1,17),
              'min_samples_split': np.linspace(0.05, 0.4, num=10, endpoint=True),
              'min_samples_leaf': np.linspace(0.05, 0.2, num=10, endpoint=True),
              }
model_DecTreeClass = DecisionTreeClassifier(random_state=0)
# Select hyperparameters by cross-validated ROC AUC so the search optimises the
# same metric that is reported below. Without `scoring`, GridSearchCV falls
# back to the estimator's default score (accuracy for classifiers).
model_param = GridSearchCV(model_DecTreeClass, parameters, scoring='roc_auc', cv=5)

model_param.fit(x_train, y_train)
y_pred_param = model_param.predict(x_test)
# roc_curve needs continuous scores to sweep thresholds; use the refitted best
# estimator's predicted probability of the second class rather than hard labels.
y_score_param = model_param.predict_proba(x_test)[:, 1]

print('Model parameters: ', model_param.best_params_)
print('-----------------------')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_param)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC with parameter tuning: %.2f' % (roc_auc*100), "%")

In [None]:
# Reduced grid: only the split criterion and the maximum depth are tuned.
parameters = {'criterion': ('gini', 'entropy'),
              'max_depth': range(1,17)
              }
model_DecTreeClass = DecisionTreeClassifier(random_state=0)
# Select hyperparameters by cross-validated ROC AUC so the search optimises the
# same metric that is reported below. Without `scoring`, GridSearchCV falls
# back to the estimator's default score (accuracy for classifiers).
model_param = GridSearchCV(model_DecTreeClass, parameters, scoring='roc_auc', cv=5)

model_param.fit(x_train, y_train)
y_pred_param = model_param.predict(x_test)
# roc_curve needs continuous scores to sweep thresholds; use the refitted best
# estimator's predicted probability of the second class rather than hard labels.
y_score_param = model_param.predict_proba(x_test)[:, 1]

print('Model parameters: ', model_param.best_params_)
print('-----------------------')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score_param)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('AUC with parameter tuning: %.2f' % (roc_auc*100), "%")