In [None]:
import numpy as np
import pandas as pd

import warnings
# NOTE(review): this filter silences every Python warning raised after this
# point, including deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Show up to 25 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 25)

In [None]:
# Load the churn dataset from CSV, skipping the first column (the client
# number) and the last two columns.

csv_path = "../input/credit-card-customers/BankChurners.csv"

# A one-row read is enough to learn the column names.
headers = pd.read_csv(csv_path, nrows=1)
kept_columns = headers.columns[1:-2]

df1 = pd.read_csv(csv_path, usecols=kept_columns)
df1.head()

In [None]:
# Inspect column dtypes and non-null counts to spot missing values.

df1.info()

> No missing values in the dataset. There are 6 categorical and 14 numeric features.

In [None]:
# Descriptive statistics for every column (numeric and categorical), one row
# per feature, ordered by the "unique" statistic.

summary_stats = df1.describe(include="all").transpose()
summary_stats.sort_values(by="unique")

> **Some observations from the table:**
> 
> * The majority of clients are married.
> * Almost all clients hold a Blue card (~93%).
> * The most common Income Category is "Less than $40K".
> * The average customer age is 46; the minimum age is 26 and the maximum is 73.
> * Credit card limits range from about 1,438 to 34,516.
> * The minimum Avg_Open_To_Buy is 3 while the maximum is 34,516. Such a wide gap between the minimum and maximum values suggests the presence of outliers.

In [None]:
# Derive a "Churn" flag from Attrition_Flag ("1" for attrited customers,
# "0" otherwise), then remove the original Attrition_Flag column.

is_attrited = df1['Attrition_Flag'] == 'Attrited Customer'
df1['Churn'] = np.where(is_attrited, "1", "0")
df1 = df1.drop(columns="Attrition_Flag")
df1.head()

# Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the "tableau-colorblind10" matplotlib style sheet to the figures drawn below.
plt.style.use("tableau-colorblind10") 

In [None]:
# Partition the columns of df1 by dtype: object columns are treated as
# categorical, every other column as numerical.

cat_col = df1.select_dtypes(include="object").columns.tolist()
num_col = df1.select_dtypes(exclude="object").columns.tolist()

print("Categorical Features:", cat_col, sep="\n\n")
print("")
print("Numerical Features:", num_col, sep="\n\n")

# Categorical Features

In [None]:
def countplot_categorical(cats,data):
    """Draw one annotated count plot per categorical column on a 2x3 grid.

    Parameters
    ----------
    cats : iterable of str
        Column names of ``data`` to plot. They are paired with the six axes
        of the grid through ``zip``, so names beyond the sixth are not drawn.
    data : pandas.DataFrame
        Frame holding the columns listed in ``cats``.

    Every bar is annotated with its share (in percent) of all rows in
    ``data``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(22, 16))
    axes = axes.flatten()
    fig.suptitle('Categorical Features Distributions', fontsize=30)

    # All columns share the frame's length, so the denominator used for the
    # percentages does not change between subplots.
    total = float(len(data))

    for ax, cat in zip(axes, cats):
        # Pass x/data by keyword: newer seaborn releases interpret the first
        # positional argument as `data`, not as the vector to count.
        sns.countplot(x=cat, data=data, palette='rocket', ax=ax)

        for p in ax.patches:
            height = p.get_height()
            ax.text(p.get_x() + p.get_width() / 2.,
                    height + 10,
                    '{:1.2f}%'.format((height / total) * 100),
                    ha="center")

        # Rotate the existing tick labels without re-assigning their text.
        ax.tick_params(axis='x', labelrotation=20)
        # Label this subplot explicitly; plt.ylabel would only affect the
        # "current" axes instead of every panel.
        ax.set_ylabel('Count', fontsize=15, weight='bold')

In [None]:
 # Plot the distribution of every categorical column of df1.
 countplot_categorical(cat_col, df1)

In [None]:
def countplot_churn(cats, data):
    """Draw count plots of categorical columns split by churn on a 2x3 grid.

    Parameters
    ----------
    cats : iterable of str
        Column names of ``data`` to plot. They are paired with the six axes
        of the grid through ``zip``, so names beyond the sixth are not drawn.
        The ``'Churn'`` column itself is drawn as a plain count plot.
    data : pandas.DataFrame
        Frame holding the columns listed in ``cats`` and a two-level
        ``'Churn'`` column whose lower value means "no churn".
    """
    fig, axes = plt.subplots(2, 3, figsize=(22, 16))
    axes = axes.flatten()
    fig.suptitle('Categorical Features Distributions by Churn',fontsize=30)

    # Pin the order of the churn levels so the 'No'/'Yes' legend labels are
    # always attached to the lower/higher level, regardless of which value
    # happens to appear first in the data.
    churn_levels = sorted(data['Churn'].unique())

    for ax, cat in zip(axes, cats):
        if cat == 'Churn':
            # Pass x/data by keyword: newer seaborn releases interpret the
            # first positional argument as `data`.
            sns.countplot(x=cat,
                          data=data,
                          order=churn_levels,
                          palette='rocket',
                          ax=ax)
        else:
            sns.countplot(x=cat,
                          data=data,
                          hue='Churn',
                          hue_order=churn_levels,
                          palette='rocket',
                          ax=ax)
            # Reuse the handles seaborn created (one per churn level, in
            # hue_order) and only replace their text.
            handles, _ = ax.get_legend_handles_labels()
            ax.legend(handles,
                      ['No', 'Yes'],
                      title='Churn',
                      loc='upper right')

        # Rotate the existing tick labels without re-assigning their text.
        ax.tick_params(axis='x', labelrotation=20)
        # Label this subplot explicitly; plt.ylabel would only affect the
        # "current" axes instead of every panel.
        ax.set_ylabel('Count', fontsize=15, weight='bold')

In [None]:
# Plot every categorical column of df1, with bars split by the Churn flag.
countplot_churn(cat_col, df1)

> Some observations from the charts:

> * Education Level and Income Category will be transformed into ordinal variables.
> * The other categorical variables are nominal, so I can use "get_dummies".
> * The churn rate looks higher for customers with "Single" marital status than for the other marital statuses.
> * A churn ratio of about 16% means that the dataset is imbalanced, so I have to use oversampling or undersampling methods to get better model performance.

# Feature Engineering for Categorical Variables

In [None]:
# Work on a fresh copy so df1 stays untouched by the transformations below.

df2 = df1.copy()

# Ordinal encodings: categories are ranked from lowest to highest, with
# "Unknown" placed at 0.

education_order = {"Unknown": 0,
                   "Uneducated": 1,
                   "High School": 2,
                   "College": 3,
                   "Graduate": 4,
                   "Post-Graduate": 5,
                   "Doctorate": 6}

income_order = {"Unknown": 0,
                "Less than $40K": 1,
                "$40K - $60K": 2,
                "$60K - $80K": 3,
                "$80K - $120K": 4,
                "$120K +": 5}

for source_col, mapping in (("Education_Level", education_order),
                            ("Income_Category", income_order)):
    # Fail loudly on a category that has no rank instead of letting a stray
    # string slip into the encoded column.
    unmapped = set(df2[source_col].unique()) - set(mapping)
    if unmapped:
        raise ValueError("Unmapped categories in {}: {}".format(
            source_col, sorted(map(str, unmapped))))

    # map + astype yields a true int64 column; Series.replace only does so
    # through an implicit object->int downcast that pandas has deprecated.
    df2[source_col + "_Ord"] = df2[source_col].map(mapping).astype("int64")

df2 = df2.drop(columns=["Education_Level", "Income_Category"])

# Churn was stored as the strings "0"/"1"; convert it to integers.
df2["Churn"] = df2["Churn"].astype('int64')

# Numerical Features

In [None]:
def kdeplot_churn(nums, data):
    """Overlay KDE curves of churned vs. retained customers per feature.

    Parameters
    ----------
    nums : iterable of str
        Numeric column names of ``data`` to plot. They are paired with the
        14 axes of the 7x2 grid through ``zip``, so names beyond the 14th
        are not drawn.
    data : pandas.DataFrame
        Frame holding the columns listed in ``nums`` and a ``'Churn'``
        column encoded as 0 (no churn) / 1 (churn).
    """
    fig, axes = plt.subplots(7, 2, figsize=(25, 18))
    axes = axes.flatten()
    fig.suptitle('Numerical Features Distributions by Churn',fontsize=30)

    # Row masks are computed once from the frame that was passed in (the
    # function previously read the global df2 and ignored `data`).
    retained = data["Churn"] == 0
    churned = data["Churn"] == 1

    for ax, num in zip(axes, nums):
        # kdeplot replaces the deprecated distplot(hist=False, kde=True).
        sns.kdeplot(x=data.loc[retained, num], color="Blue", ax=ax, label="No")
        sns.kdeplot(x=data.loc[churned, num], color="Red", ax=ax, label="Yes")
        ax.legend(title='Churn',loc='upper right')
        # Rotate the tick labels without fixing their text, so the numeric
        # tick formatter keeps working.
        ax.tick_params(axis='x', labelrotation=20)

In [None]:
# Compare the distribution of every numeric feature between churned and retained customers.
kdeplot_churn(num_col,df2)

In [None]:
def boxplot_churn(nums, data):
    """Draw a vertical box plot for each numeric column on a 7x2 grid.

    ``nums`` lists the column names of ``data`` to plot; they are paired
    with the grid's axes through ``zip``, so at most 14 columns are drawn.
    """
    fig, grid = plt.subplots(7, 2, figsize=(14, 22))
    fig.suptitle('Box Plots for Numerical Features', fontsize=30)

    panels = grid.flatten()
    for panel, feature in zip(panels, nums):
        sns.boxplot(y=feature, data=data, ax=panel, color='#e74c3c')

In [None]:
# Box plots of every numeric feature in df2, used to look for outliers.
boxplot_churn(num_col,df2)

> Some observations from the charts:

> * Features whose distributions are similar for churned and non-churned customers, such as "Customer age" and "Months on book", will be removed.
> * Customers who have a lower "Total_Trans_Ct" tend to churn.
> * Customers who have a lower "Total_Trans_Amt" tend to churn.
> * The churn rate decreases once Total Relationship Count goes above 3.
> * Total_Amt_Chng_Q4_Q1, Total_Trans_Amt and Total_Ct_Chng_Q4_Q1 have lots of outliers, so I'll transform them into categorical variables using the qcut function (quantile-based discretization).

# Feature Engineering for Numerical Variables

In [None]:
# Work on a fresh copy so df2 stays untouched by the transformations below.

df3 = df2.copy()

# Discretization: bin each outlier-heavy feature into quartiles and store
# the result in a new "<feature>_qcut" column.
quartile_features = ["Total_Amt_Chng_Q4_Q1", "Total_Trans_Amt", "Total_Ct_Chng_Q4_Q1"]
for feature in quartile_features:
    df3[feature + "_qcut"] = pd.qcut(df3[feature], 4)

# Remove the raw versions of the binned features together with the features
# that are not carried into the model.
unused_features = ["Customer_Age", "Months_on_book",
                   "Credit_Limit", "Avg_Open_To_Buy"]
df3 = df3.drop(columns=unused_features + quartile_features)


In [None]:
# One-hot encode df3 with pandas' default column selection (object and
# category columns); numeric columns are passed through unchanged.
df4 = pd.get_dummies(df3)

df4.head()

In [None]:
# Single-column heatmap: correlation of every feature with Churn, sorted by
# value.
corr_with_churn = df4.corr()[["Churn"]].sort_values("Churn")

fig, ax = plt.subplots(figsize=(10, 10))
fig.suptitle('Correlation between Churn and features', fontsize=20)
ax = sns.heatmap(corr_with_churn,
                 vmax=1,
                 vmin=-1,
                 cmap="YlGnBu",
                 annot=True,
                 ax=ax)
# Flip the y axis so the order of the rows is reversed on screen.
ax.invert_yaxis()

In [None]:
# Remove the features whose correlation with Churn lies strictly between
# -threshold and +threshold.
# NOTE(review): the correlations are computed on the full dataset, i.e.
# before any train/test split.

threshold = 0.01
df5 = df4.copy()

churn_corr = df5.corr()[["Churn"]].sort_values("Churn")
is_weak = churn_corr["Churn"].abs() < threshold
corr_drop = churn_corr.loc[is_weak].index.tolist()
df5 = df5.drop(columns=corr_drop)

df5.info()

# Modelling

In [None]:
from sklearn.model_selection import train_test_split,cross_val_predict,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,roc_auc_score,roc_curve
from imblearn.pipeline import Pipeline as imbPipe
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Separate the target from the features, then hold out a stratified 20%
# test set with a fixed seed.
y = df5["Churn"]
X = df5.drop(columns="Churn")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Training Hard Voting Classifier

In [None]:
# Base learners of the ensemble. Every estimator that accepts a seed gets
# one, so the cross-validation report is reproducible.
dct = DecisionTreeClassifier(random_state=42)
sgd = SGDClassifier(random_state=42)
log = LogisticRegression(random_state=42)
svm_rbf = SVC(kernel="rbf", random_state=42)
svm_lin = LinearSVC(loss="hinge", random_state=42)
knn = KNeighborsClassifier()

kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# imblearn's Pipeline applies the SMOTE step only while fitting, so each
# cross-validation fold is oversampled on its training part only.
Voting_pipeline = imbPipe([
    ("scaler", StandardScaler()),
    # SMOTE's `n_jobs` argument is deprecated in imbalanced-learn and
    # rejected by recent releases, so it is no longer passed.
    ("smote", SMOTE(random_state=42)),
    ("voting", VotingClassifier(estimators=[("dct", dct),
                                            ("sgd", sgd),
                                            ("svm_rbf", svm_rbf),
                                            ("svm_lin", svm_lin),
                                            ("knn", knn),
                                            ("log", log)],
                                voting="hard",
                                n_jobs=-1)),
])

# Out-of-fold predictions on the training set give a cross-validated report.
y_pred = cross_val_predict(Voting_pipeline, X_train, y_train, cv=kfold)
print(classification_report(y_train, y_pred))


# Test Results

In [None]:
# Refit the pipeline on the whole training set and report its performance on
# the held-out test set.
y_pred = Voting_pipeline.fit(X_train, y_train).predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
# NOTE(review): y_pred holds the hard class labels returned by predict(),
# not continuous scores, so the curve below is built from labels only.
roc_auc = roc_auc_score(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(fpr, tpr, color='darkorange', label='ROC curve (AUC = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve')
ax.legend(loc="lower right")
plt.show()