In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from datetime import date

from warnings import filterwarnings
filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

## 1. Reading Dataset

In [None]:
# Load the churn table, report its (rows, columns) shape and preview the first rows.
csv_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
data = pd.read_csv(csv_path)
print(data.shape)
data.head()

**Remove the unnecessary columns**

In [None]:
# Keep only the columns at positions 1..20: the first column and anything from
# position 21 onward are discarded.
# NOTE(review): this rebinds `data` using positional slicing, so the cell is not
# idempotent — running it a second time slices the already-trimmed frame and
# silently drops its new first column. Restart & Run All before trusting results.
data = data.iloc[:,1:21]
data.info()

In [None]:
# Summary statistics (describe() with default arguments).
data.describe()

In [None]:
# Select numeric columns by dtype family instead of "anything that is not object":
# the negative test would also count bool, datetime, category or string-dtype
# columns as numerical.
numerical = data.select_dtypes(include='number').columns.tolist()
print('There are {} numerical variables'.format(len(numerical)))
print('The numerical variables are :', numerical)

In [None]:
# Columns stored with object dtype are treated as the categorical variables.
categorical = []
for col in data.columns:
    if data[col].dtype == 'O':
        categorical.append(col)

n_categorical = len(categorical)
print('There are {} categorical variables'.format(n_categorical))
print('The categorical variables are :', categorical)

In [None]:
# For each categorical column show the absolute label counts, then the same
# counts as a share of all rows.
for var in categorical:
    counts = data[var].value_counts()  # computed once, reused for the ratio
    print(counts)
    # `np.float` was removed in NumPy 1.24; true division by the plain int row
    # count already yields floats.
    print(counts / len(data))
    print()

**Drop the rows that contain 'Unknown' values**

In [None]:
# Treat the literal 'Unknown' as missing, then keep only fully populated rows.
data = data.replace({'Unknown': np.nan}).dropna()
print(data.shape)

In [None]:
# Report how many distinct labels each categorical column holds.
for var in categorical:
    n_labels = data[var].nunique(dropna=False)
    print(var, ' contains ', n_labels, ' labels')

In [None]:
# EDA works on a copy of `data` whose target is recoded to integers
# (existing customer -> 0, attrited customer -> 1).
target_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
df = data.copy()
df['Attrition_Flag'] = df['Attrition_Flag'].replace(target_codes).astype('int64')

## 2. Exploratory Data Analysis

#### Pie chart for target

In [None]:
# Number of rows per target label.
data['Attrition_Flag'].value_counts()

In [None]:
# Pie chart of the target label shares; the second slice is pulled out slightly.
colors = ('#DD7596', '#8EB897')
explode = [0, 0.1]

flag_counts = data['Attrition_Flag'].value_counts()
ax = flag_counts.plot.pie(
    shadow=True,
    explode=explode,
    colors=colors,
    autopct='%.2f',
    figsize=(8, 6),
)
ax.set_title('Ratio of customer')
plt.show()

#### Correlation between numerical features

In [None]:
# `df` still contains object (text) columns at this point. Since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises instead, so restrict
# the frame to numeric columns explicitly (this works on older pandas as well).
numeric_corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Correlation between variables')
sns.heatmap(numeric_corr, annot=True, square=True, cmap='YlGnBu', ax=ax)

In [None]:
# Correlation of every numeric column with the target, strongest positive first.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 raises if corr()
# is called on a frame that still holds text columns.
numcorr = df.select_dtypes(include='number').corr()
Num = numcorr['Attrition_Flag'].sort_values(ascending=False).to_frame()
cm = sns.color_palette('YlGnBu', as_cmap=True)
s = Num.style.background_gradient(cmap=cm)
s

#### Univariate Distribution and Bivariate Distribution

In [None]:
# Distribution of Total_Trans_Ct, with a rug marking the individual observations.
sns.displot(data['Total_Trans_Ct'], rug=True)
plt.show()

# Distribution of Total_Ct_Chng_Q4_Q1.
sns.displot(data['Total_Ct_Chng_Q4_Q1'])
plt.show()

In [None]:
# Joint plot of Total_Trans_Ct against Total_Ct_Chng_Q4_Q1, coloured by the target label.
sns.jointplot(x=data['Total_Trans_Ct'],y=data['Total_Ct_Chng_Q4_Q1'],hue=data['Attrition_Flag'])

#### Visualization for Categorical Variables

* Attrition_Flag  contains  2  labels
* Gender  contains  2  labels
* Education_Level  contains  6  labels
* Marital_Status  contains  3  labels
* Income_Category  contains  5  labels
* Card_Category  contains  4  labels

In [None]:
# Gender counts, split by the 0/1 attrition flag stored in `df`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Gender colored by customers')
sns.countplot(data=df, x='Gender', hue='Attrition_Flag', ax=ax)

In [None]:
# Education level counts, split by the 0/1 attrition flag stored in `df`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Education Level colored by customers')
sns.countplot(data=df, x='Education_Level', hue='Attrition_Flag', ax=ax)

In [None]:
# Marital status counts, split by the 0/1 attrition flag stored in `df`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Marital Status colored by customers')
sns.countplot(data=df, x='Marital_Status', hue='Attrition_Flag', ax=ax)

In [None]:
# Income category counts, split by the 0/1 attrition flag stored in `df`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Income Category colored by customers')
sns.countplot(data=df, x='Income_Category', hue='Attrition_Flag', ax=ax)

In [None]:
# Card category counts, split by the 0/1 attrition flag stored in `df`.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title('count of Card Category colored by customers')
sns.countplot(data=df, x='Card_Category', hue='Attrition_Flag', ax=ax)

## 3. Data Preprocessing

#### Label Encoding

In [None]:
from sklearn import preprocessing

# Encode the target with the same explicit mapping the EDA copy `df` uses
# (existing -> 0, attrited -> 1). LabelEncoder assigns codes in sorted label
# order, which would make 'Attrited Customer' 0 and 'Existing Customer' 1, so
# recall / precision / F1 (which score label 1) would describe the customers
# who stayed rather than the ones who churned.
# .replace leaves already-encoded 0/1 values untouched, so re-running is safe.
data['Attrition_Flag'] = (
    data['Attrition_Flag']
    .replace({'Existing Customer': 0, 'Attrited Customer': 1})
    .astype('int64')
)

# NOTE(review): LabelEncoder numbers labels alphabetically; for Income_Category
# and Card_Category that is presumably not their natural low-to-high order —
# confirm and switch to an explicit ordinal mapping if the order matters.
label_encoder = preprocessing.LabelEncoder()
for column in ('Gender', 'Income_Category', 'Card_Category'):
    data[column] = label_encoder.fit_transform(data[column])

data.head()

#### Dummy Variable Encoding

In [None]:
# Show the distinct labels of the two columns that are one-hot encoded next.
for column in ('Education_Level', 'Marital_Status'):
    print(data[column].unique())

In [None]:
# One-hot encode Education_Level and Marital_Status; the generated columns are
# prefixed 'Education' and 'Status' respectively, and the source columns are replaced.
data = pd.get_dummies(data,columns=['Education_Level','Marital_Status'],prefix=['Education','Status'])
data.head()

## 4. Feature Engineering

#### (A) Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate predictors from the target.
features = data.drop('Attrition_Flag',axis=1)
target = data['Attrition_Flag']

# Standardise every predictor to zero mean / unit variance.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features)

# Build a fresh float frame instead of `features.loc[:,:] = scaled_values`:
# writing floats in place into the existing integer / dummy columns is an
# incompatible-dtype assignment that recent pandas versions deprecate.
features = pd.DataFrame(scaled_values, columns=features.columns, index=features.index)
features.head()

#### (B) Balance the target

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE (seeded for reproducibility); X / y hold the resampled
# predictors and target.
# NOTE(review): the resampling runs on the full dataset, before the train/test
# split performed further down, so synthetic rows can end up in the test set and
# rows used to synthesise training points can be tested on. Consider resampling
# only the training fold.
sm = SMOTE(random_state=0)
X,y = sm.fit_resample(features,target)

In [None]:
from sklearn.model_selection import train_test_split

# Split the real, un-resampled rows first (stratified so both folds keep the
# original class ratio). Splitting the SMOTE output instead would put synthetic
# rows in the test set and leak test information into training, inflating
# every reported score.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0, stratify=target
)

# Balance only the training fold; the test fold stays untouched real data.
X_train, y_train = sm.fit_resample(X_train_raw, y_train_raw)

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

## 5. Building Model

In [None]:
from sklearn.metrics import accuracy_score,recall_score,precision_score
from sklearn.metrics import f1_score,confusion_matrix,roc_auc_score

def evaluation(y_test, y_pred):
    """Print and return the main classification metrics, rounded to 3 decimals.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels; also passed unchanged to ``roc_auc_score``.

    Returns
    -------
    dict
        Keys 'accuracy', 'recall', 'F1 score', 'auc score' and 'precision'.
        The same dictionary is printed, as before; the previous
        ``return print(...)`` always handed callers ``None``.
    """
    metric_dict = {
        'accuracy': round(accuracy_score(y_test, y_pred), 3),
        'recall': round(recall_score(y_test, y_pred), 3),
        'F1 score': round(f1_score(y_test, y_pred), 3),
        'auc score': round(roc_auc_score(y_test, y_pred), 3),
        'precision': round(precision_score(y_test, y_pred), 3),
    }
    print(metric_dict)
    return metric_dict

In [None]:
# Empty score table; one row per model is added with these four columns.
Results = pd.DataFrame(
    {column: [] for column in ('Model', 'Accuracy Score', 'Recall', 'F1score')}
)

#### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded so the fitted tree and its scores are reproducible between runs.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)
y_pred1 = tree.predict(X_test)

# Metrics take (y_true, y_pred) in that order.
res = pd.DataFrame({"Model":['DecisionTreeClassifier'],
                    "Accuracy Score": [accuracy_score(y_test,y_pred1)],
                    "Recall": [recall_score(y_test,y_pred1)],
                    "F1score": [f1_score(y_test,y_pred1)]})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it. Dropping any
# earlier row for this model first keeps the cell idempotent when re-run.
Results = pd.concat([Results[~Results['Model'].isin(res['Model'])], res], ignore_index=True)

In [None]:
# Confusion table for the decision tree: true labels in rows, predictions in columns.
pd.crosstab(
    index=y_test,
    columns=y_pred1,
    rownames=['Real data'],
    colnames=['Predicted'],
)

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded so the fitted forest and its scores are reproducible between runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train,y_train)
y_pred2 = rfc.predict(X_test)

# Metrics take (y_true, y_pred) in that order.
res = pd.DataFrame({"Model":['RandomForestClassifier'],
                    "Accuracy Score": [accuracy_score(y_test,y_pred2)],
                    "Recall": [recall_score(y_test,y_pred2)],
                    "F1score": [f1_score(y_test,y_pred2)]})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it. Dropping any
# earlier row for this model first keeps the cell idempotent when re-run.
Results = pd.concat([Results[~Results['Model'].isin(res['Model'])], res], ignore_index=True)

In [None]:
# Confusion table for the random forest: true labels in rows, predictions in columns.
pd.crosstab(
    index=y_test,
    columns=y_pred2,
    rownames=['Real data'],
    colnames=['Predicted'],
)

#### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred3 = knn.predict(X_test)

# Metrics take (y_true, y_pred) in that order.
res = pd.DataFrame({"Model":['KNeighborsClassifier'],
                    "Accuracy Score": [accuracy_score(y_test,y_pred3)],
                    "Recall": [recall_score(y_test,y_pred3)],
                    "F1score": [f1_score(y_test,y_pred3)]})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it. Dropping any
# earlier row for this model first keeps the cell idempotent when re-run.
Results = pd.concat([Results[~Results['Model'].isin(res['Model'])], res], ignore_index=True)

In [None]:
# Confusion table for KNN: true labels in rows, predictions in columns.
pd.crosstab(
    index=y_test,
    columns=y_pred3,
    rownames=['Real data'],
    colnames=['Predicted'],
)

#### SVC

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train,y_train)
y_pred4 = svc.predict(X_test)

# Metrics take (y_true, y_pred) in that order.
res = pd.DataFrame({"Model":['SVC'],
                    "Accuracy Score": [accuracy_score(y_test,y_pred4)],
                    "Recall": [recall_score(y_test,y_pred4)],
                    "F1score": [f1_score(y_test,y_pred4)]})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it. Dropping any
# earlier row for this model first keeps the cell idempotent when re-run.
Results = pd.concat([Results[~Results['Model'].isin(res['Model'])], res], ignore_index=True)

In [None]:
# Confusion table for the SVC: true labels in rows, predictions in columns.
pd.crosstab(
    index=y_test,
    columns=y_pred4,
    rownames=['Real data'],
    colnames=['Predicted'],
)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred5 = lr.predict(X_test)

# Metrics take (y_true, y_pred) in that order.
res = pd.DataFrame({"Model":['LogisticRegression'],
                    "Accuracy Score": [accuracy_score(y_test,y_pred5)],
                    "Recall": [recall_score(y_test,y_pred5)],
                    "F1score": [f1_score(y_test,y_pred5)]})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it. Dropping any
# earlier row for this model first keeps the cell idempotent when re-run.
Results = pd.concat([Results[~Results['Model'].isin(res['Model'])], res], ignore_index=True)

In [None]:
# Confusion table for logistic regression: true labels in rows, predictions in columns.
pd.crosstab(
    index=y_test,
    columns=y_pred5,
    rownames=['Real data'],
    colnames=['Predicted'],
)

#### XGB Classifier

In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train,y_train)
y_pred6 = xgb.predict(X_test)

# Metrics take (y_true, y_pred) in that order.
res = pd.DataFrame({"Model":['XGBClassifier'],
                    "Accuracy Score": [accuracy_score(y_test,y_pred6)],
                    "Recall": [recall_score(y_test,y_pred6)],
                    "F1score": [f1_score(y_test,y_pred6)]})

# DataFrame.append was removed in pandas 2.0; pd.concat replaces it. Dropping any
# earlier row for this model first keeps the cell idempotent when re-run.
Results = pd.concat([Results[~Results['Model'].isin(res['Model'])], res], ignore_index=True)

In [None]:
# Confusion table for XGBoost: true labels in rows, predictions in columns.
pd.crosstab(
    index=y_test,
    columns=y_pred6,
    rownames=['Real data'],
    colnames=['Predicted'],
)

In [None]:
# Score table ordered from lowest to highest accuracy (best model on the last row).
Results.sort_values('Accuracy Score')

As we can see, **XGBClassifier** gives the best results.

## 6. Model Evaluation

In [None]:
from sklearn.metrics import classification_report

try:
    # Function-style plotting helpers, available up to scikit-learn 1.1.
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
    # Both helpers were removed in scikit-learn 1.2. The Display classes'
    # from_estimator constructors accept the same (estimator, X, y, **kwargs)
    # call, so bind them to the old names to keep existing calls working.
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator

### XGB Classifier

In [None]:
# Refit XGBoost with an explicit binary-logistic objective and log-loss as the
# evaluation metric, then predict the held-out rows.
xgb_params = {'objective': 'binary:logistic', 'eval_metric': ['logloss']}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Print a header line, then the metric dictionary for `y_pred` against `y_test`.
print(" Best evaluation parameters achieved with XGBClassifier:") 
evaluation(y_test,y_pred)

In [None]:
# Per-class precision / recall / F1 text report for `y_pred` against `y_test`.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion-matrix plot for `model` evaluated on X_test / y_test.
plot_confusion_matrix(model,X_test,y_test,cmap='YlOrBr')

In [None]:
# ROC curve for `model` evaluated on X_test / y_test.
plot_roc_curve(model,X_test,y_test)

#### Feature Importance

In [None]:
# One-column table of the model's importance scores, indexed by feature name.
feat_importance = pd.DataFrame(
    {'Score': model.feature_importances_},
    index=features.columns,
)

In [None]:
# Importance scores from highest to lowest, shaded with a colour gradient.
ranked_importance = feat_importance.sort_values(by='Score', ascending=False)
ranked_importance.style.background_gradient(cmap='OrRd')

In [None]:
# Horizontal bar chart of the importance score of every feature.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Feature Importances')
sns.barplot(x=feat_importance['Score'], y=feat_importance.index, ax=ax)

**Conclusion:**

I got a maximum accuracy score of 0.982 with **XGBClassifier**.