<a href="https://colab.research.google.com/github/Bell993/BankChurners/blob/main/BankChurners_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Introduction

This notebook explores the BankChurners credit-card customer dataset using several statistical techniques and machine-learning models. Please leave a comment about what could be improved and what you liked. Thanks!

# Work in Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

import seaborn as sns
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Read the Kaggle "credit-card-customers" CSV and keep every column except the
# last two (presumably auxiliary columns shipped with the file -- verify).
df = pd.read_csv('../input/credit-card-customers/BankChurners.csv').iloc[:, :-2]


In [None]:
# Summarise the loaded frame (columns, non-null counts, dtypes).
df.info()

# Data Cleaning 

In [None]:
def run_int(df,col):
  """Return a copy of ``df`` with values substituted according to ``col``.

  Despite its name, ``col`` is the replacement mapping handed straight to
  ``DataFrame.replace``; it is applied to the whole frame, not to a single
  column. The input frame is left untouched.
  """
  return df.replace(col)

In [None]:

# Integer codes for the string-valued columns.
Gender = {'M': 1, 'F': 0}
Attrition_Flag = {'Existing Customer': 1, 'Attrited Customer': 0}
# Income bands in ascending order with one distinct code per band, so the two
# highest bands are no longer merged into the same value. 'Unknown' is coded 0.
Income_Category = {
    'Unknown': 0,
    'Less than $40K': 1,
    '$40K - $60K': 2,
    '$60K - $80K': 3,
    '$80K - $120K': 4,
    '$120K +': 5,
}

# run_int() applies DataFrame.replace to the whole frame, so every mapping
# affects any column that contains one of its keys, not just its namesake.
df = run_int(df, Gender)
df = run_int(df, Attrition_Flag)
df = run_int(df, Income_Category)
df.info()

In [None]:
# Ordinal codes for education, lowest to highest. Every level has its own code,
# so 'Uneducated' and 'Post-Graduate' no longer collide. 'Unknown' is coded 0.
Education_Level = {
    'Unknown': 0,
    'Uneducated': 1,
    'High School': 2,
    'College': 3,
    'Graduate': 4,
    'Post-Graduate': 5,
    'Doctorate': 6,
}
# Marital status is nominal; the codes are arbitrary labels. 'Unknown' is coded 0.
Marital_Status = {'Unknown': 0, 'Married': 1, 'Single': 2, 'Divorced': 3}
# Card tiers in ascending order, so rank-based statistics see Platinum as the top tier.
Card_Category = {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}

In [None]:
# Build `example`: the frame with the remaining categorical mappings applied
# one after another, in the same order as before.
example = df
for mapping in (Education_Level, Marital_Status, Card_Category):
    example = run_int(example, mapping)
example.info()

# Feature selection


## Shapiro-Wilk 
Test for normality


p > alpha : Sample looks Gaussian

p < alpha : Sample does not look Gaussian

In [None]:
from scipy.stats import shapiro

result = df.copy()
alpha = 0.05
# Run the Shapiro-Wilk test on every numeric column and report the verdict
# together with the column name. Non-numeric columns are skipped.
# NOTE(review): SciPy warns that the p-value may be inaccurate for large
# samples (N > 5000) -- confirm against the size of this frame.
for col in result.columns:
    if not pd.api.types.is_numeric_dtype(result[col]):
        continue
    p_value = shapiro(result[col])[1]
    if p_value > alpha:
        print(f'{col}: Sample looks Gaussian (fail to reject H0)')
    else:
        print(f'{col}: Sample does not look Gaussian (reject H0)')

## Spearman correlation 
This is a nonparametric (rank-based) method, so checking the distribution for normality is not required.

In [None]:
# Annotated heatmap of the Spearman correlation matrix of the encoded frame.
fig, ax = plt.subplots(figsize=(16, 12))
spearman_corr = example.corr(method='spearman')
sns.heatmap(spearman_corr, annot=True, fmt='.1g', ax=ax)
plt.show()

# Data visualization

## Attrition_Flag

In [None]:
# Class balance of the target column.
t = df["Attrition_Flag"].value_counts()
# The flag was integer-encoded earlier in the notebook (1 = existing,
# 0 = attrited); translate the codes back so the wedges carry readable labels.
# Values that are not in the map are shown unchanged.
flag_names = {1: 'Existing Customer', 0: 'Attrited Customer'}
wedge_labels = [flag_names.get(code, code) for code in t.index]
fig1, ax1 = plt.subplots()
ax1.pie(t, labels=wedge_labels, autopct='%1.1f%%', shadow=False)
plt.title("Percentage of customers who left", fontsize=14)
plt.show()


## Income category & Gender

In [None]:
# Histogram (with KDE) of the encoded income category, one panel per gender code.
sns.displot(data=df, x="Income_Category", col="Gender", kde=True)
# Customers per (gender, income category) pair, largest group first; shown as
# the cell output.
income_counts = example.groupby(['Gender'])['Income_Category'].value_counts()
income_counts.sort_values(ascending=False)

## Credit_limit & Age

In [None]:
# Bucket customers by age and compare the credit-limit distribution per bucket.
age_bins = [18, 30, 40, 50, 60, 70, 80]
df1 = df.copy()
df1["age"] = pd.cut(df1.Customer_Age, age_bins)
age_boxplot = sns.boxplot(x=df1['age'], y=df1['Credit_Limit'])
age_boxplot.set_title('The dependence of the credit limit on age')

## Credit_Limit & Avg_Open_To_Buy

The two variables are linearly dependent.

In [None]:
# Scatter plot of the Credit_Limit column against the Avg_Open_To_Buy column.
sns.scatterplot(data=example, x="Credit_Limit", y="Avg_Open_To_Buy")

## Total_Trans_Amt & Total_Trans_Ct
The two variables are linearly dependent.


In [None]:

# Scatter plot of the Total_Trans_Amt column against the Total_Trans_Ct column.
sns.scatterplot(data=example, x="Total_Trans_Amt", y="Total_Trans_Ct")


## Hypotheses 
Testing whether `Avg_Utilization_Ratio` is distributed differently for attrited and existing customers.



In [None]:
from scipy.stats import mannwhitneyu

# Utilisation ratio split by the encoded attrition flag (0 vs 1).
flag = example['Attrition_Flag']
x = example.loc[flag == 0, 'Avg_Utilization_Ratio']
y = example.loc[flag == 1, 'Avg_Utilization_Ratio']

# Mann-Whitney U test on the two samples.
coef, p = mannwhitneyu(x, y)
print('Statistics=%.3f, p=%.3f' % (coef, p))

# Interpret the p-value against the significance level.
alpha = 0.05
verdict = 'Same distribution(fail to reject H0)' if p > alpha else 'Different distribution (reject H0)'
print(verdict)

# Visual check: one histogram (with KDE) per attrition flag value.
sns.displot(data=df, x="Avg_Utilization_Ratio", col="Attrition_Flag", kde=True)

# Data preprocessing

In [None]:
# One-hot encode these three columns, then preview the widened frame.
dummy_columns = ['Card_Category', 'Education_Level', 'Marital_Status']
df = pd.get_dummies(df, columns=dummy_columns)
df.head()

In [None]:
# Remove the two columns named below, then show the resulting schema.
redundant_columns = ['Avg_Open_To_Buy', 'Total_Trans_Ct']
df = df.drop(columns=redundant_columns)
df.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
class preprocessing ():
  """Separate features from the target, standardise them and build a train/test split.

  The features are every column from position 2 onward of the frame passed in;
  the target is the column at position 1.
  """
  def __init__ (self,df):
    self.X = df[df.columns[2:]]
    self.y = df[df.columns[1]]

  def scaler(self):
    """Standardise the full feature matrix and print the resulting shapes.

    Sets ``self.train`` (features as an array), ``self.test`` (target as a
    column vector) and ``self.train_sc`` (standardised features). Despite the
    attribute names, all three cover every row, not one side of a split.

    Returns:
      The ``StandardScaler`` fitted on all rows.
    """
    scaler = StandardScaler()
    self.train = np.array(self.X)
    self.test = np.array(self.y).reshape(-1,1)
    self.train_sc = scaler.fit_transform(self.X)
    print(self.train_sc.shape,self.test.shape)
    return scaler

  def train_test_split(self, test_size=0.3, random_state=42, stratify=True):
    """Split into train/test sets and standardise without test-set leakage.

    The rows are split first; the scaler is then fitted on the training rows
    only and applied to both parts. This method no longer requires
    ``scaler()`` to have been called beforehand.

    Args:
      test_size: fraction of rows held out for testing.
      random_state: seed for the shuffle, so re-runs give the same split.
      stratify: when true, keep the target class proportions in both parts.

    Returns:
      ``X_train, X_test, y_train, y_test`` as arrays.
    """
    target = np.ravel(self.y)
    X_train,X_test,y_train,y_test = train_test_split(
      self.X, target,
      test_size=test_size,
      random_state=random_state,
      stratify=target if stratify else None,
    )
    # Fit on the training part only so test statistics never influence scaling.
    split_scaler = StandardScaler()
    X_train = split_scaler.fit_transform(X_train)
    X_test = split_scaler.transform(X_test)
    return X_train,X_test,y_train,y_test
# Build the preprocessing helper, run its scaling step (which prints the array
# shapes), then obtain the train/test arrays.
proc = preprocessing(df)
scaler = proc.scaler()
(X_train, X_test, y_train, y_test) = proc.train_test_split()

In [None]:
# Display the held-out target values returned by the split.
y_test

# Model selection

## GridSearchCV

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

In [None]:
# XGBoost hyper-parameter grid
param_dist = {
    'n_estimators': list(range(50, 200, 25)),  # number of trees
    'max_depth': list(range(2, 10, 2)),
    'learning_rate': [0.1, 0.01],
    'nthread': [4],
}

In [None]:
# Grid search over `param_dist`, scored by F1, using all available cores.
model = XGBClassifier()
gs = GridSearchCV(estimator=model, param_grid=param_dist, scoring='f1', n_jobs=-1)
result = gs.fit(X_train, y_train)
# Mean test score of every parameter combination tried.
means = result.cv_results_['mean_test_score']


In [None]:
# Feature importances of the best estimator found by the grid search.
result.best_estimator_.feature_importances_

## Feature importance

In [None]:
# Horizontal bar chart of the seven most important features, largest at the top.
fig, ax = plt.subplots(figsize=(10, 10))
importance = pd.Series(result.best_estimator_.feature_importances_, index=df.columns[2:])
top_features = importance.nlargest(7).sort_values(ascending=True)
top_features.plot(kind='barh', ax=ax)

In [None]:
# Horizontal bar chart of the five least important features.
least_important = importance.nsmallest(5)
least_important.plot.barh()

## XGBoost

In [None]:
# Parameter combination that achieved the best score in the grid search.
result.best_params_

In [None]:
def predict (model,X_train, X_test, y_train, y_test):
  """Fit ``model`` on the training split and report how it does on the test split.

  Prints a classification report (classes labelled '0' and '1') and draws a
  confusion matrix normalised over the true labels. Returns nothing.
  """
  model.fit(X_train,y_train)
  y_pred = model.predict(X_test)
  report = classification_report(y_test, y_pred, target_names=['0', '1'])
  print(report)
  matrix = confusion_matrix(y_test, y_pred, labels=model.classes_, normalize='true')
  ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=model.classes_).plot()

In [None]:
# Rebuild the classifier from the hyper-parameters chosen by the grid search
# instead of hard-coded copies, so this cell always matches the search result.
model = XGBClassifier(**result.best_params_)
predict(model,X_train, X_test, y_train, y_test)

## Drop unimportant features

In [None]:
# Remove the dummy columns listed below.
unimportant_columns = [
    'Card_Category_Platinum',
    'Education_Level_Doctorate',
    'Marital_Status_Divorced',
]
df = df.drop(columns=unimportant_columns)

In [None]:
# Repeat the preprocessing on the reduced frame: build the helper, run its
# scaling step, then obtain fresh train/test arrays.
proc = preprocessing(df)
scaler = proc.scaler()
(X_train, X_test, y_train, y_test) = proc.train_test_split()

In [None]:
# XGBoost hyper-parameter grid
param_dist = {
    'n_estimators': list(range(50, 200, 25)),  # number of trees
    'max_depth': list(range(2, 10, 2)),
    'learning_rate': [0.1, 0.01],
    'nthread': [4],
}

In [None]:
# Grid search over `param_dist`, scored by F1, using all available cores.
model = XGBClassifier()
gs = GridSearchCV(estimator=model, param_grid=param_dist, scoring='f1', n_jobs=-1)
result = gs.fit(X_train, y_train)
# Mean test score of every parameter combination tried.
means = result.cv_results_['mean_test_score']


In [None]:
# Parameter combination that achieved the best score in the grid search.
result.best_params_

## XGBoost

In [None]:
# Rebuild the classifier from the hyper-parameters chosen by the grid search
# instead of hard-coded copies, so this cell always matches the search result.
model = XGBClassifier(**result.best_params_)
predict(model,X_train, X_test, y_train, y_test)