## Credit Card Churners 

[Description of columns is here](https://www.kaggle.com/sakshigoyal7/credit-card-customers)

The task is to predict which customers will churn.

### EDA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import pandas_profiling
import plotly
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
import statistics

In [None]:
#Read the raw CSV and keep every column except the last two
#NOTE(review): presumably the dropped columns are the two Naive_Bayes_Classifier_* columns - confirm against the dataset page
df=pd.read_csv('../input/credit-card-customers/BankChurners.csv').iloc[:,:-2]

In [None]:
#Preview the first rows of the loaded frame
df.head()

In [None]:
#(rows, columns) of the loaded frame
df.shape

In [None]:
#Column names that remain after loading
df.columns

In [None]:
#Summary statistics of the columns
df.describe()

In [None]:
#Dtypes and non-null counts per column (used to check for missing values)
df.info()

No missing values are found

In [None]:
#Build an automated profiling report for the whole frame
#NOTE(review): pandas_profiling has been renamed to ydata-profiling upstream - confirm the environment still ships this package
pandas_profiling.ProfileReport(df)

Results: 
1. Dataset is unbalanced   
2. The most common client age is approximately 50 years
3. The number of men and women is almost equal (~50% of each group)
4. Customers often have 2-3 dependents
5. Many clients graduated from a university or from high school. There is also an 'Unknown' education level in the dataset  
6. The vast majority of clients hold a Blue card
7. Many customers have income less than $40K
8. Credit Limit is highly correlated with Average Utilization Ratio, and Average Open to Buy is correlated with Credit Limit
9. Clients leave after 3 years of relationship

### Age

In [None]:
#Age histograms of existing and churned customers, overlaid on one plot
sns.displot(
    data=df,
    x='Customer_Age',
    hue='Attrition_Flag',
    kind='hist',
    palette='tab10',
)
plt.title('Age hist for existing and churned customers',fontsize=15);

So, this feature isn't very predictive. However, the tails of the distribution for existing customers contain more values: older customers tend not to leave.

In [None]:
#Most frequent customer age
age_mode=statistics.mode(df['Customer_Age'])
print('Mode of Age distribution is ',age_mode)

### Dependent count

In [None]:
#Count of customers per number of dependents, one panel per attrition group
sns.catplot(
    data=df,
    x='Dependent_count',
    col='Attrition_Flag',
    kind='count',
    palette='summer',
);

### Education level

In [None]:
#Overlay normalized education-level histograms for the two customer groups
fig=go.Figure()
education_traces=[
    ('Existing Customer','crimson','Education level of existing customers'),
    ('Attrited Customer','dodgerblue','Education level of churned customers'),
]
for flag,color,label in education_traces:
    fig.add_trace(go.Histogram(x=df.loc[df['Attrition_Flag']==flag,'Education_Level'],
                               histnorm='probability density',marker=dict(color=color),
                               opacity=0.75,name=label))
fig.update_layout(title='Education level',
                  legend=dict(x=0.5,xanchor='center',orientation='h'),
                  margin=dict(l=0,r=0,t=30,b=0))
fig.show()

Churn probability doesn't depend on education level

### Card category and income level

In [None]:
#Number of customers per (card category, income category) pair, largest first
(
    df.groupby(['Card_Category','Income_Category'])['Income_Category']
    .agg(['count'])
    .sort_values(by='count',ascending=False)
)

Every card category contains all income levels

In [None]:
#Two side-by-side pies: card categories of existing customers (axes[0]) and churned ones (axes[1])
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(15,7))
#Number of customers per card category within each attrition group
df_cards_ex=df.loc[df['Attrition_Flag']=='Existing Customer'].groupby('Card_Category').size()
df_cards_ch=df.loc[df['Attrition_Flag']=='Attrited Customer'].groupby('Card_Category').size()
#NOTE(review): the title is passed only to the first plot call together with subplots=True;
#presumably this is meant to produce one shared title for both pies - confirm in the rendered output
df_cards_ex.plot(kind='pie',ax=axes[0],title='Card Category of existing customers (left) and churned ones (right)',
                 subplots=True,colormap='cividis')
df_cards_ch.plot(kind='pie',ax=axes[1],subplots=True,colormap='cividis')
#Blank out the y-axis label on both axes
axes[0].set_ylabel('')
axes[1].set_ylabel('');

The distribution of card categories is the same for existing and churned customers

### Months of relationship 

In [None]:
#Split the customers by attrition status; both frames are reused by later cells
df_exist=df[df['Attrition_Flag']=='Existing Customer']
df_churn=df[df['Attrition_Flag']=='Attrited Customer']

#Distribution curves (histogram bars hidden) of Months_on_book for both groups
colors=['rgb(0,0,100)','rgb(0,200,200)']
tenure_samples=[df_exist['Months_on_book'],df_churn['Months_on_book']]
tenure_labels=['Existing Customer','Attrited Customer']
fig=ff.create_distplot(tenure_samples,tenure_labels,colors=colors,show_hist=False)
fig.update(layout_title_text='Hist plot of Months of Relationship')

### Total number of products held

In [None]:
#Histograms of the number of products held, one trace per customer group
fig=go.Figure()
product_traces=[
    (df_exist,'Products held by existing customers','#EB89B5'),
    (df_churn,'Products held by churned customers','#330C73'),
]
for group_df,trace_name,trace_color in product_traces:
    fig.add_trace(go.Histogram(x=group_df['Total_Relationship_Count'],name=trace_name,
                               marker_color=trace_color,opacity=0.75))
fig.update_layout(
    title_text='Hist of total number of products held by two categories of customers',
    xaxis_title_text='Number of products',
    bargap=0.2,
    bargroupgap=0.1,
)
fig.show()

### All numerical features 

In [None]:
#Names of the numerical feature columns used by the later cells
numerical_cols=[
    'Customer_Age',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]

In [None]:
#Pairwise relationships of selected numerical columns, colored by attrition status
pairplot_cols=[
    'Attrition_Flag',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Trans_Amt',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]
g=sns.pairplot(df[pairplot_cols],hue='Attrition_Flag',diag_kind='hist')
g.fig.set_size_inches(15,15);

#### More accurate histograms

In [None]:
#One histogram per column on a 2x2 grid, split by attrition status
fig,axes=plt.subplots(2,2,figsize=(20,15))
list_of_columns=['Credit_Limit','Total_Revolving_Bal','Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']
#axes.flat walks the grid row by row, so each column is paired with its own panel
#without looking up its position in the list on every iteration
for ax,column in zip(axes.flat,list_of_columns):
    sns.histplot(data=df,x=column,hue='Attrition_Flag',ax=ax,palette='ocean');

Results:
1. When credit limit is high, clients tend to remain
2. The more client spends, the more likely that he/she will remain
3. If a person does not keep a lot of money in the account, the probability that he/she leaves increases

### Total Transaction Amount (last 12 months)

In [None]:
#Total transaction amount histogram, colored by card category
fig=px.histogram(
    data_frame=df,
    x='Total_Trans_Amt',
    color='Card_Category',
)
fig.show()

In [None]:
#Total transaction amount histogram, colored by attrition status, with a rug plot in the margin
fig=px.histogram(
    data_frame=df,
    x='Total_Trans_Amt',
    color='Attrition_Flag',
    marginal='rug',
)
fig.show()

### Contacts count

In [None]:
#Distinct values of the contacts count column
df['Contacts_Count_12_mon'].unique()

In [None]:
#Count of customers per number of contacts, one panel per attrition group.
#saturation is a proportion, so 1 (full color) replaces the out-of-range value 5;
#the ci argument is dropped because count plots draw no error bars
sns.catplot(x='Contacts_Count_12_mon',col='Attrition_Flag',data=df,kind='count',saturation=1,palette='spring');

### Who are the customers that churned?  

In [None]:
#Rows of the churned (attrited) customers only
is_attrited=df['Attrition_Flag']=='Attrited Customer'
df_target=df.loc[is_attrited]

In [None]:
#Histograms of twelve features for churned customers only, on a 4x3 grid
fig,axes=plt.subplots(4,3,figsize=(25,20))
list_of_columns=['Education_Level','Marital_Status','Income_Category','Card_Category','Months_on_book',
                 'Months_Inactive_12_mon', 'Credit_Limit', 'Total_Revolving_Bal','Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1']
#axes.flat walks the grid row by row, so each column is paired with its own panel
#without looking up its position in the list on every iteration
for ax,column in zip(axes.flat,list_of_columns):
    sns.histplot(data=df_target,x=column,ax=ax,color='green');

### Feature Engineering 

We will remove the columns that don't affect the target variable 

In [None]:
#Drop the columns judged uninformative for the target.
#Rebinding df (instead of inplace=True) with errors='ignore' keeps the cell re-runnable:
#a second execution no longer raises KeyError for already-removed columns
df=df.drop(columns=['Gender','Dependent_count','Months_on_book','Marital_Status'],errors='ignore')

### Transformation of categorical features 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer as DV

#One-hot encode the categorical columns with DictVectorizer
cat_cols=['Education_Level','Income_Category', 'Card_Category']
encoder = DV(sparse = False)
#to_dict(orient='records') yields one {column: value} dict per row directly,
#avoiding a transpose of the whole frame and a dict keyed by the row index
df_cat = encoder.fit_transform(df[cat_cols].to_dict(orient='records'))

In [None]:
#Wrap the encoded array in a DataFrame labelled with the encoder's feature names.
#get_feature_names() was removed from recent scikit-learn releases, so prefer
#get_feature_names_out() and fall back only when it is not available
if hasattr(encoder,'get_feature_names_out'):
    encoded_feature_names=encoder.get_feature_names_out()
else:
    encoded_feature_names=encoder.get_feature_names()
#Reuse df's index so the later column-wise concat lines up row by row
df_cat_encoded=pd.DataFrame(df_cat,columns=encoded_feature_names,index=df.index)
df_cat_encoded.head()

In [None]:
#Encode target variable: one indicator column per Attrition_Flag value.
#The dtype is pinned because the get_dummies default differs between pandas versions
#(integer indicators in older releases, booleans in newer ones)
target_variable=pd.get_dummies(df['Attrition_Flag'],dtype='uint8')

In [None]:
#Inspect the last rows of the encoded target
target_variable.tail()

In [None]:
#Remove 'Months_on_book' (position 1 in the list as defined) because it was dropped from df.
#Filtering by name keeps the cell idempotent: a positional del would remove
#a different column every time the cell is re-run
numerical_cols=[col for col in numerical_cols if col!='Months_on_book']

In [None]:
#Normalize numerical variables
from sklearn.preprocessing import StandardScaler


#Numerical columns without the correlated ones. Selecting and dropping in one expression
#creates a new frame, instead of mutating a slice of df in place (SettingWithCopyWarning)
num_variables=df[numerical_cols].drop(columns=['Avg_Open_To_Buy', 'Avg_Utilization_Ratio'])

#NOTE(review): the scaler is fit on all rows before the train/test split performed later,
#so test rows influence the scaling statistics - consider fitting on the training rows only
scaler=StandardScaler()
normalize_num_variables=pd.DataFrame(scaler.fit_transform(num_variables),
                                     columns=num_variables.columns,
                                     index=num_variables.index)

In [None]:
#Preview the scaled numerical features
normalize_num_variables.head()

In [None]:
from pandas.plotting import scatter_matrix

#Pairwise scatter plots of the scaled numerical features
list_cols=[
    'Customer_Age',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
]
scaled_subset=normalize_num_variables[list_cols]
scatter_matrix(scaled_subset,figsize=(20,20))
plt.show()

In [None]:
#Assemble the modelling frame: target column first, then scaled numerical and encoded categorical features
model_parts=[
    target_variable['Attrited Customer'],
    normalize_num_variables,
    df_cat_encoded,
]
X=pd.concat(model_parts,axis=1)
X.head()

In [None]:
#Columns of the assembled modelling frame
X.columns

### Train/Test split 

In [None]:
#Separate the target from the features.
#y is read from target_variable (the source of X's 'Attrited Customer' column) and X is rebound
#with errors='ignore', so re-running the cell no longer raises KeyError once the column is gone
y=target_variable['Attrited Customer']
X=X.drop(columns=['Attrited Customer'],errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

#Stratified 70/30 split with a fixed seed
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [None]:
#Feature matrix sizes after the split (train first, then test)
for feature_part in (X_train,X_test):
    print(feature_part.shape)

In [None]:
#Target vector sizes after the split (train first, then test)
for target_part in (y_train,y_test):
    print(target_part.shape)

### Logistic Regression

In [None]:
from sklearn.metrics import accuracy_score, recall_score, roc_curve, roc_auc_score, precision_score, confusion_matrix

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#Candidate inverse regularization strengths and the number of CV folds
param_grid={'C':np.linspace(0.1,10,20)}
cv=5

#L1-penalized, class-weighted logistic regression; random_state fixes liblinear's data shuffling
estimator=LogisticRegression(penalty='l1',class_weight='balanced',fit_intercept=True,max_iter=100,
                             solver='liblinear',random_state=0)
#The classes are imbalanced, so candidates are ranked by ROC AUC instead of the default accuracy,
#which can look good while missing most churned customers
grid_balanced=GridSearchCV(estimator,param_grid,cv=cv,scoring='roc_auc')
grid_balanced.fit(X_train,y_train)

In [None]:
#Model refit with the best parameters found by the grid search
grid_balanced.best_estimator_

In [None]:
#Fitted coefficients of the selected logistic regression
grid_balanced.best_estimator_.coef_

In [None]:
#Hard class predictions of the selected model on the test features
y_pred=grid_balanced.best_estimator_.predict(X_test)

In [None]:
#Second predict_proba column (class index 1) of the selected model, scored with ROC AUC
best_logreg=grid_balanced.best_estimator_
test_probabilities=best_logreg.predict_proba(X_test)
y_pred_proba=test_probabilities[:,1]
roc_auc_score(y_test,y_pred_proba)

In [None]:
#Test-set accuracy of the logistic regression predictions
accuracy_score(y_test,y_pred)

In [None]:
#Test-set recall of the logistic regression predictions
recall_score(y_test,y_pred)

In [None]:
#Test-set precision of the logistic regression predictions
precision_score(y_test,y_pred)

In [None]:
#Confusion matrix of the logistic regression predictions on the test set
confusion_matrix(y_test,y_pred)

In [None]:
def roc_auc_curve_with_thr(y_test,y_pred_proba):
    """Plot the ROC curve for a set of binary predictions.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded to ``roc_curve`` as its first argument.
    y_pred_proba : array-like
        Predicted scores or probabilities, forwarded to ``roc_curve``
        as its second argument.

    Returns
    -------
    The result of ``plt.show()``; the function is called for its plot.
    """
    #The thresholds returned by roc_curve are not used by the plot
    fpr,tpr,_=roc_curve(y_test,y_pred_proba)

    plt.figure(figsize=(8,6))
    plt.plot(fpr,tpr)
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    return plt.show()

In [None]:
#ROC curve of the logistic regression probabilities on the test set
roc_auc_curve_with_thr(y_test,y_pred_proba)

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
#Sweep scale_pos_weight and print recall, precision and ROC AUC on the test set for each value
#NOTE(review): the value is chosen by looking at test-set metrics - consider a separate validation split
for i in np.linspace(10,100,10):
    clf=xgb.XGBClassifier(verbosity=0,use_label_encoder=False,scale_pos_weight=i)
    clf.fit(X_train,y_train)
    y_pred_clf=clf.predict(X_test)
    y_pred_proba_clf=clf.predict_proba(X_test)[:,1]
    #ROC AUC measures ranking quality, so it is computed from the predicted probabilities
    #(as in the logistic regression section) rather than from the hard class labels
    print('Scale weights: ',i,'Recall score: ',recall_score(y_test,y_pred_clf),'Precision score: ',
          precision_score(y_test,y_pred_clf),'Roc auc score: ',roc_auc_score(y_test,y_pred_proba_clf))

In [None]:
#Train the best model
clf=xgb.XGBClassifier(verbosity=0,use_label_encoder=False,scale_pos_weight=60)
clf.fit(X_train,y_train)
y_pred_clf=clf.predict(X_test)
y_pred_proba_clf=clf.predict_proba(X_test)[:,1]

In [None]:
#Test-set recall of the XGBoost predictions
recall_score(y_test,y_pred_clf)

In [None]:
#Test-set precision of the XGBoost predictions
precision_score(y_test,y_pred_clf)

In [None]:
#ROC AUC is computed from the predicted probabilities (as in the logistic regression section),
#not from the hard class labels, which would collapse the curve to a single threshold
roc_auc_score(y_test,y_pred_proba_clf)

In [None]:
#Confusion matrix of the XGBoost predictions on the test set
confusion_matrix(y_test,y_pred_clf)

In [None]:
#ROC curve of the XGBoost probabilities on the test set
roc_auc_curve_with_thr(y_test,y_pred_proba_clf)

In [None]:
#Feature importances of the fitted XGBoost model, drawn on a tall figure
from xgboost import plot_importance
fig,ax=plt.subplots(figsize=(12,15))
plot_importance(booster=clf,ax=ax);