In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the credit-card customer table. The path points at the Kaggle input
# directory, so it only resolves inside a Kaggle runtime with this dataset attached.
customers = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv")

In [None]:
# Overview of the raw table: column names, non-null counts and dtypes.
customers.info()

In [None]:
# Columns removed before the analysis: the client identifier and the two
# columns whose names start with "Naive_Bayes_Classifier_" (presumably
# pre-computed classifier outputs shipped with the dataset - confirm against
# the dataset description).
columns_to_drop = [
    "CLIENTNUM",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
]
# Reassign instead of using inplace=True, and ignore columns that are already
# gone, so that re-running this cell is harmless (an in-place drop raises
# KeyError on the second run because the columns no longer exist).
customers = customers.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Preview the first rows of the table.
customers.head()

In [None]:
# Re-check column names, non-null counts and dtypes.
customers.info()

In [None]:
# Print the category frequencies of every object-dtype (text) column.
for name in customers.columns:
    series = customers[name]
    if series.dtype != 'object':
        continue
    print(name)
    print(series.value_counts())
    print('\n')

#### We see that Attrition_Flag is imbalanced, with more Existing Customers than Attrited Customers. In addition, the columns Education_Level, Marital_Status and Income_Category contain an "Unknown" value, so I'm going to filter out those rows

In [None]:
# Flag rows where any of the three demographic columns holds the placeholder
# value "Unknown".
has_unknown = (
    customers[["Education_Level", "Marital_Status", "Income_Category"]]
    .eq("Unknown")
    .any(axis=1)
)
# Keep the remaining rows. The explicit .copy() makes this an independent
# frame, so later column assignments on it do not trigger pandas'
# SettingWithCopyWarning (which the global warnings filter would hide anyway).
customers_no_unknown = customers.loc[~has_unknown].copy()

In [None]:
# Row count, non-null counts and dtypes of the filtered table.
customers_no_unknown.info()

In [None]:
# Print the category frequencies of every object-dtype (text) column of the
# filtered table.
for name in customers_no_unknown.columns:
    series = customers_no_unknown[name]
    if series.dtype != 'object':
        continue
    print(name)
    print(series.value_counts())
    print('\n')

#### We see that there are no more "Unknown" values

In [None]:
# Summary statistics of the filtered table.
customers_no_unknown.describe()

In [None]:
## Check for duplicates: number of rows that exactly repeat an earlier row
int(customers_no_unknown.duplicated().sum())

In [None]:
## Converting Attrition_Flag from Object to int
# Encoding: 0 for "Existing Customer", 1 for any other label.
# The flag is derived from the untouched labels in `customers` (matched by
# index) rather than from the column being overwritten, so re-running this
# cell gives the same result instead of turning every already-encoded value
# into 1. Plain column assignment replaces the column, so it really gets an
# integer dtype; `.loc[:, col] = ...` writes into the existing object-dtype
# column in recent pandas versions.
customers_no_unknown["Attrition_Flag"] = (
    customers.loc[customers_no_unknown.index, "Attrition_Flag"]
    .ne("Existing Customer")
    .astype(int)
)

In [None]:
# Class balance of the encoded target.
customers_no_unknown['Attrition_Flag'].value_counts()

## General Reporting of the information

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report object for the filtered table; it is rendered in a later cell.
profile = ProfileReport(customers_no_unknown, title="Profiling Report")

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

## EDA of some parameters

In [None]:
# 4x2 grid comparing existing (0) and attrited (1) customers across the main
# demographic attributes.
fig, ax = plt.subplots(4, 2, figsize=(15, 20))

# Ordered categories are listed from lowest to highest so the legend and bar
# order follow the natural ranking.
income_order = ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +']
education_order = ['Uneducated', 'High School', 'College', 'Graduate', 'Post-Graduate', 'Doctorate']

# Numeric attributes: distribution per attrition class.
box_panels = [
    (ax[0, 0], 'Customer_Age', 'Customer age'),
    (ax[0, 1], 'Dependent_count', 'Dependent Count'),
]
for axis, column, title in box_panels:
    sns.boxplot(x='Attrition_Flag', y=column, data=customers_no_unknown, ax=axis)
    axis.set_title(title)

# Categorical attributes: row counts per attrition class, split by category.
# hue=None / hue_order=None are seaborn's defaults (no split, automatic order).
count_panels = [
    (ax[1, 0], 'Gender', None, 'Gender'),  # title previously read 'Dependent Count'
    (ax[1, 1], 'Card_Category', None, 'Card Category'),
    (ax[2, 0], 'Income_Category', income_order, 'Income Category'),
    (ax[2, 1], 'Marital_Status', None, 'Marital Status'),
    (ax[3, 0], 'Education_Level', education_order, 'Education level'),
    (ax[3, 1], None, None, 'Attrition'),
]
for axis, hue, hue_order, title in count_panels:
    sns.countplot(x='Attrition_Flag', hue=hue, hue_order=hue_order, data=customers_no_unknown, ax=axis)
    axis.set_title(title)

fig.tight_layout()

## Some points to note:
- Average age is: 45
- Average dependent count is: 2
- Male and female customers are roughly equally represented
- Credit card category is mostly blue
- Most customers earn less than $40K and are married or single
- Graduate is the most common education level

In [None]:
# Pairwise correlations between the numeric columns only. The text columns
# are excluded explicitly because DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 instead of silently dropping them.
customers_no_unknown_corr = customers_no_unknown.select_dtypes(include="number").corr()
# The matrix is symmetric: mask the upper triangle (diagonal included) so each
# pair is drawn once.
mask = np.triu(np.ones_like(customers_no_unknown_corr, dtype=bool))
sns.heatmap(customers_no_unknown_corr, mask=mask, cbar=False, cmap="BuGn", linewidths=0.3)

## Features most correlated with Attrition_Flag:
- Customer_Age
- Dependent_count
- Months_on_book
- Months_Inactive_12_mon
- Contacts_Count_12_mon
- Credit_Limit
- Avg_Open_To_Buy

Let's explore Months_Inactive_12_mon & Contacts_Count_12_mon

In [None]:
# Two stacked count plots: customers per attrition class, split by months of
# inactivity (top) and by number of contacts (bottom).
fig, ax = plt.subplots(2, 1, figsize=(10, 8))
panels = [
    (ax[0], 'Months_Inactive_12_mon', {}, 'Months Inactive'),
    (ax[1], 'Contacts_Count_12_mon', {'palette': "pastel"}, 'No. of Contacts in the last 12 months'),
]
for axis, hue_column, extra_kwargs, title in panels:
    sns.countplot(x='Attrition_Flag', hue=hue_column, data=customers_no_unknown, ax=axis, **extra_kwargs)
    axis.set_title(title)
fig.tight_layout()

- Most customers were inactive for 2 to 3 months, and the number of contacts in the last 12 months shows a similar concentration around 2 to 3

# Classification model

In [None]:
# Split the table positionally: the first column is the target (presumably
# Attrition_Flag - confirm with the column order), all remaining columns are
# features. Both are converted to NumPy arrays.
target_column = customers_no_unknown.iloc[:, 0]
feature_columns = customers_no_unknown.iloc[:, 1:]
X = feature_columns.to_numpy()
y = target_column.to_numpy()

In [None]:
# Sanity check: X and y must have the same number of rows.
for label, array in (("X shape", X), ("y shape", y)):
    print(f"{label} {array.shape}")

In [None]:
## Categorical values
# First five rows of the feature columns at positions 1, 3, 4, 5 and 6 - the
# same positions later passed to the model as cat_features.
print(X[:5,[1,3,4,5,6]])

In [None]:
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Positions (within X) of the categorical feature columns.
cat_features = [1, 3, 4, 5, 6]
# Hold out 20% of the rows for evaluation. The target is imbalanced, so the
# split is stratified on y to keep the same class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Baseline model: CatBoost trained on the training partition as-is, scored on
# the held-out partition.
baseline_params = dict(iterations=300, learning_rate=0.1, random_seed=42)
cboost_clf = cb.CatBoostClassifier(**baseline_params)
cboost_clf.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    verbose=False,
    plot=False,
)
y_pred = cboost_clf.predict(X_test)

# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

## The F1-score, precision and recall seem to be in a fair range; let's see if we can improve them using SMOTENC (a SMOTE variant that supports categorical features)

In [None]:
from imblearn.over_sampling import SMOTENC

# Over-sample the minority class with SMOTENC using ONLY the training
# partition. Resampling the full data set before splitting leaks information:
# synthetic rows interpolated from test rows end up in the training set, and
# the test set itself contains synthetic rows, which inflates the scores.
smenc = SMOTENC(
    sampling_strategy='auto',
    k_neighbors=5,
    random_state=42,
    categorical_features=cat_features,
)
X_train, y_train = smenc.fit_resample(X_train, y_train)
# X_test / y_test are deliberately left as produced by the earlier
# train_test_split, so the model trained on the resampled data is scored on
# the same real, un-resampled customers as the baseline model.

In [None]:
# Refit CatBoost with the same hyper-parameters on the current training
# partition and score it on the current test partition.
model_params = {"iterations": 300, "learning_rate": 0.1, "random_seed": 42}
fit_options = {"cat_features": cat_features, "verbose": False, "plot": False}

cboost_clf = cb.CatBoostClassifier(**model_params)
cboost_clf.fit(X_train, y_train, **fit_options)
y_pred = cboost_clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

## We got some improvement, now some graphs of the most important features for the model

In [None]:
# Pair every feature column name with the importance reported by the fitted
# model, then list them from most to least important.
feature_names = customers_no_unknown.iloc[:, 1:].columns
importances = cboost_clf.get_feature_importance()
feature_dict = dict(zip(feature_names, importances))

ranked_names = sorted(feature_dict, key=feature_dict.get, reverse=True)
sorted_dict = {name: feature_dict[name] for name in ranked_names}
for column, value in sorted_dict.items():
    print(column, value)

In [None]:
def display_top_n_features_cb(columns, model, n_features):
    """Draw a bar chart of the most important features of a fitted model.

    Parameters
    ----------
    columns : iterable of str
        Feature names, in the same order as the values returned by
        ``model.get_feature_importance()``.
    model : object
        Fitted model exposing ``get_feature_importance()``.
    n_features : int
        Maximum number of features to show. If it exceeds the number of
        available features, all of them are shown; values below 1 give an
        empty chart.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    importance_by_name = dict(zip(columns, model.get_feature_importance()))
    ranked_names = sorted(importance_by_name, key=importance_by_name.get, reverse=True)

    # Clamp the request to what is actually available: the bar positions and
    # the bar heights must have the same length or plt.bar raises.
    top_names = ranked_names[:max(n_features, 0)]
    n_shown = len(top_names)
    positions = range(n_shown)

    plt.figure(figsize=(8,7))
    plt.title("Feature importance", fontsize = 30)
    plt.bar(positions, [importance_by_name[name] for name in top_names], color="r", align="center")
    plt.xlim(-1, n_shown)
    plt.xticks(positions, top_names, rotation=90)

# Plot the 10 most important features, naming them after every column except the first.
display_top_n_features_cb(customers_no_unknown.iloc[:,1:].columns, cboost_clf, 10)

## According to the model, the most important features are the total transactions, inactivity period, and dependent count
- I appreciate your time in reading this notebook. If you have any suggestions, please let me know in the comments. Thank you.