In [None]:
from pathlib import Path

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, FunctionTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report

%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [None]:
# location of the BankChurners CSV inside the Kaggle input directory
file_path = Path('../input/credit-card-customers/BankChurners.csv')

# read the file and keep every column except the trailing two
churn_data = pd.read_csv(file_path).iloc[:, :-2]
churn_data.head(5)

In [None]:
# Get dataset information
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
churn_data.info()

In [None]:
# Descriptive summary statistics of the dataset via DataFrame.describe();
# as the last expression of the cell, the resulting table is rendered as the cell output.
churn_data.describe()

# Exploratory Data Analysis 

Exploratory Data Analysis (EDA) is the first (and most common) step in a data analysis or data science workflow. The main idea behind EDA is to identify what kind of data we have, understand its patterns, and look for valuable information (insights) in it. In general, EDA is carried out in several ways:
* Univariate Analysis — descriptive analysis of a single variable.
* Bivariate Analysis — analysis of the relationship between two variables.
* Multivariate Analysis — analysis involving three or more variables.

### Univariate (Target Variable)

In [None]:
# Univariate view of the target (Attrition_Flag): share of existing customers (no churn)
# versus attrited customers (churn).
churn = churn_data['Attrition_Flag'].value_counts()
labels = churn.index.tolist()

# one axes object spanning the whole square figure
fig = plt.figure(figsize=(6,6))
ax = fig.add_axes([0,0,1,1])
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax.pie(churn, labels=labels, autopct='%.1f%%', textprops={'size': 16})
ax.set_title('Attrition Flag Columns Precentage', fontsize=15)
plt.show()

## Numeric Feature vs Target

In [None]:
# Names of the numeric feature columns explored in this section.
# The cells below refer to them by position (num_columns[0] ... num_columns[9]),
# so the order of this list matters.
num_columns = ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal',
               'Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1','Total_Trans_Amt','Total_Trans_Ct',
               'Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']

In [None]:
# empirical cumulative distribution function
def cdf(data):
    """
    Compute the empirical CDF of a one-dimensional sample.

    Args:
    data: sequence of numeric observations (list, numpy array or pandas Series)

    Returns:
    (x, y): x is the sample sorted ascending; y[i] = (i + 1) / len(data) is the
            fraction of observations less than or equal to x[i]
    """
    sorted_values = np.sort(data)
    sample_size = len(data)
    cumulative_share = np.arange(1, sample_size + 1) / sample_size
    return sorted_values, cumulative_share

# distribution visualization function
def distplot_num(df, col, bins=None):
    """
    Plot the distribution of a numeric feature, split by customer group.

    Left panel: density-normalised histogram with a KDE curve for existing and
    attrited customers. Right panel: empirical CDF of the same two groups.

    Args:
    df: pandas.DataFrame that contains an 'Attrition_Flag' column
    col: name of the numeric column to visualize
    bins: histogram bins forwarded to seaborn (a bin count or a list of bin edges);
          None lets seaborn choose the bins automatically
    """
    # histplot expects a bin rule rather than None, so map None to its automatic default
    hist_bins = 'auto' if bins is None else bins
    groups = (('Existing Customer', 'blue'), ('Attrited Customer', 'orange'))

    fig, ax = plt.subplots(ncols=2, figsize=(12,6))
    for label, color in groups:
        values = df.loc[df['Attrition_Flag'] == label, col]

        # sns.distplot is deprecated; histplot with stat='density' and kde=True
        # draws the same kind of normalised histogram + KDE (needs seaborn >= 0.11)
        sns.histplot(values, bins=hist_bins, stat='density', kde=True, alpha=0.4,
                     color=color, label=label, ax=ax[0])

        # x and y are passed by keyword: current seaborn releases reject positional vectors
        x_cdf, y_cdf = cdf(values)
        sns.lineplot(x=x_cdf, y=y_cdf, color=color, label=label, ax=ax[1])

    # the left panel shows a density estimate of a numeric feature, not a mass function
    ax[0].set(title='Probability Density Function')
    ax[0].legend()

    ax[1].set(xlabel=col, ylabel='Cumulative probability', title='Cumulative Distribution Function')
    ax[1].legend()

    plt.tight_layout()
    plt.show()
    

# measures of central tendency
def central_tendency_num(df, col):
    """
    Print the mean of a numeric feature separately for each customer group.

    Args:
    df: pandas.DataFrame that contains an 'Attrition_Flag' column
    col: name of the numeric column to summarise

    Returns:
    None; the two means are written to stdout (existing customers first).
    """
    flag = df['Attrition_Flag']
    existing_mean = df.loc[flag == 'Existing Customer', col].mean()
    attrited_mean = df.loc[flag == 'Attrited Customer', col].mean()
    print(f"{col} mean for Existing Customer: {existing_mean}")
    print(f"{col} mean for Attrited Customer: {attrited_mean}")
    
# measures of dispersion
def dispersion_num(df, col):
    """
    Print the variance of a numeric feature separately for each customer group.

    The dispersion measure is pandas' Series.var(), i.e. the variance
    (not the standard deviation).

    Args:
    df: pandas.DataFrame that contains an 'Attrition_Flag' column
    col: name of the numeric column to summarise

    Returns:
    None; the two variances are written to stdout (existing customers first).
    """
    flag = df['Attrition_Flag']
    existing_var = df.loc[flag == 'Existing Customer', col].var()
    attrited_var = df.loc[flag == 'Attrited Customer', col].var()
    print(f"{col} dispersion (var) for Existing Customer: {existing_var}")
    print(f"{col} dispersion (var) for Attrited Customer: {attrited_var}")

In [None]:
# Customer_Age (num_columns[0]): draw the per-group histogram/CDF figure with explicit
# 20/40/60/80 bin edges (the same edges used later to bin age), print the per-group mean
# and variance, then the kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[0], bins=[20,40,60,80])
central_tendency_num(churn_data, num_columns[0])
dispersion_num(churn_data, num_columns[0])

print(f'kurtosis: {churn_data[num_columns[0]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[0]].skew()}')

In [None]:
# Months_on_book (num_columns[1]): per-group histogram/CDF figure with automatic bins,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[1], bins=None)
central_tendency_num(churn_data, num_columns[1])
dispersion_num(churn_data, num_columns[1])

print(f'kurtosis: {churn_data[num_columns[1]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[1]].skew()}')

In [None]:
# Credit_Limit (num_columns[2]): per-group histogram/CDF figure with automatic bins,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[2], bins=None)
central_tendency_num(churn_data, num_columns[2])
dispersion_num(churn_data, num_columns[2])

print(f'kurtosis: {churn_data[num_columns[2]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[2]].skew()}')

In [None]:
# Total_Revolving_Bal (num_columns[3]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[3])
central_tendency_num(churn_data, num_columns[3])
dispersion_num(churn_data, num_columns[3])

print(f'kurtosis: {churn_data[num_columns[3]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[3]].skew()}')

In [None]:
# Avg_Open_To_Buy (num_columns[4]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[4])
central_tendency_num(churn_data, num_columns[4])
dispersion_num(churn_data, num_columns[4])

print(f'kurtosis: {churn_data[num_columns[4]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[4]].skew()}')

In [None]:
# Total_Amt_Chng_Q4_Q1 (num_columns[5]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[5])
central_tendency_num(churn_data, num_columns[5])
dispersion_num(churn_data, num_columns[5])

print(f'kurtosis: {churn_data[num_columns[5]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[5]].skew()}')

In [None]:
# Total_Trans_Amt (num_columns[6]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[6])
central_tendency_num(churn_data, num_columns[6])
dispersion_num(churn_data, num_columns[6])

print(f'kurtosis: {churn_data[num_columns[6]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[6]].skew()}')

In [None]:
# Total_Trans_Ct (num_columns[7]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[7])
central_tendency_num(churn_data, num_columns[7])
dispersion_num(churn_data, num_columns[7])

print(f'kurtosis: {churn_data[num_columns[7]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[7]].skew()}')

In [None]:
# Total_Ct_Chng_Q4_Q1 (num_columns[8]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[8])
central_tendency_num(churn_data, num_columns[8])
dispersion_num(churn_data, num_columns[8])

print(f'kurtosis: {churn_data[num_columns[8]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[8]].skew()}')

In [None]:
# Avg_Utilization_Ratio (num_columns[9]): per-group histogram/CDF figure,
# per-group mean and variance, then kurtosis and skewness of the whole column.
distplot_num(churn_data, num_columns[9])
central_tendency_num(churn_data, num_columns[9])
dispersion_num(churn_data, num_columns[9])

print(f'kurtosis: {churn_data[num_columns[9]].kurtosis()}')
print(f'skewness: {churn_data[num_columns[9]].skew()}')

## Categoric Feature vs Target

In [None]:
# Columns treated as categorical features in this section; each one is cross-tabulated
# against Attrition_Flag in the next cell (nine columns, one per panel of the 3x3 grid).
cat_columns = ['Gender','Dependent_count','Education_Level','Marital_Status','Income_Category', 
               'Card_Category', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Relationship_Count']

In [None]:
# One bar chart per categorical feature, laid out on a 3x3 grid.
fig, axes = plt.subplots(3,3, figsize=(15,15))

for col, ax in zip(cat_columns, axes.flatten()):
    # cross category column with target column, normalized per category level so each
    # row sums to 1 and the bars show the churn / no-churn proportion inside that level
    cross_tab = pd.crosstab(churn_data[col], churn_data['Attrition_Flag'], normalize='index')
    cross_tab.plot.bar(ax=ax)
    ax.set_ylabel('Proportion')

# the figure-level title and layout only need to be applied once, after all panels are drawn
fig.suptitle('Categorical Feature Proportion Plot', y=1, va='baseline', fontsize=20)
fig.tight_layout()
plt.show()

## Feature Selection and Feature Engineering

### Feature Selection

In [None]:
# To reduce the dimensionality of the data we select which features to use as predictors,
# guided by the EDA above. Start by listing the columns that are currently available.
print(churn_data.columns)

In [None]:
# remove the columns that will not be used as predictors
churn_data_selected = churn_data.drop(
    columns=[
        'CLIENTNUM',
        'Credit_Limit',
        'Months_Inactive_12_mon',
        'Contacts_Count_12_mon',
        'Months_on_book',
        'Card_Category',
    ]
)

### Feature Engineering

In [None]:
# Bin the numeric customer age into three labelled groups (numerical -> categorical).
# The column is overwritten in place, so this cell is meant to run once on a freshly
# created churn_data_selected.
churn_data_selected['Customer_Age'] = pd.cut(
    churn_data_selected['Customer_Age'],
    bins=[20, 40, 60, 80],
    labels=["early adult", "middle age", "elderly"],
)

In [None]:
# Inspect the distinct raw labels of the two ordinal features before they are
# mapped to integers in the next cell.
print(churn_data_selected['Education_Level'].unique())
print(churn_data_selected['Income_Category'].unique())

In [None]:
# Encode the ordinal features as integers.
# Why encode manually? LabelEncoder from sklearn assigns codes in alphabetical order,
# which would not follow the intended ranking of the levels.

# education level -> rank (lowest to highest)
education_level = {
    'Unknown': -1,
    'Uneducated': 0,
    'High School': 1,
    'College': 2,
    'Graduate': 3,
    'Post-Graduate': 4,
    'Doctorate': 5
}

# income bracket -> rank (lowest to highest)
income_category = {
    'Unknown': -1,
    'Less than $40K': 0,
    '$40K - $60K': 1,
    '$60K - $80K': 2,
    '$80K - $120K': 3,
    '$120K +': 4,
}

# age group label (as produced by the age-binning cell) -> rank
age_level = {
    "early adult": 0, 
    "middle age": 1,
    "elderly": 2,
}

# note: 'Unknown' is encoded as -1 because we cannot assume where it belongs in the ranking

# Replace the labels of all three ordinal columns with their integer rank.
# Series.map() yields NaN for any label missing from its dict, and the columns are
# overwritten in place, so this cell is meant to run once per fresh churn_data_selected.
churn_data_selected['Education_Level'] = churn_data_selected['Education_Level'].map(education_level)
churn_data_selected['Income_Category'] = churn_data_selected['Income_Category'].map(income_category)
churn_data_selected['Customer_Age'] = churn_data_selected['Customer_Age'].map(age_level)

### Dataset Splitting

In [None]:
# the churn label is the target; every other selected column is a candidate predictor
y = churn_data_selected['Attrition_Flag']
X = churn_data_selected.drop(columns='Attrition_Flag')

# hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f'train shape: {X_train.shape, y_train.shape}')
print(f'test shape: {X_test.shape, y_test.shape}')

### Preprocessing and ML Pipeline

In [None]:
# preprocessing

# NOTE(review): ColumnTransformer's default remainder='drop' discards every column that is
# not listed below, so the engineered Customer_Age / Education_Level / Income_Category
# encodings (and Dependent_count, Total_Relationship_Count) never reach the classifiers.
# Confirm this is intended before relying on those features.

# numeric features: log(1 + x) transform
numerical_pipeline = Pipeline([
    ('transform', FunctionTransformer(np.log1p))
])

# nominal features: one-hot encoding; handle_unknown='ignore' encodes a category that was
# not seen during fit as all zeros instead of raising when predicting on held-out data
categorical_pipeline = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# route each column group to its pipeline
preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, ['Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
                                     'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']),
    ('categoric', categorical_pipeline, ['Gender', 'Marital_Status'])
])

# sampling strategy: SMOTE oversampling, used as a step of the imblearn pipelines below
smote = SMOTE(random_state=42)

### Training Model

In [None]:
# Binomial Logistic Regression: preprocess -> SMOTE resampling -> classifier
pipeline_log = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('resampling', smote),
    ('clf', LogisticRegression(max_iter=500)),
])

# fit on the training split, then predict the held-out test split
y_hat = pipeline_log.fit(X_train, y_train).predict(X_test)

print(f'classification report logistic:\n{classification_report(y_test, y_hat)}')

In [None]:
# Random Forest: preprocess -> SMOTE resampling -> classifier (seeded for reproducibility)
pipeline_rdf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('resampling', smote),
    ('clf', RandomForestClassifier(random_state=42)),
])

# fit on the training split, then predict the held-out test split
y_hat = pipeline_rdf.fit(X_train, y_train).predict(X_test)

print(f'classification report rdf:\n{classification_report(y_test, y_hat)}')

In [None]:
# Support Vector classifier: preprocess -> SMOTE resampling -> classifier
pipeline_svm = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('smote', smote),
    ('clf', SVC()),
])

# fit on the training split, then predict the held-out test split
y_hat = pipeline_svm.fit(X_train, y_train).predict(X_test)

print(f'classification report svm:\n{classification_report(y_test, y_hat)}')

### Conclusion

After training several classifiers with a simple approach, the classification reports above show that the Random Forest model performs best (for now) on this classification task, with the highest performance (94% accuracy). 

### Next Step?
We can do hyperparameter tuning.

### Hope this helps
Leave a comment if there is an error in my explanation, especially about the EDA. Thanks a lot!!