# [Binary Classification with a Bank Churn Dataset](https://www.kaggle.com/competitions/playground-series-s4e1/overview)
### Playground Series - Season 4, Episode 1
_______________________________________________________________________ 
# Author Details:
- Name: Najeeb Haider Zaidi
- Email: zaidi.nh@gmail.com
- Profiles: [Github](https://github.com/snajeebz)  [LinkedIn](https://www.linkedin.com/in/najeebz) [Kaggle](https://www.kaggle.com/najeebz)
- Prepared for the submission to the competition.
________________________________________________________________________
# Attributions:


[Walter Reade, Ashley Chow. (2024). Binary Classification with a Bank Churn Dataset . Kaggle.](https://kaggle.com/competitions/playground-series-s4e1)
________________________________________________________________________
​
This Notebook is to be submitted to the competition so aims to start the process from the beginning to the creation of the submission csv file in proper format.
__________________________________________________________________________
# Code Execution and Versioning Repository: 
- [Execute the notebook in Kaggle](https://www.kaggle.com/code/najeebz/binary-classification-cnn-mlp-classifier)
- [Github Repository](https://github.com/snajeebz/playground)
​
____________________________________________________________________
# Citation:

Najeeb Zaidi. (2024). Binary Classification with a Bank Churn Dataset. Competition Submission. Kaggle. https://www.kaggle.com/code/najeebz/binary-classification-cnn-mlp-classifier

# Initiation:

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.exceptions import ConvergenceWarning

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from warnings import filterwarnings;
filterwarnings('ignore')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


# Reading the Dataset

In [2]:
df=pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
test=pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

In [None]:
test

In [None]:
df

# Exploring the Dataset

In [None]:
test.isnull().sum()

In [None]:
df.isnull().sum()

In [None]:
df.Geography.value_counts()

In [None]:
df.NumOfProducts.value_counts()


In [None]:
df.IsActiveMember.describe()

In [None]:
df

# Encoding the Columns

In [48]:
def encoder(df):
    from sklearn.preprocessing import LabelEncoder
    labelencoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtypes=='object':
            df[col]=labelencoder.fit_transform(df[col])
    return df
def Onehot(df):
    from sklearn.preprocessing import OneHotEncoder
    ohe=OneHotEncoder(handle_unknown='ignore',sparse_output=False).set_output(transform='pandas')
    cat_cols=ohe.fit_transform(df.select_dtypes(include=['category']))
    print(cat_cols.columns)
    df=df.drop(columns=df.select_dtypes(include=['category']))
    df=pd.concat(df,cat_cols,axis=1)
    print(df.columns)
    return df
    


# Visualizations:

In [39]:
train_df=df[['Surname','CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Exited']]
test_df=test[['Surname','CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']]
train_df[['Geography','Gender','Tenure','NumOfProducts','HasCrCard','IsActiveMember']]=train_df[['Geography','Gender','Tenure','NumOfProducts','HasCrCard','IsActiveMember']].astype('category')
test_df[['Geography','Gender','Tenure','NumOfProducts','HasCrCard','IsActiveMember']]=test_df[['Geography','Gender','Tenure','NumOfProducts','HasCrCard','IsActiveMember']].astype('category')
train_df=encoder(train_df)
test_df=encoder(test_df)


In [42]:
train_df.dtypes

Surname               int64
CreditScore           int64
Geography          category
Gender             category
Age                 float64
Tenure             category
Balance             float64
NumOfProducts      category
HasCrCard          category
IsActiveMember     category
EstimatedSalary     float64
Exited             category
dtype: object

In [None]:
cat_cols = df[['Geography','Gender','Tenure','NumOfProducts','HasCrCard','IsActiveMember','Exited']]
num_cols= train_df[['CreditScore', 'Age','Balance','EstimatedSalary']]

## Categorical Count Plots

In [None]:
for col in cat_cols:
    sns.countplot(df,x=df[col])
    plt.show()

## Histograms of Numerical Columns:

In [None]:
for col in num_cols:
    sns.distplot(df[col],kde=True)
    plt.axvline(df[col].mean(),color='r', label='Mean')
    plt.axvline(df[col].median(),color='y', linestyle='--',label='Median')
    plt.legend()
    plt.show()

## Sunburst Chart ( Understanding the Hierarchy)

In [None]:
import plotly.express as px
fig = px.sunburst(
    df,
    path=['Geography','Gender','NumOfProducts','IsActiveMember','Exited'], 
    color='Exited',color_discrete_map={'1':'gold', '0':'darkblue'},
    width=1200, height=1200
)
fig.show()

## Graphical Relationships between different columns

In [None]:
plt.figure(figsize=(5, 5))
s=sns.barplot(train_df,y='NumOfProducts',x='Exited',hue='Gender')
plt.figure(figsize=(5, 5))
s=sns.barplot(train_df,y='NumOfProducts',x='Exited',hue='Geography', palette='Greens')
plt.figure(figsize=(5, 5))
s=sns.barplot(train_df,y='Age',x='Exited',hue='Geography',palette='Blues')
plt.figure(figsize=(5, 5))
s=sns.barplot(train_df,y='Tenure',x='Exited',hue='Geography',palette='Reds')
plt.figure(figsize=(5, 5))
s=sns.scatterplot(train_df,y='Balance',x='Geography',hue='Exited',palette='crest')

# Preparing for Model:

## Evaluating the Correlations:

In [None]:
test_df

In [None]:
def cluster(X):
    from sklearn import cluster
    agglo = cluster.KMeans(n_clusters=5,random_state=0, n_init="auto")
    agglo.fit(X)
    return ((agglo.labels_+1)/5)

X=train_df[['Age','Gender']]
train_df['Cluster-1']=(cluster(X)+1)/5  
X=train_df[['Gender', 'Age', 'Tenure']]
train_df['Cluster-2']=(cluster(X)+1)/5  
X=train_df[['Age','Tenure']]

X=test_df[['Age','Gender']]
test_df['Cluster-1']=(cluster(X)+1)/5  
X=test_df[['Gender', 'Age', 'Tenure']]
test_df['Cluster-2']=(cluster(X)+1)/5  
X=test_df[['Age','Tenure']]
test_df

In [None]:
X=train_df[['Age']]
train_df['Agebins']=(cluster(X)+1)/5  
X=train_df[['Balance']]
train_df['Balancebins']=(cluster(X)+1)/5 
X=train_df[['EstimatedSalary']]
train_df['ESbins']=(cluster(X)+1)/5
X=test_df[['Age']]
test_df['Agebins']=(cluster(X)+1)/5  
X=test_df[['Balance']]
test_df['Balancebins']=(cluster(X)+1)/5 
X=test_df[['EstimatedSalary']]
test_df['ESbins']=(cluster(X)+1)/5


# Correlation between the Features

In [None]:

corr = train_df.corr()
# plot the heatmap
plt.figure(figsize=(20, 20))
s=sns.heatmap(corr,annot=True, cmap='crest')
corr = test_df.corr()
# plot the heatmap
plt.figure(figsize=(20, 20))
s=sns.heatmap(corr,annot=True, cmap='Blues')

In [None]:
train_df

In [49]:
test_df=Onehot(test_df)
train_df['Exited']=train_df['Exited'].astype('category')

Index(['Geography_France', 'Geography_Germany', 'Geography_Spain',
       'Gender_Female', 'Gender_Male', 'Tenure_0', 'Tenure_1', 'Tenure_2',
       'Tenure_3', 'Tenure_4', 'Tenure_5', 'Tenure_6', 'Tenure_7', 'Tenure_8',
       'Tenure_9', 'Tenure_10', 'NumOfProducts_1', 'NumOfProducts_2',
       'NumOfProducts_3', 'NumOfProducts_4', 'HasCrCard_0.0', 'HasCrCard_1.0',
       'IsActiveMember_0.0', 'IsActiveMember_1.0'],
      dtype='object')


TypeError: concat() takes 1 positional argument but 2 positional arguments (and 1 keyword-only argument) were given

# Creating Test and Train Datasets:

In [None]:
X=train_df.drop(columns=['Exited'])
y=train_df[['Exited']].astype('bool')
X=Onehot(X)
print(X.describe())
print(y.describe())

## Scaling the Input

In [None]:
def scale(X):
    from sklearn import preprocessing
    scaled=preprocessing.StandardScaler()
    scaler=scaled.fit(X)
    X=scaler.transform(X)
    return X

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(scale(X),y,train_size=0.99, random_state=42)
X.keys()
test_df=scale(test_df)

# Creation, Training and Testing of Models (Catboost, LightGBM Classifier)

In [None]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(logging_level='Silent')
model.fit(scale(X),y)
model.score(X_test,y=y_test.astype('string'))

In [None]:
from lightgbm import LGBMClassifier 

mod = LGBMClassifier(metric='auc')
mod.fit(X_train, y_train) 
mod.score(X_test,y_test)


In [None]:
model.predict_proba(scale(test_df))[:,1]

# Creating Submission File

In [None]:
#y=np.round_(mod.predict_proba(scale(test_df)),decimals=1)
y=np.round_((0.5* model.predict_proba(scale(test_df))[:,1])+(0.5*mod.predict_proba(scale(test_df))[:,1]),decimals=1)
test['Exited']=y
test.head()

In [None]:
submission=test[['id','Exited']]
submission.to_csv('submission.csv', index=False)

In [None]:
submission