# **Importing libraries and files**

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedStratifiedKFold


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv
/kaggle/input/bank-customer-churn-dataset/Bank Customer Churn Prediction.csv
/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv
/kaggle/input/bank-data-customer-data/Bank Customer Churn Prediction.csv


# **1. Import data**

In [2]:
# Competition training data (playground-series-s4e1).
df_train = pd.read_csv("/kaggle/input/playground-series-s4e1/train.csv")
# Separately published bank-churn dataset; it is appended to df_train further down.
original_data = pd.read_csv("/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv")
# Competition test data (same columns as train, minus the "Exited" target).
df_test = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv")
# Submission template with the columns "id" and "Exited".
submission = pd.read_csv("/kaggle/input/playground-series-s4e1/sample_submission.csv")

# **2. Exploratory Data Analysis (EDA)**

In [3]:
df_train.shape

(165034, 14)

In [4]:
df_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [5]:
original_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [6]:
df_test.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [7]:
submission.head()

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5


In [8]:
df_train.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [9]:
df_train.duplicated().sum()

0

In [10]:
original_data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [11]:
original_data.duplicated().sum()

2

In [12]:
# "id" and "RowNumber" are the only columns the two frames do not share;
# remove them so both frames have identical columns before they are stacked.
df_train = df_train.drop(columns="id")
original_data = original_data.drop(columns="RowNumber")

In [13]:
original_data.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [14]:
# Stack the original dataset underneath the competition data.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# sources keep their own row labels, which overlap and make any later
# label-based alignment ambiguous.
df_train = pd.concat([df_train, original_data], axis=0, ignore_index=True)

In [15]:
df_train.shape

(175036, 13)

In [16]:
df_train.duplicated().sum()

2

In [17]:
df_train.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

***
## **We can see that we have some rows that are duplicated and we have some rows that have missing values**

## **We drop the duplicated rows so that every row is unique. The missing values could be imputed, but since only a negligible number of rows are affected we simply drop those rows as well.**
***

In [18]:
# Remove rows that are exact copies of an earlier row (all columns equal).
df_train = df_train.drop_duplicates()

In [19]:
df_train.duplicated().sum()

0

In [20]:
# Drop every row that contains at least one missing value.
df_train = df_train.dropna()

In [21]:
df_train.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

## **Here we will do some TF-IDF vectorization for the surnames**

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
import gc


# Surnames are single names rather than sentences of regular English, so we do
# not apply any tokenisation or preprocessing of our own: the text is returned as-is.
def dummy(text):
    """Return ``text`` unchanged.

    Identity hook passed to ``TfidfVectorizer`` as both ``tokenizer`` and
    ``preprocessor``.
    """
    return text

# TF-IDF features for the surnames.
#
# `dummy` is used as both preprocessor and tokenizer, which switches off
# scikit-learn's own preprocessing stage. `strip_accents` is therefore not
# passed: with a custom preprocessor it would be ignored and only suggest
# a normalisation that never happens.
# NOTE(review): because the "tokenizer" hands back the raw string, the word
# n-gram builder presumably walks it character by character, i.e. the features
# are 3- to 5-character sequences -- confirm on a small sample.
vectorizer = TfidfVectorizer(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    max_features=1000,
)

# A single fit learns the vocabulary (capped at max_features) together with the
# IDF weights and returns the training matrix. Re-fitting a second vectorizer
# seeded with that vocabulary on the same data yields the same matrix, so it is
# not needed.
train_surnames = vectorizer.fit_transform(df_train["Surname"])
test_surnames = vectorizer.transform(df_test["Surname"])

# Learned n-gram -> column index mapping, kept for inspection.
vocab = vectorizer.vocabulary_

# we free the space by removing reference to the object vectorizer and use garbage collector
# to remove the unused space from the memory
del vectorizer
gc.collect()

0

### **The TF-IDF will return a huge amount of values, so we do PCA to reduce the space to 10 only**

In [23]:
from sklearn.decomposition import PCA

# Reduce the sparse TF-IDF surname matrix to 10 dense components.
# random_state pins the result for the case where PCA selects a randomized
# solver; without it the components could differ from run to run.
pca = PCA(n_components=10, random_state=42)

# PCA needs dense input, hence .toarray(). The projection is fitted on the
# training surnames only and then applied unchanged to the test surnames.
tfidf_train_pca = pca.fit_transform(train_surnames.toarray())
tfidf_test_pca = pca.transform(test_surnames.toarray())

pca_columns = [f'Surname_PCA_{i+1}' for i in range(10)]
df_train_pca = pd.DataFrame(tfidf_train_pca, columns=pca_columns)
df_test_pca = pd.DataFrame(tfidf_test_pca, columns=pca_columns)

In [24]:
df_train_pca.shape

(175030, 10)

In [25]:
df_train.shape

(175030, 13)

In [26]:
# Give both frames a clean 0..n-1 index so the column-wise concat pairs the
# rows by position, then attach the surname components to the training data.
df_train = pd.concat(
    [df_train.reset_index(drop=True), df_train_pca.reset_index(drop=True)],
    axis="columns",
)

In [27]:
df_train.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,...,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,...,0.428648,0.055544,0.043289,0.003998,0.005236,0.008662,0.01816,0.006281,0.011951,-0.000572
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,...,0.105775,0.002193,-0.00823,-0.002709,-0.01663,-0.022713,0.059528,0.025506,0.012872,-0.005638
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,...,-0.03608,-0.022063,-0.041293,-0.017429,-0.025288,-0.045381,-0.020503,-0.018278,0.009271,-0.01752
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,...,-0.035993,-0.027718,-0.04274,-0.007404,-0.036893,-0.040253,-0.020049,-0.061541,-0.094841,0.187114
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,...,-0.028679,-0.031911,-0.02648,-0.012946,-0.040886,-0.05627,-0.057078,-0.039937,-0.11576,-0.184608


In [28]:
# Same positional join for the test data: reset both indexes, then attach the
# surname components column-wise.
df_test = pd.concat(
    [df_test.reset_index(drop=True), df_test_pca.reset_index(drop=True)],
    axis="columns",
)

In [29]:
# The raw surname text is now represented by the Surname_PCA_* columns.
df_train = df_train.drop(columns="Surname")
df_test = df_test.drop(columns="Surname")

In [30]:
df_train.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15674932,668,France,Male,33.0,3,0.0,2,1.0,0.0,...,0.428648,0.055544,0.043289,0.003998,0.005236,0.008662,0.01816,0.006281,0.011951,-0.000572
1,15749177,627,France,Male,33.0,1,0.0,2,1.0,1.0,...,0.105775,0.002193,-0.00823,-0.002709,-0.01663,-0.022713,0.059528,0.025506,0.012872,-0.005638
2,15694510,678,France,Male,40.0,10,0.0,2,1.0,0.0,...,-0.03608,-0.022063,-0.041293,-0.017429,-0.025288,-0.045381,-0.020503,-0.018278,0.009271,-0.01752
3,15741417,581,France,Male,34.0,2,148882.54,1,1.0,1.0,...,-0.035993,-0.027718,-0.04274,-0.007404,-0.036893,-0.040253,-0.020049,-0.061541,-0.094841,0.187114
4,15766172,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,...,-0.028679,-0.031911,-0.02648,-0.012946,-0.040886,-0.05627,-0.057078,-0.039937,-0.11576,-0.184608


## **We get the numerical and categorical columns**

In [31]:
# Split the training columns by dtype: numeric ones vs. everything else.
numeric_data = df_train.select_dtypes(include="number")
categorical_data = df_train.select_dtypes(exclude="number")

In [32]:
numeric_data.head()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15674932,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,0.428648,0.055544,0.043289,0.003998,0.005236,0.008662,0.01816,0.006281,0.011951,-0.000572
1,15749177,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,0.105775,0.002193,-0.00823,-0.002709,-0.01663,-0.022713,0.059528,0.025506,0.012872,-0.005638
2,15694510,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,-0.03608,-0.022063,-0.041293,-0.017429,-0.025288,-0.045381,-0.020503,-0.018278,0.009271,-0.01752
3,15741417,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,-0.035993,-0.027718,-0.04274,-0.007404,-0.036893,-0.040253,-0.020049,-0.061541,-0.094841,0.187114
4,15766172,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,-0.028679,-0.031911,-0.02648,-0.012946,-0.040886,-0.05627,-0.057078,-0.039937,-0.11576,-0.184608


In [33]:
categorical_data.head()

Unnamed: 0,Geography,Gender
0,France,Male
1,France,Male
2,France,Male
3,France,Male
4,Spain,Male


### The raw surname is not a usable categorical feature. It has already been replaced by the TF-IDF/PCA components above and dropped from the data, so only Geography and Gender remain as categorical columns.

In [34]:
# categorical_data = categorical_data.drop("Surname", axis=1)
categorical_data.head()

Unnamed: 0,Geography,Gender
0,France,Male
1,France,Male
2,France,Male
3,France,Male
4,Spain,Male


In [35]:
# Pairwise correlations between all numeric columns, including the target "Exited".
corr = numeric_data.corr()
# Render the table with a blue-to-red colour gradient over the cell values.
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
CustomerId,1.0,0.007299,0.003073,-0.002053,-0.008836,0.005178,-0.005875,-0.002693,0.003875,-0.009723,-0.006285,-0.005907,0.004168,-0.004853,-0.003892,0.000355,0.011352,0.001201,-0.003072,0.002623
CreditScore,0.007299,1.0,-0.008883,0.00095,0.005592,0.011605,-0.002607,0.015358,-0.000791,-0.027241,-0.001342,-0.009071,0.003092,-0.002629,-0.003638,0.001398,0.002059,-0.001351,-0.002525,0.00784
Age,0.003073,-0.008883,1.0,-0.010786,0.063271,-0.097297,-0.012604,0.00896,-0.006685,0.336606,0.002023,0.000723,-0.004606,0.002126,-0.002372,0.014422,-0.012156,0.012003,0.002036,-0.020328
Tenure,-0.002053,0.00095,-0.010786,1.0,-0.009642,0.007707,0.006412,-0.006868,0.001455,-0.019244,0.000847,-0.004675,-0.000868,0.000661,-0.00133,-0.000162,-0.002055,-0.008537,0.001811,0.002455
Balance,-0.008836,0.005592,0.063271,-0.009642,1.0,-0.357267,-0.020303,-0.014139,0.004402,0.128389,0.045519,-0.017527,-0.00852,-0.035192,0.021193,0.003588,-0.057309,0.077363,-0.007349,-0.095425
NumOfProducts,0.005178,0.011605,-0.097297,0.007707,-0.357267,1.0,0.005612,0.037827,-0.002422,-0.204509,-0.02314,-0.00049,-0.004777,0.00737,-0.015378,-0.002483,0.025332,-0.031445,-0.002615,0.039751
HasCrCard,-0.005875,-0.002607,-0.012604,0.006412,-0.020303,0.005612,1.0,-0.020682,0.004936,-0.021122,0.004969,0.000383,0.011776,0.000307,0.015779,-0.012173,-0.002101,0.002865,0.002677,0.012293
IsActiveMember,-0.002693,0.015358,0.00896,-0.006868,-0.014139,0.037827,-0.020682,1.0,-0.008713,-0.207205,0.009365,-0.000355,0.002848,-0.001705,0.000396,-0.00456,-0.0032,0.000749,-0.000191,0.008128
EstimatedSalary,0.003875,-0.000791,-0.006685,0.001455,0.004402,-0.002422,0.004936,-0.008713,1.0,0.018598,0.003864,0.004861,0.010351,0.006038,0.002442,-0.006435,0.012362,-0.015597,0.00805,0.016859
Exited,-0.009723,-0.027241,0.336606,-0.019244,0.128389,-0.204509,-0.021122,-0.207205,0.018598,1.0,0.003498,0.003526,-0.009445,-0.000144,0.0008,0.009495,-0.017709,0.014038,0.011305,-0.028088


***
### We can see here that Age has the strongest correlation with churn (about +0.34, a moderate positive correlation): the older a customer is, the more likely they are to churn.

### We can also see moderate negative correlations (about -0.20 each) between churn and both NumOfProducts and IsActiveMember: active members are less likely to exit, and customers who hold more products at the bank are less likely to churn.

### From this we can see that these three variables (Age, NumOfProducts and IsActiveMember) are the most informative single features for predicting customer churn.
***

# **2. Data preprocessing**

***
### First we will deal with the categorical data, here we have five categorical variables: 
* Geography 
* Gender
* Tenure
* HasCrCard
* IsActiveMember

### The target value is practically categorical.
***

## **2.1. Label encoding for the categorical variables**

In [36]:
# df_train = df_train.drop("Surname", axis=1)

# Keep the test ids aside, then remove the column so the test frame has the
# same columns as the training features.
test_ids = df_test["id"]
df_test = df_test.drop(columns="id")

In [37]:
df_test.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15773898,586,France,Female,23.0,2,0.0,2,0.0,1.0,...,-0.024475,-0.025668,-0.04126,-0.006369,-0.029808,-0.027469,-0.003881,-0.034277,-0.041979,0.011074
1,15782418,683,France,Female,46.0,2,0.0,1,1.0,0.0,...,-0.023027,-0.015046,-0.021031,-0.001965,-0.00851,-0.007789,-0.002914,-0.011467,-0.009167,0.002774
2,15807120,656,France,Female,34.0,7,0.0,2,1.0,0.0,...,-0.019074,-0.012062,-0.016331,-0.001636,-0.005979,-0.005612,-0.001901,-0.007431,-0.006007,0.002427
3,15808905,681,France,Male,36.0,8,0.0,1,1.0,0.0,...,-0.026032,-0.017754,-0.025512,-0.003311,-0.012765,-0.00997,-0.003536,-0.01685,-0.011117,0.003933
4,15607314,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,...,-0.021392,-0.014037,-0.01972,-0.003233,-0.016113,0.014081,-0.001253,-0.005441,-0.005074,0.001711


In [38]:
df_train.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15674932,668,France,Male,33.0,3,0.0,2,1.0,0.0,...,0.428648,0.055544,0.043289,0.003998,0.005236,0.008662,0.01816,0.006281,0.011951,-0.000572
1,15749177,627,France,Male,33.0,1,0.0,2,1.0,1.0,...,0.105775,0.002193,-0.00823,-0.002709,-0.01663,-0.022713,0.059528,0.025506,0.012872,-0.005638
2,15694510,678,France,Male,40.0,10,0.0,2,1.0,0.0,...,-0.03608,-0.022063,-0.041293,-0.017429,-0.025288,-0.045381,-0.020503,-0.018278,0.009271,-0.01752
3,15741417,581,France,Male,34.0,2,148882.54,1,1.0,1.0,...,-0.035993,-0.027718,-0.04274,-0.007404,-0.036893,-0.040253,-0.020049,-0.061541,-0.094841,0.187114
4,15766172,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,...,-0.028679,-0.031911,-0.02648,-0.012946,-0.040886,-0.05627,-0.057078,-0.039937,-0.11576,-0.184608


In [39]:
categorical_features = ["Geography", "Gender", "Tenure", "HasCrCard", "IsActiveMember"]

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original values later (encoders[col].inverse_transform(...)).
encoders = {}

for cat_feat in categorical_features:
    # A fresh encoder per column: each is fitted on the training values only
    # and then applied to the test values. transform() raises if the test
    # data contains a value that was never seen in training.
    enc = LabelEncoder()
    df_train[cat_feat] = enc.fit_transform(df_train[cat_feat])
    df_test[cat_feat] = enc.transform(df_test[cat_feat])
    encoders[cat_feat] = enc


In [40]:
df_train.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15674932,668,0,1,33.0,3,0.0,2,1,0,...,0.428648,0.055544,0.043289,0.003998,0.005236,0.008662,0.01816,0.006281,0.011951,-0.000572
1,15749177,627,0,1,33.0,1,0.0,2,1,1,...,0.105775,0.002193,-0.00823,-0.002709,-0.01663,-0.022713,0.059528,0.025506,0.012872,-0.005638
2,15694510,678,0,1,40.0,10,0.0,2,1,0,...,-0.03608,-0.022063,-0.041293,-0.017429,-0.025288,-0.045381,-0.020503,-0.018278,0.009271,-0.01752
3,15741417,581,0,1,34.0,2,148882.54,1,1,1,...,-0.035993,-0.027718,-0.04274,-0.007404,-0.036893,-0.040253,-0.020049,-0.061541,-0.094841,0.187114
4,15766172,716,2,1,33.0,5,0.0,2,1,1,...,-0.028679,-0.031911,-0.02648,-0.012946,-0.040886,-0.05627,-0.057078,-0.039937,-0.11576,-0.184608


In [41]:
df_test.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_1,Surname_PCA_2,Surname_PCA_3,Surname_PCA_4,Surname_PCA_5,Surname_PCA_6,Surname_PCA_7,Surname_PCA_8,Surname_PCA_9,Surname_PCA_10
0,15773898,586,0,0,23.0,2,0.0,2,0,1,...,-0.024475,-0.025668,-0.04126,-0.006369,-0.029808,-0.027469,-0.003881,-0.034277,-0.041979,0.011074
1,15782418,683,0,0,46.0,2,0.0,1,1,0,...,-0.023027,-0.015046,-0.021031,-0.001965,-0.00851,-0.007789,-0.002914,-0.011467,-0.009167,0.002774
2,15807120,656,0,0,34.0,7,0.0,2,1,0,...,-0.019074,-0.012062,-0.016331,-0.001636,-0.005979,-0.005612,-0.001901,-0.007431,-0.006007,0.002427
3,15808905,681,0,1,36.0,8,0.0,1,1,0,...,-0.026032,-0.017754,-0.025512,-0.003311,-0.012765,-0.00997,-0.003536,-0.01685,-0.011117,0.003933
4,15607314,752,1,1,38.0,10,121263.62,1,1,0,...,-0.021392,-0.014037,-0.01972,-0.003233,-0.016113,0.014081,-0.001253,-0.005441,-0.005074,0.001711


### Now we will work on the numerical data. In this first version we just scale the numerical data using a simple scaler. Why do we need to scale it? As you can see, one variable can have values like 10000 while another variable has values like 0.38. A model may then treat the variable with 10000 as much more important than the variable with 0.38, which is not necessarily the case, so we center (or scale) the values to the same scale: the proportions within each variable stay the same, but the huge gap between variables is reduced around a fixed value.

### **UPDATE: I did this because I expected to use a neural network, but it turned out that I will not, so scaling is not necessary here.**

## **2.2. Adding more features**

In [42]:
# by https://www.kaggle.com/code/chinmayadatt/notebook-analysing-bank-churn-dataset
def add_new_features(df):
    """Add hand-crafted features to ``df`` and return it.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Geography, Gender, Age, Balance,
        EstimatedSalary, CreditScore, HasCrCard, IsActiveMember, Tenure and
        NumOfProducts, all numeric.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns Geo_Gender, AgeGroup, IsSenior,
        QualityOfBalance, Balance_to_Salary_Ratio, CreditScoreTier,
        IsActive_by_CreditCard, Products_Per_Tenure and Customer_Status added.
    """
    # NOTE(review): summing the two codes is not unique per (Geography, Gender)
    # pair -- e.g. 0 + 1 and 1 + 0 both give 11. Confirm this is intended.
    df['Geo_Gender'] = df['Geography'] + df['Gender'] + 10
    # Lower bound of the age decade (33 -> 30, 65 -> 60).
    df['AgeGroup'] = df['Age'] // 10 * 10
    df['IsSenior'] = (df['Age'] >= 60).astype(int)
    # labels=False makes pd.cut return the bin number directly
    # (0=VeryLow, 1=Low, 2=Medium, 3=High, 4=Highest). This replaces the
    # string labels + chained `replace(..., inplace=True)`, which pandas
    # warns about and which has no effect once copy-on-write is active.
    df['QualityOfBalance'] = pd.cut(
        df['Balance'], bins=[-1, 100, 1000, 10000, 50000, 1000000], labels=False
    )
    df['Balance_to_Salary_Ratio'] = df['Balance'] / df['EstimatedSalary']
    # Bin number again: 0=Low (<=650), 1=Medium (<=750), 2=High (<=850).
    df['CreditScoreTier'] = pd.cut(df['CreditScore'], bins=[0, 650, 750, 850], labels=False)
    df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']
    # Despite the column name this is Tenure divided by NumOfProducts.
    df['Products_Per_Tenure'] = df['Tenure'] / df['NumOfProducts']
    # 0 for a tenure below 2, otherwise 1.
    df['Customer_Status'] = (df['Tenure'] >= 2).astype(int)
    return df

In [43]:
# Add the same engineered columns to the training and the test data.
df_train = add_new_features(df_train)
df_test  = add_new_features(df_test)

In [44]:
df_train.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_10,Geo_Gender,AgeGroup,IsSenior,QualityOfBalance,Balance_to_Salary_Ratio,CreditScoreTier,IsActive_by_CreditCard,Products_Per_Tenure,Customer_Status
0,15674932,668,0,1,33.0,3,0.0,2,1,0,...,-0.000572,11,30.0,0,0,0.0,1,0,1.5,1
1,15749177,627,0,1,33.0,1,0.0,2,1,1,...,-0.005638,11,30.0,0,0,0.0,0,1,0.5,0
2,15694510,678,0,1,40.0,10,0.0,2,1,0,...,-0.01752,11,40.0,0,0,0.0,1,0,5.0,1
3,15741417,581,0,1,34.0,2,148882.54,1,1,1,...,0.187114,11,30.0,0,4,1.760655,0,1,2.0,1
4,15766172,716,2,1,33.0,5,0.0,2,1,1,...,-0.184608,13,30.0,0,0,0.0,1,1,2.5,1


In [45]:
# Make sure the two binned features are stored as plain integers in both frames.
binned_feature_dtypes = {'QualityOfBalance': int, 'CreditScoreTier': int}
df_train = df_train.astype(binned_feature_dtypes)
df_test = df_test.astype(binned_feature_dtypes)

In [46]:
df_train.describe()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_10,Geo_Gender,AgeGroup,IsSenior,QualityOfBalance,Balance_to_Salary_Ratio,CreditScoreTier,IsActive_by_CreditCard,Products_Per_Tenure,Customer_Status
count,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,...,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0,175030.0
mean,15691940.0,656.114506,0.654134,0.563355,38.171301,5.019951,56677.296954,1.553071,0.751186,0.498749,...,1.0798400000000001e-17,11.217488,33.619494,0.030098,1.866966,2.202492,0.654059,0.370182,3.66733,0.867366
std,71428.44,81.148932,0.817515,0.495971,8.969551,2.811144,62982.318837,0.549211,0.432327,0.5,...,0.09717174,0.951383,9.457993,0.170857,1.994369,92.678906,0.673838,0.482855,2.523826,0.33918
min,15565700.0,350.0,0.0,0.0,18.0,0.0,0.0,1.0,0.0,0.0,...,-0.4391534,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15632880.0,597.0,0.0,0.0,32.0,3.0,0.0,1.0,1.0,0.0,...,-0.008665263,11.0,30.0,0.0,0.0,0.0,0.0,0.0,1.666667,1.0
50%,15690170.0,659.0,0.0,1.0,37.0,5.0,0.0,2.0,1.0,0.0,...,0.002029303,11.0,30.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0
75%,15756660.0,710.0,1.0,1.0,42.0,7.0,120727.97,2.0,1.0,1.0,...,0.004825201,12.0,40.0,0.0,4.0,1.000001,1.0,1.0,5.0,1.0
max,15815690.0,850.0,2.0,1.0,92.0,10.0,250898.09,4.0,1.0,1.0,...,0.7521332,13.0,90.0,1.0,4.0,12863.7962,2.0,1.0,10.0,1.0


In [47]:
# numeric_data      = df_train.drop(["Geography", "Gender", "Tenure", "HasCrCard", "IsActiveMember", "Geo_Gender", "IsSenior", "QualityOfBalance", "CreditScoreTier", "IsActive_by_CreditCard", "Products_Per_Tenure", "Customer_Status", "Exited"], axis=1)
# numeric_data_test = df_test.drop(["Geography", "Gender", "Tenure", "HasCrCard", "IsActiveMember", "Geo_Gender", "IsSenior", "QualityOfBalance", "CreditScoreTier", "IsActive_by_CreditCard", "Products_Per_Tenure", "Customer_Status"], axis=1)

# numeric_data      = numeric_data.drop("CustomerId", axis=1)
# numeric_data_test = numeric_data_test.drop("CustomerId", axis=1)
# numeric_data.head()

In [48]:
# numeric_data.head()

In [49]:
# numeric_data_test.head()

In [50]:
# scaler = StandardScaler()

# scaled_numerical_data_train = scaler.fit_transform(numeric_data)
# scaled_numerical_data_test  = scaler.transform(numeric_data_test)    

In [51]:
# scaled_numerical_data_train[0]

In [52]:
# pca_columns = [f'Surname_PCA_{i+1}' for i in range(10)]
# features = ["CreditScore", "Age", "Balance", "NumOfProducts", "EstimatedSalary"]
# for i in pca_columns:
#     features.append(i)

# for i, feat in enumerate(features):
#     l = []
    
#     for j in range(len(scaled_numerical_data_train)):
#         l.append(scaled_numerical_data_train[j][i])
    
#     df_train[feat] = l
    
#     l = []
    
#     for j in range(len(scaled_numerical_data_test)):
#         l.append(scaled_numerical_data_test[j][i])
    
#     df_test[feat]  = l

In [53]:
df_train.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_10,Geo_Gender,AgeGroup,IsSenior,QualityOfBalance,Balance_to_Salary_Ratio,CreditScoreTier,IsActive_by_CreditCard,Products_Per_Tenure,Customer_Status
0,15674932,668,0,1,33.0,3,0.0,2,1,0,...,-0.000572,11,30.0,0,0,0.0,1,0,1.5,1
1,15749177,627,0,1,33.0,1,0.0,2,1,1,...,-0.005638,11,30.0,0,0,0.0,0,1,0.5,0
2,15694510,678,0,1,40.0,10,0.0,2,1,0,...,-0.01752,11,40.0,0,0,0.0,1,0,5.0,1
3,15741417,581,0,1,34.0,2,148882.54,1,1,1,...,0.187114,11,30.0,0,4,1.760655,0,1,2.0,1
4,15766172,716,2,1,33.0,5,0.0,2,1,1,...,-0.184608,13,30.0,0,0,0.0,1,1,2.5,1


In [54]:
df_test.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_PCA_10,Geo_Gender,AgeGroup,IsSenior,QualityOfBalance,Balance_to_Salary_Ratio,CreditScoreTier,IsActive_by_CreditCard,Products_Per_Tenure,Customer_Status
0,15773898,586,0,0,23.0,2,0.0,2,0,1,...,0.011074,10,20.0,0,0,0.0,0,0,1.0,1
1,15782418,683,0,0,46.0,2,0.0,1,1,0,...,0.002774,10,40.0,0,0,0.0,1,0,2.0,1
2,15807120,656,0,0,34.0,7,0.0,2,1,0,...,0.002427,10,30.0,0,0,0.0,1,0,3.5,1
3,15808905,681,0,1,36.0,8,0.0,1,1,0,...,0.003933,11,30.0,0,0,0.0,1,0,8.0,1
4,15607314,752,1,1,38.0,10,121263.62,1,1,0,...,0.001711,12,30.0,0,4,0.869703,2,0,10.0,1


# **3. Modeling**

### **Alright, now that our data is ready to go, we need to make the best model to get the best result :)**

## **3.1. First we separate the features from the target (the train/validation splitting itself is done by the cross-validation below)**

In [55]:
# Target column on one side, every remaining column as a predictor on the other.
y = df_train["Exited"]
X = df_train.drop(columns="Exited")
# The test frame has no target column, so it is used as-is.
X_test = df_test

## **3.2. We make the models**

***
### We will go with the following models, then use a voting classifier:

* LightGBM
* XGboost
* CatBoost
***

In [56]:
# LightGBM classifier.
lgbm = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    metric="auc",
    random_state=42,
    colsample_bytree=0.56,
    # NOTE(review): LightGBM documents that row subsampling is only applied
    # when subsample_freq (bagging_freq) > 0; it is not set here, so this
    # value may have no effect -- confirm.
    subsample=0.35,
    learning_rate=0.05,
    max_depth=8,
    n_estimators=1000,
    num_leaves=140,
    reg_alpha=0.14,
    reg_lambda=0.85,
    verbosity=-1,
)

# XGBoost classifier.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric="auc",
    random_state=42,
    colsample_bytree=0.25,
    learning_rate=0.07,
    max_depth=8,
    n_estimators=800,
    reg_alpha=0.09,
    reg_lambda=0.70,
    min_child_weight=22,
    verbosity=0,
)

# CatBoost classifier.
# The parameter dict used to list 'iterations' twice (10000, then 1000); in a
# dict literal the last entry wins, so 1000 is the value that was actually
# used and is the one kept here.
cat = CatBoostClassifier(
    iterations=1000,
    objective='Logloss',
    eval_metric="AUC",
    # NOTE(review): no eval_set is passed when the VotingClassifier fits this
    # model, so early stopping presumably never triggers -- confirm.
    early_stopping_rounds=1000,
    bagging_temperature=0.1,
    colsample_bylevel=0.88,
    learning_rate=0.065,
    max_depth=7,
    l2_leaf_reg=1,
    min_data_in_leaf=25,
    random_strength=0.1,
    max_bin=100,
    verbose=0,
)

# Soft voting averages the predicted probabilities of the three models;
# LightGBM counts twice as much as each of the other two.
vote = VotingClassifier(
    estimators=[('lgbm', lgbm), ('xgb', xgb), ('cat', cat)],
    voting='soft',
    weights=[2, 1, 1],
)

# One vector of test-set probabilities per fold; they are averaged afterwards.
submission_predictions = []

# 5 stratified folds, repeated 3 times -> 15 train/validation rounds.
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Validation ROC AUC of every round.
aucs = []

# enumerate(..., start=1) supplies the 1-based fold number used in the log lines.
for ind, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print(f"============== Working on fold #{ind} ================")
    X_train_kf, X_val_kf = X.iloc[train_index], X.iloc[test_index]
    y_train_kf, y_val_kf = y.iloc[train_index], y.iloc[test_index]

    print()
    print("               Fitting the voting model...              ")
    # The ensemble is refitted from scratch on this fold's training part.
    vote.fit(X_train_kf, y_train_kf)

    print()
    print("            Predicting on the validation data           ")
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_val = vote.predict_proba(X_val_kf)[:, 1]

    # Score the held-out part of this fold.
    auc_val = roc_auc_score(y_val_kf, y_pred_val)
    print()
    print(f"           Validation ROC AUC Score: {auc_val}        ")

    aucs.append(auc_val)

    print()
    print("             Predicting on submission data...")
    # Every fold's model also predicts the competition test set.
    y_pred_test = vote.predict_proba(X_test)[:, 1]
    submission_predictions.append(y_pred_test)

    print()
    print(f"                 Fold #{ind} finished !                ")


               Fitting the voting model...              

            Predicting on the validation data           

           Validation ROC AUC Score: 0.8918608185354352        

             Predicting on submission data...

                 Fold #1 finished !                

               Fitting the voting model...              

            Predicting on the validation data           

           Validation ROC AUC Score: 0.8905346929543452        

             Predicting on submission data...

                 Fold #2 finished !                

               Fitting the voting model...              

            Predicting on the validation data           

           Validation ROC AUC Score: 0.8902450299890231        

             Predicting on submission data...

                 Fold #3 finished !                

               Fitting the voting model...              

            Predicting on the validation data           

           Validation ROC AUC Score: 0.8

In [57]:
print(f"Average ROC AUC Score: {sum(aucs) / len(aucs)}")

# Average the per-fold test predictions element-wise. The result is a plain
# NumPy array, so the assignment below is positional and does not depend on
# the index of `submission`.
avg_submission = np.mean(submission_predictions, axis=0)

# A length mismatch would mean the predictions do not belong to these ids.
if len(avg_submission) != len(submission):
    raise ValueError(
        f"Got {len(avg_submission)} predictions for {len(submission)} submission rows"
    )

submission["Exited"] = avg_submission

# Save submission to CSV
submission.to_csv("submission.csv", index=False)

submission.head()

Average ROC AUC Score: 0.8905557155320591


Unnamed: 0,id,Exited
0,165034,0.015738
1,165035,0.861558
2,165036,0.025062
3,165037,0.240374
4,165038,0.317409


In [58]:
## **Cool, I will be trying to improve it, and I will keep changing the public notebook to help everyone understand as much as I can. Make sure to upvote if you find this helpful, Thanks :P**
