# Catboost Classifier

## Part 1 - Data Preprocessing

### Importing the dataset

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('churn_modelling.csv')

In [2]:
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Checking missing data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


 ### Feature Selection

In [4]:
df.drop(columns=['CustomerId','Surname'],axis=1,inplace=True)

In [5]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Feature Engineering

### Check Duplicated

In [6]:
df.duplicated().sum()

np.int64(0)

### Check Null Values

In [7]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [8]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Handling categorical variables

### Gender

In [9]:
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [10]:
df['Gender'] = df['Gender'].apply(lambda x : 1 if x == 'Male' else 0).astype(int)

In [11]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


### Geography

In [12]:
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [13]:
geography_dummies = pd.get_dummies(df['Geography'],drop_first=True).astype(int)

In [14]:
geography_dummies

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,1,0


In [15]:
df = pd.concat([geography_dummies,df],axis=1)

In [16]:
df.head()

Unnamed: 0,Germany,Spain,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [17]:
df.drop(columns=['Geography'],axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,0,43,2,125510.82,1,1,1,79084.1,0


### Getting the inputs and output

In [19]:
X = df.iloc[:,:-1].values

In [20]:
X

array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [0.0000000e+00, 0.0000000e+00, 5.0200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [0.0000000e+00, 0.0000000e+00, 7.0900000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085580e+04],
       [1.0000000e+00, 0.0000000e+00, 7.7200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888520e+04],
       [0.0000000e+00, 0.0000000e+00, 7.9200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190780e+04]], shape=(10000, 11))

In [21]:
y = df.iloc[:,-1].values

In [22]:
y

array([1, 0, 1, ..., 1, 1, 0], shape=(10000,))

### Creating the Training Set and the Test Set

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Part 2 - Building and training the model

### Building the model

In [24]:
import catboost as cb
model = cb.CatBoostClassifier()


### Training the model

In [25]:
model.fit(X_train,y_train)

Learning rate set to 0.024355
0:	learn: 0.6733475	total: 136ms	remaining: 2m 16s
1:	learn: 0.6543691	total: 140ms	remaining: 1m 9s
2:	learn: 0.6366670	total: 145ms	remaining: 48s
3:	learn: 0.6203603	total: 149ms	remaining: 37.1s
4:	learn: 0.6049544	total: 153ms	remaining: 30.4s
5:	learn: 0.5914731	total: 158ms	remaining: 26.1s
6:	learn: 0.5790907	total: 162ms	remaining: 23s
7:	learn: 0.5672645	total: 167ms	remaining: 20.7s
8:	learn: 0.5566303	total: 171ms	remaining: 18.8s
9:	learn: 0.5451642	total: 176ms	remaining: 17.4s
10:	learn: 0.5336192	total: 180ms	remaining: 16.2s
11:	learn: 0.5252851	total: 184ms	remaining: 15.2s
12:	learn: 0.5151217	total: 187ms	remaining: 14.2s
13:	learn: 0.5067197	total: 191ms	remaining: 13.4s
14:	learn: 0.4980974	total: 194ms	remaining: 12.7s
15:	learn: 0.4889298	total: 197ms	remaining: 12.1s
16:	learn: 0.4825595	total: 199ms	remaining: 11.5s
17:	learn: 0.4750413	total: 203ms	remaining: 11.1s
18:	learn: 0.4683393	total: 206ms	remaining: 10.7s
19:	learn: 0.4

<catboost.core.CatBoostClassifier at 0x2aacefddd30>

### Inference

In [26]:
y_pred = model.predict(X_test)

In [27]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(2500,))

In [28]:
y_test

array([0, 1, 0, ..., 0, 0, 0], shape=(2500,))

### Predicting the result of a single observation

**Homework**

Use our model to predict if the customer with the following informations will leave the bank:

Geography: France

Credit Score: 600

Gender: Male

Age: 40 years old

Tenure: 3 years

Balance: \$ 60000

Number of Products: 2

Does this customer have a credit card? Yes

Is this customer an Active Member: Yes

Estimated Salary: \$ 50000

So, should we say goodbye to that customer?

In [29]:
model.predict([[0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])

array([0])

## Part 3: Evaluating the model

### Accuracy

In [30]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [31]:
accuracy_score(y_test,y_pred)

0.866

### Confusion Matrix

In [32]:
confusion_matrix(y_test,y_pred)

array([[1897,   94],
       [ 241,  268]])

### Classfication Report

In [33]:
cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1991
           1       0.74      0.53      0.62       509

    accuracy                           0.87      2500
   macro avg       0.81      0.74      0.77      2500
weighted avg       0.86      0.87      0.86      2500



### k-Fold Cross Validation

In [34]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=model,X=X,y=y,scoring='accuracy',cv=10)
print("Average Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Learning rate set to 0.026327
0:	learn: 0.6703902	total: 3.48ms	remaining: 3.48s
1:	learn: 0.6499984	total: 6.7ms	remaining: 3.34s
2:	learn: 0.6310487	total: 10.4ms	remaining: 3.45s
3:	learn: 0.6135907	total: 14.3ms	remaining: 3.56s
4:	learn: 0.5971020	total: 17.7ms	remaining: 3.52s
5:	learn: 0.5822385	total: 21.5ms	remaining: 3.56s
6:	learn: 0.5692620	total: 24.2ms	remaining: 3.43s
7:	learn: 0.5569341	total: 28ms	remaining: 3.47s
8:	learn: 0.5461525	total: 31.5ms	remaining: 3.47s
9:	learn: 0.5342983	total: 35.1ms	remaining: 3.48s
10:	learn: 0.5224825	total: 38.9ms	remaining: 3.5s
11:	learn: 0.5139173	total: 42.6ms	remaining: 3.51s
12:	learn: 0.5036763	total: 46.6ms	remaining: 3.53s
13:	learn: 0.4934586	total: 50.4ms	remaining: 3.55s
14:	learn: 0.4839903	total: 54.3ms	remaining: 3.57s
15:	learn: 0.4768630	total: 58.4ms	remaining: 3.59s
16:	learn: 0.4707887	total: 62.3ms	remaining: 3.6s
17:	learn: 0.4633973	total: 66ms	remaining: 3.6s
18:	learn: 0.4566957	total: 69.8ms	remaining: 3.6s
1

### GridSearch

In [35]:
from sklearn.model_selection import GridSearchCV
# boosting_type: str = "gbdt", 
# num_leaves: int = 31, 
# max_depth: int = -1, 
# learning_rate: float = 0.1, 
# n_estimators: int = 100,
parameters  = [{'num_leaves':[29,30,31,32,33],'learning_rate':[0.08,0.09,0.1,0.11,0.12],'n_estimators':[80,90,100,110,120]}]
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='accuracy',
    cv=10)
grid_search.fit(X,y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

0:	learn: 0.6528321	total: 4.21ms	remaining: 332ms
1:	learn: 0.6189263	total: 7.72ms	remaining: 301ms
2:	learn: 0.5893487	total: 11.7ms	remaining: 300ms
3:	learn: 0.5639281	total: 15.1ms	remaining: 286ms
4:	learn: 0.5404746	total: 18.3ms	remaining: 275ms
5:	learn: 0.5195364	total: 22.6ms	remaining: 278ms
6:	learn: 0.5010493	total: 26.8ms	remaining: 279ms
7:	learn: 0.4864554	total: 31ms	remaining: 279ms
8:	learn: 0.4725516	total: 35ms	remaining: 276ms
9:	learn: 0.4593704	total: 39.3ms	remaining: 275ms
10:	learn: 0.4479743	total: 42.5ms	remaining: 267ms
11:	learn: 0.4374089	total: 45.5ms	remaining: 258ms
12:	learn: 0.4286671	total: 48.6ms	remaining: 251ms
13:	learn: 0.4196287	total: 51.7ms	remaining: 244ms
14:	learn: 0.4123517	total: 54.9ms	remaining: 238ms
15:	learn: 0.4054304	total: 59ms	remaining: 236ms
16:	learn: 0.3993578	total: 61.4ms	remaining: 228ms
17:	learn: 0.3944705	total: 64.2ms	remaining: 221ms
18:	learn: 0.3898928	total: 67.1ms	remaining: 216ms
19:	learn: 0.3846591	total: 

1000 fits failed out of a total of 1250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ganesh10\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ganesh10\AppData\Roaming\Python\Python312\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\ganesh10\AppData\Roaming\Python\Python312\site-packages\catboost\core.py", line 2395, in _fit
    train_params = s

0:	learn: 0.6335097	total: 3.46ms	remaining: 308ms
1:	learn: 0.5843932	total: 6.07ms	remaining: 267ms
2:	learn: 0.5469787	total: 8.69ms	remaining: 252ms
3:	learn: 0.5150059	total: 11.2ms	remaining: 240ms
4:	learn: 0.4891097	total: 13.7ms	remaining: 232ms
5:	learn: 0.4660763	total: 16.4ms	remaining: 230ms
6:	learn: 0.4477879	total: 19.6ms	remaining: 233ms
7:	learn: 0.4334872	total: 22.7ms	remaining: 232ms
8:	learn: 0.4209810	total: 25.5ms	remaining: 230ms
9:	learn: 0.4091561	total: 28.3ms	remaining: 226ms
10:	learn: 0.3997344	total: 31.2ms	remaining: 224ms
11:	learn: 0.3904783	total: 34.3ms	remaining: 223ms
12:	learn: 0.3832448	total: 37.4ms	remaining: 221ms
13:	learn: 0.3766667	total: 40.5ms	remaining: 220ms
14:	learn: 0.3709480	total: 43.8ms	remaining: 219ms
15:	learn: 0.3661770	total: 47.1ms	remaining: 218ms
16:	learn: 0.3618400	total: 50.5ms	remaining: 217ms
17:	learn: 0.3589316	total: 53.6ms	remaining: 214ms
18:	learn: 0.3565182	total: 56.6ms	remaining: 211ms
19:	learn: 0.3541030	t

In [36]:
print("Best R-Squared: {:.2f} %".format(best_accuracy * 100))
print("Best Parameters:",best_parameters)

Best R-Squared: 86.50 %
Best Parameters: {'learning_rate': 0.12, 'n_estimators': 90, 'num_leaves': 31}
