# LightGBM Classifier

## Part 1 - Data Preprocessing

### Importing the dataset

In [72]:
import pandas as pd
import numpy as np
# Load the churn table; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='churn_modelling.csv')

In [73]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Checking missing data

In [74]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


### Feature Selection

In [75]:
# Drop the CustomerId and Surname columns so they are not used as model inputs.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
# `axis=1` was redundant because `columns=` already names the axis.
df = df.drop(columns=['CustomerId', 'Surname'], errors='ignore')

In [76]:
# Confirm the identifier columns are no longer present.
df.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Feature Engineering

### Check Duplicates

In [77]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated(keep='first').sum()

np.int64(0)

### Check Null Values

In [78]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [79]:
# Re-display the first five rows before encoding the categorical columns.
df.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Handling categorical variables

### Gender

In [80]:
# List the distinct Gender labels in order of first appearance.
pd.unique(df['Gender'])

array(['Female', 'Male'], dtype=object)

In [81]:
# Binary-encode Gender: Male -> 1, anything else -> 0.
# The dtype guard makes the cell idempotent: once the column is numeric, a re-run
# leaves it alone (the unguarded version would silently overwrite every value
# with 0, because neither 0 nor 1 equals 'Male').
# A vectorised comparison replaces the row-by-row apply/lambda.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].eq('Male').astype(int)

In [82]:
# Check that Gender now holds 0/1 integers.
df.head(n=5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


### Geography

In [83]:
# List the distinct Geography labels in order of first appearance.
pd.unique(df['Geography'])

array(['France', 'Spain', 'Germany'], dtype=object)

In [84]:
# One-hot encode Geography as integer columns; drop_first removes the first
# category so the remaining indicators are not perfectly collinear.
geography_dummies = pd.get_dummies(df['Geography'], drop_first=True, dtype=int)

In [85]:
# Display the encoded Geography indicator columns (pandas truncates the long frame).
geography_dummies

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,1,0


In [86]:
# Prepend the Geography indicator columns to the frame. Any indicator columns
# already present in `df` (from an earlier run of this cell) are dropped first,
# so re-executing the cell cannot create duplicate Germany/Spain columns.
df = pd.concat(
    [geography_dummies, df.drop(columns=geography_dummies.columns, errors='ignore')],
    axis=1,
)

In [87]:
# Verify the indicator columns were added at the front of the frame.
df.head(n=5)

Unnamed: 0,Germany,Spain,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [88]:
# Remove the original text column now that its indicator columns are in place.
# Rebinding `df` with errors='ignore' (instead of inplace=True) makes a re-run a
# no-op rather than a KeyError; `axis=1` was redundant alongside `columns=`.
df = df.drop(columns=['Geography'], errors='ignore')

In [89]:
# Final check of the fully numeric frame before splitting features and target.
df.head(n=5)

Unnamed: 0,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,0,43,2,125510.82,1,1,1,79084.1,0


### Getting the inputs and output

In [90]:
# Feature matrix: every column except the target. Selecting by name rather than
# by position keeps this correct even if the column order changes; with `Exited`
# as the last column the result is identical to slicing off the final column.
X = df.drop(columns=['Exited']).to_numpy()

In [91]:
# Inspect the feature matrix built in the previous cell.
X

array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [0.0000000e+00, 0.0000000e+00, 5.0200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [0.0000000e+00, 0.0000000e+00, 7.0900000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085580e+04],
       [1.0000000e+00, 0.0000000e+00, 7.7200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888520e+04],
       [0.0000000e+00, 0.0000000e+00, 7.9200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190780e+04]], shape=(10000, 11))

In [92]:
# Target vector: the `Exited` flag, selected by name so it does not depend on
# the column being last in the frame.
y = df['Exited'].to_numpy()

In [93]:
# Inspect the target vector built in the previous cell.
y

array([1, 0, 1, ..., 1, 1, 0], shape=(10000,))

### Creating the Training Set and the Test Set

In [94]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Part 2 - Building and training the model

### Building the model

In [None]:
import lightgbm as lg

# LightGBM gradient-boosted classifier with library defaults. random_state is
# pinned (same seed as the train/test split) so results stay repeatable if any
# randomised option such as row or feature subsampling is enabled later.
model = lg.LGBMClassifier(random_state=0)


### Training the model

In [96]:
# Fit the classifier on the training split only.
# NOTE(review): the output saved with this cell shows CatBoost training logs and a
# CatBoostClassifier repr, not LightGBM — re-run the notebook to refresh the outputs.
model.fit(X=X_train, y=y_train)

Learning rate set to 0.024355
0:	learn: 0.6733475	total: 149ms	remaining: 2m 28s
1:	learn: 0.6543691	total: 155ms	remaining: 1m 17s
2:	learn: 0.6366670	total: 163ms	remaining: 54.2s
3:	learn: 0.6203603	total: 171ms	remaining: 42.7s
4:	learn: 0.6049544	total: 178ms	remaining: 35.5s
5:	learn: 0.5914731	total: 184ms	remaining: 30.5s
6:	learn: 0.5790907	total: 187ms	remaining: 26.5s
7:	learn: 0.5672645	total: 192ms	remaining: 23.8s
8:	learn: 0.5566303	total: 195ms	remaining: 21.5s
9:	learn: 0.5451642	total: 199ms	remaining: 19.7s
10:	learn: 0.5336192	total: 203ms	remaining: 18.3s
11:	learn: 0.5252851	total: 207ms	remaining: 17s
12:	learn: 0.5151217	total: 211ms	remaining: 16s
13:	learn: 0.5067197	total: 214ms	remaining: 15.1s
14:	learn: 0.4980974	total: 218ms	remaining: 14.3s
15:	learn: 0.4889298	total: 221ms	remaining: 13.6s
16:	learn: 0.4825595	total: 225ms	remaining: 13s
17:	learn: 0.4750413	total: 228ms	remaining: 12.5s
18:	learn: 0.4683393	total: 231ms	remaining: 12s
19:	learn: 0.4618

<catboost.core.CatBoostClassifier at 0x1bac4f27470>

### Inference

In [97]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [98]:
# Inspect the predicted labels.
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(2500,))

In [99]:
# Inspect the true labels for comparison with the predictions above.
y_test

array([0, 1, 0, ..., 0, 0, 0], shape=(2500,))

### Predicting the result of a single observation

**Homework**

Use our model to predict whether the customer with the following information will leave the bank:

Geography: France

Credit Score: 600

Gender: Male

Age: 40 years old

Tenure: 3 years

Balance: \$ 60000

Number of Products: 2

Does this customer have a credit card? Yes

Is this customer an Active Member: Yes

Estimated Salary: \$ 50000

So, should we say goodbye to that customer?

In [100]:
# One row in the same column order as the training matrix:
#   Germany, Spain, CreditScore, Gender, Age, Tenure, Balance,
#   NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary
# France is the dropped baseline (Germany=0, Spain=0) and Male is encoded as 1.
single_customer = [[0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]]
model.predict(single_customer)

array([0])

## Part 3 - Evaluating the model

### Accuracy

In [101]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [102]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.866

### Confusion Matrix

In [103]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1897,   94],
       [ 241,  268]])

### Classification Report

In [104]:
# Per-class precision, recall and F1 on the test split; printed so the
# fixed-width text table keeps its alignment.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1991
           1       0.74      0.53      0.62       509

    accuracy                           0.87      2500
   macro avg       0.81      0.74      0.77      2500
weighted avg       0.86      0.87      0.86      2500



### k-Fold Cross Validation

In [105]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy, computed over the full X / y arrays.
accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Average Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

Learning rate set to 0.026327
0:	learn: 0.6703902	total: 5.48ms	remaining: 5.47s
1:	learn: 0.6499984	total: 10.2ms	remaining: 5.09s
2:	learn: 0.6310487	total: 14.8ms	remaining: 4.92s
3:	learn: 0.6135907	total: 19.5ms	remaining: 4.86s
4:	learn: 0.5971020	total: 25ms	remaining: 4.97s
5:	learn: 0.5822385	total: 29ms	remaining: 4.81s
6:	learn: 0.5692620	total: 32.2ms	remaining: 4.57s
7:	learn: 0.5569341	total: 36.4ms	remaining: 4.51s
8:	learn: 0.5461525	total: 40.5ms	remaining: 4.46s
9:	learn: 0.5342983	total: 44.1ms	remaining: 4.37s
10:	learn: 0.5224825	total: 47.7ms	remaining: 4.29s
11:	learn: 0.5139173	total: 51.7ms	remaining: 4.25s
12:	learn: 0.5036763	total: 55.7ms	remaining: 4.23s
13:	learn: 0.4934586	total: 59.7ms	remaining: 4.21s
14:	learn: 0.4839903	total: 63.6ms	remaining: 4.17s
15:	learn: 0.4768630	total: 67.1ms	remaining: 4.13s
16:	learn: 0.4707887	total: 70.9ms	remaining: 4.1s
17:	learn: 0.4633973	total: 74.1ms	remaining: 4.04s
18:	learn: 0.4566957	total: 77.4ms	remaining: 4s


### Grid Search

In [106]:
from sklearn.model_selection import GridSearchCV

# LGBMClassifier defaults, for reference:
#   boosting_type="gbdt", num_leaves=31, max_depth=-1,
#   learning_rate=0.1, n_estimators=100
# The grid explores a small neighbourhood around three of those defaults.
parameters = {
    'num_leaves': [29, 30, 31, 32, 33],
    'learning_rate': [0.08, 0.09, 0.1, 0.11, 0.12],
    'n_estimators': [80, 90, 100, 110, 120],
}

# 5 * 5 * 5 = 125 candidates x 10 folds = 1250 fits. n_jobs=-1 spreads them over
# all available cores; the scores are unchanged, only the wall-clock time drops.
# (The output saved with the serial version of this cell ends in a KeyboardInterrupt.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Mean cross-validated accuracy of the best candidate, and its parameter values.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

0:	learn: 0.6528321	total: 3.71ms	remaining: 293ms
1:	learn: 0.6189263	total: 6.3ms	remaining: 246ms
2:	learn: 0.5893487	total: 9.63ms	remaining: 247ms
3:	learn: 0.5639281	total: 12.7ms	remaining: 240ms
4:	learn: 0.5404746	total: 16.3ms	remaining: 244ms
5:	learn: 0.5195364	total: 19.7ms	remaining: 243ms
6:	learn: 0.5010493	total: 22.6ms	remaining: 236ms
7:	learn: 0.4864554	total: 25.9ms	remaining: 233ms
8:	learn: 0.4725516	total: 29.3ms	remaining: 231ms
9:	learn: 0.4593704	total: 32.2ms	remaining: 225ms
10:	learn: 0.4479743	total: 35.5ms	remaining: 223ms
11:	learn: 0.4374089	total: 38.2ms	remaining: 217ms
12:	learn: 0.4286671	total: 40.8ms	remaining: 210ms
13:	learn: 0.4196287	total: 43.4ms	remaining: 205ms
14:	learn: 0.4123517	total: 46.3ms	remaining: 200ms
15:	learn: 0.4054304	total: 48.9ms	remaining: 195ms
16:	learn: 0.3993578	total: 51.7ms	remaining: 192ms
17:	learn: 0.3944705	total: 54.3ms	remaining: 187ms
18:	learn: 0.3898928	total: 56.6ms	remaining: 182ms
19:	learn: 0.3846591	to

KeyboardInterrupt: 

In [None]:
# The grid search was scored with scoring='accuracy', so best_accuracy is a mean
# cross-validated accuracy, not an R-squared value; the label now says so.
print("Best Accuracy: {:.2f} %".format(best_accuracy * 100))
print("Best Parameters:", best_parameters)

Best R-Squared: 86.48 %
Best Parameters: {'learning_rate': 0.09, 'n_estimators': 90, 'num_leaves': 32}
