# XGBoost Classifier

## Part 1 - Data Preprocessing

### Importing the dataset

In [218]:
import pandas as pd
import numpy as np
df = pd.read_csv('churn_modelling.csv')

In [219]:
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Checking missing data

In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


 ### Feature Selection

In [221]:
df.drop(columns=['CustomerId','Surname'],axis=1,inplace=True)

In [222]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Feature Engineering

### Check Duplicated

In [223]:
df.duplicated().sum()

np.int64(0)

### Check Null Values

In [224]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [225]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Handling categorical variables

### Gender

In [226]:
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [227]:
df['Gender'] = df['Gender'].apply(lambda x : 1 if x == 'Male' else 0).astype(int)

In [228]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


### Geography

In [229]:
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [230]:
geography_dummies = pd.get_dummies(df['Geography'],drop_first=True).astype(int)

In [231]:
geography_dummies

Unnamed: 0,Germany,Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1
...,...,...
9995,0,0
9996,0,0
9997,0,0
9998,1,0


In [232]:
df = pd.concat([geography_dummies,df],axis=1)

In [233]:
df.head()

Unnamed: 0,Germany,Spain,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [234]:
df.drop(columns=['Geography'],axis=1,inplace=True)

In [235]:
df.head()

Unnamed: 0,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0,1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,0,0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,0,0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0,1,850,0,43,2,125510.82,1,1,1,79084.1,0


### Getting the inputs and output

In [236]:
X = df.iloc[:,:-1].values

In [237]:
X

array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [0.0000000e+00, 0.0000000e+00, 5.0200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [0.0000000e+00, 0.0000000e+00, 7.0900000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085580e+04],
       [1.0000000e+00, 0.0000000e+00, 7.7200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888520e+04],
       [0.0000000e+00, 0.0000000e+00, 7.9200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190780e+04]], shape=(10000, 11))

In [238]:
y = df.iloc[:,-1].values

In [239]:
y

array([1, 0, 1, ..., 1, 1, 0], shape=(10000,))

### Creating the Training Set and the Test Set

In [240]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Part 2 - Building and training the model

### Building the model

### Training the model

### Inference

### Predicting the result of a single observation

**Homework**

Use our model to predict if the customer with the following informations will leave the bank:

Geography: France

Credit Score: 600

Gender: Male

Age: 40 years old

Tenure: 3 years

Balance: \$ 60000

Number of Products: 2

Does this customer have a credit card? Yes

Is this customer an Active Member: Yes

Estimated Salary: \$ 50000

So, should we say goodbye to that customer?

## Part 3: Evaluating the model

### Accuracy

In [241]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

### Confusion Matrix

### Classfication Report

### k-Fold Cross Validation

## Train two models

In [242]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import xgboost
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
models = {
    "LogReg" : LogisticRegression(),
    "XGBoost": xgboost.XGBClassifier()
}
fitted = {name:mdl.fit(X_train,y_train) for name,mdl in models.items()}

### Evaluate

In [243]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [244]:

def evalute(clf,Xtr,ytr,Xte,yte):
     yhat_tr = clf.predict(Xtr)
     yhat_te = clf.predict(Xte) 
     metrics = {
        "acc_train": accuracy_score(ytr, yhat_tr),
        "acc_test":  accuracy_score(yte, yhat_te),
        "prec_test": precision_score(yte, yhat_te, zero_division=0),
        "rec_test":  recall_score(yte, yhat_te, zero_division=0),
        "f1_test":   f1_score(yte, yhat_te, zero_division=0)
    }
     return  metrics


results = {}
for name,clf in fitted.items():
     print(name)
     m = evalute(clf, X_train, y_train, X_test, y_test)
     results[name] = m
pd.DataFrame(results).T.sort_values("f1_test", ascending=False)     

LogReg
XGBoost


Unnamed: 0,acc_train,acc_test,prec_test,rec_test,f1_test
LogReg,0.8096,0.204,0.203206,0.996071,0.33755
XGBoost,0.961333,0.8164,0.857143,0.117878,0.207254
