https://www.kaggle.com/code/pjarbas/pipeline-with-imbalanced-data-smote-histgboost

Kisi-kisi ujian GC5:
- ambil data dari BigQuery untuk membuat dataset_1 dan dataset_2.
- lakukan perbandingan antara dataset_1 dan dataset_2 dengan model LogisticRegression.
- Pilih hasil yang terbaik; misalkan dataset_2, maka lakukan hyperparameter tuning pada dataset_2.
- Kesimpulan


In [None]:
# Import Libraries

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE,SMOTENC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder

In [None]:

# Data Loading
# The source location is kept in a named constant so it is easy to find and swap.

DATA_URL = 'https://raw.githubusercontent.com/FTDS-learning-materials/phase-1/master/w2/P1W2D4PM%20-%20Data%20Balancing%20-%20Churn%20Modelling.csv'

df = pd.read_csv(DATA_URL)
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [None]:
# Check Target
# Count how many rows fall into each class of the `Exited` target column.

df['Exited'].value_counts()

Unnamed: 0_level_0,count
Exited,Unnamed: 1_level_1
0,7963
1,2037


In [None]:
# Split Dataset
# `Exited` is the target; every other column starts out as a candidate feature.

y = df['Exited']
X = df.drop(columns=['Exited'])

# 70/30 split, stratified on the target, with a fixed seed for repeatable results.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

# Class counts of the training target after the split.
y_train.value_counts()

Unnamed: 0_level_0,count
Exited,Unnamed: 1_level_1
0,5574
1,1426


In [None]:
# Feature Selection
# Let's assume the following columns have no correlation with the target : `RowNumber`, `CustomerId`, `Surname`, and `NumOfProducts`

cols_to_drop = ['RowNumber', 'CustomerId', 'Surname', 'NumOfProducts']

# Reassign instead of mutating in place, and tolerate already-removed columns,
# so this cell can be re-run without raising a KeyError on the second execution.
X_train = X_train.drop(columns=cols_to_drop, errors='ignore')
X_test = X_test.drop(columns=cols_to_drop, errors='ignore')
X_train

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,HasCrCard,IsActiveMember,EstimatedSalary
8061,477,Spain,Male,31,9,0.00,0,1,184061.17
9987,606,Spain,Male,30,8,180307.73,1,1,1914.41
6425,793,France,Male,39,3,137817.52,0,0,83997.79
8223,483,France,Male,27,1,77805.66,1,1,2101.89
6148,611,France,Female,30,9,88594.14,1,0,196332.45
...,...,...,...,...,...,...,...,...,...
4113,667,France,Male,33,4,0.00,1,1,131834.75
628,649,France,Male,47,1,0.00,1,1,145593.85
850,646,Germany,Male,46,1,170826.55,1,0,45041.32
4112,700,France,Female,30,9,0.00,1,1,174971.64


In [None]:
# Split Between Numerical Columns and Categorical Columns
# The two lists are used by later cells to select, scale and encode the features.

cat_cols = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
]
num_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'EstimatedSalary',
]

print('Numeric Columns     : ', num_cols)
print('Categorical Columns : ', cat_cols)

Numeric Columns     :  ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
Categorical Columns :  ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']


In [None]:
# Numeric and categorical views of the training features.
# The index is reset to 0..n-1 (old index discarded) so that the two frames can
# later be re-joined column-wise with a frame rebuilt from a plain array.
X_train_num = X_train[num_cols].reset_index(drop=True)
X_train_cat = X_train[cat_cols].reset_index(drop=True)

In [None]:
# Preview the first five rows of the numeric training features.
X_train_num.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary
0,477,31,9,0.0,184061.17
1,606,30,8,180307.73,1914.41
2,793,39,3,137817.52,83997.79
3,483,27,1,77805.66,2101.89
4,611,30,9,88594.14,196332.45


In [None]:
# Preview the first five rows of the categorical training features.
X_train_cat.head(n=5)

Unnamed: 0,Geography,Gender,HasCrCard,IsActiveMember
0,Spain,Male,0,1
1,Spain,Male,1,1
2,France,Male,0,0
3,France,Male,1,1
4,France,Female,1,0


In [None]:
# Numeric and categorical views of the test features.
# Unlike the training subsets, the original row index is kept here.
X_test_num, X_test_cat = X_test[num_cols], X_test[cat_cols]

In [None]:
# Preview the first five rows of the numeric test features.
X_test_num.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary
7884,638,36,6,188455.19,47031.4
6682,640,29,5,197200.04,141453.62
5930,826,30,5,0.0,157397.57
3752,717,36,2,164557.95,82336.73
5754,714,45,9,106431.97,164117.69


In [None]:
# Numeric Scaling
# The scaler is fitted on the training columns only and then applied to both splits.
# Inputs are read from the untouched X_train / X_test frames rather than from
# X_train_num / X_test_num (which this cell overwrites), so re-running the cell
# does not standardise data that has already been standardised.
sc = StandardScaler().fit(X_train[num_cols])

# Both results are NumPy arrays whose columns follow the order of `num_cols`.
X_train_num = sc.transform(X_train[num_cols])
X_test_num = sc.transform(X_test[num_cols])

In [None]:
# Wrap the scaled array back into a DataFrame and append the categorical columns.
# The frame built from the array gets a fresh 0..n-1 index, matching the reset
# index of X_train_cat, so the column-wise concat lines the rows up.
X_train_new = pd.concat(
    [pd.DataFrame(X_train_num, columns=num_cols), X_train_cat],
    axis=1,
)
X_train_new

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography,Gender,HasCrCard,IsActiveMember
0,-1.787792,-0.754076,1.374792,-1.209750,1.461023,Spain,Male,0,1
1,-0.453470,-0.849926,1.030084,1.666689,-1.710029,Spain,Male,1,1
2,1.480780,0.012721,-0.693453,0.988845,-0.281013,France,Male,0,0
3,-1.725731,-1.137475,-1.382868,0.031479,-1.706765,France,Male,1,1
4,-0.401752,-0.849926,1.374792,0.203587,1.674657,France,Female,1,0
...,...,...,...,...,...,...,...,...,...
6995,0.177488,-0.562377,-0.348745,-1.209750,0.551796,France,Male,1,1
6996,-0.008696,0.779518,-1.382868,-1.209750,0.791333,France,Male,1,1
6997,-0.039727,0.683668,-1.382868,1.515436,-0.959219,Germany,Male,1,0
6998,0.518827,-0.849926,1.374792,-1.209750,1.302780,France,Female,1,1


In [None]:
# Balance the training set with SMOTENC, which needs to know which column
# positions hold categorical features.
# The positions are looked up from `cat_cols` instead of being hard-coded, so
# they stay correct if the column order or the feature lists change.
cat_col_idx = [X_train_new.columns.get_loc(col) for col in cat_cols]

smotenc = SMOTENC(categorical_features=cat_col_idx, random_state=38)
X_train_balanced, y_train_balanced = smotenc.fit_resample(X_train_new, y_train)

# Class counts of the training target after resampling.
y_train_balanced.value_counts()

Unnamed: 0_level_0,count
Exited,Unnamed: 1_level_1
0,5574
1,5574


In [None]:
# Preview the first five rows of the resampled training features.
X_train_balanced.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography,Gender,HasCrCard,IsActiveMember
0,-1.787792,-0.754076,1.374792,-1.20975,1.461023,Spain,Male,0,1
1,-0.45347,-0.849926,1.030084,1.666689,-1.710029,Spain,Male,1,1
2,1.48078,0.012721,-0.693453,0.988845,-0.281013,France,Male,0,0
3,-1.725731,-1.137475,-1.382868,0.031479,-1.706765,France,Male,1,1
4,-0.401752,-0.849926,1.374792,0.203587,1.674657,France,Female,1,0


In [None]:
# The categorical columns are one-hot encoded in the next cell; see the
# scikit-learn OneHotEncoder documentation for the available options.

Male, Female, TG

In [None]:
# One-hot encode the categorical columns; the encoder learns its categories
# from the balanced training set only.
ohe = OneHotEncoder().fit(X_train_balanced[cat_cols])

X_train_balanced_cat = ohe.transform(X_train_balanced[cat_cols])

# Read the test categories from X_test rather than from X_test_cat: this cell
# overwrites X_test_cat, so using it as the input would feed an already-encoded
# matrix back into the encoder when the cell is re-run.
X_test_cat = ohe.transform(X_test[cat_cols])

In [None]:
# Display the encoded categorical training features (output of ohe.transform).
X_train_balanced_cat

<11148x9 sparse matrix of type '<class 'numpy.float64'>'
	with 44592 stored elements in Compressed Sparse Row format>

In [None]:
# Training matrix: densify the encoded categoricals and place them to the right
# of the scaled numeric columns.
X_train_balanced_cat = X_train_balanced_cat.toarray()
X_train_balanced_final = np.concatenate(
    (X_train_balanced[num_cols], X_train_balanced_cat),
    axis=1,
)

# Test matrix: same layout, built from the scaled numeric array and the
# densified encoded categoricals.
X_test_cat = X_test_cat.toarray()
X_test_final_v2 = np.concatenate((X_test_num, X_test_cat), axis=1)

In [None]:
# Display the final training matrix (numeric columns followed by encoded categoricals).
X_train_balanced_final

array([[-1.78779213, -0.7540764 ,  1.37479182, ...,  0.        ,
         0.        ,  1.        ],
       [-0.45347003, -0.84992603,  1.03008437, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.48077984,  0.01272061, -0.69345291, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.02345094,  1.64216427,  0.33740253, ...,  1.        ,
         1.        ,  0.        ],
       [ 1.53048083,  0.84235077, -1.09752956, ...,  1.        ,
         1.        ,  0.        ],
       [-0.30102067,  0.69029905, -0.34874546, ...,  1.        ,
         1.        ,  0.        ]])

In [None]:
# Display the final test matrix (numeric columns followed by encoded categoricals).
X_test_final_v2

array([[-0.1224754 , -0.27482827,  0.34066945, ...,  0.        ,
         1.        ,  0.        ],
       [-0.10178824, -0.94577565, -0.004038  , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.82211805, -0.84992603, -0.004038  , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.16384973, -0.37067789,  1.71949928, ...,  1.        ,
         1.        ,  0.        ],
       [ 1.16012879, -1.23332453,  0.68537691, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.30493894, -0.27482827, -1.38286783, ...,  1.        ,
         1.        ,  0.        ]])

In [None]:
# Logistic regression with default settings, trained on the balanced training matrix.
lr_2 = LogisticRegression()
lr_2.fit(X=X_train_balanced_final, y=y_train_balanced)

In [None]:
# Check Results - with Handling
# First report: balanced training data. Second report: untouched test split.

y_pred_train = lr_2.predict(X_train_balanced_final)
y_pred_test = lr_2.predict(X_test_final_v2)

print(classification_report(y_train_balanced, y_pred_train))
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72      5574
           1       0.72      0.72      0.72      5574

    accuracy                           0.72     11148
   macro avg       0.72      0.72      0.72     11148
weighted avg       0.72      0.72      0.72     11148

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      2389
           1       0.39      0.69      0.50       611

    accuracy                           0.71      3000
   macro avg       0.64      0.71      0.65      3000
weighted avg       0.80      0.71      0.74      3000



# GC5

# dataset 1
- data loading
-  EDA
- preprocessing

# dataset 2
- data loading
-  EDA
- preprocessing

# loading data

- load data1
- load data2

# EDA