# Introduction

Kita akan menggunakan [Bank Marketing Dataset](https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset) dari Kaggle. 

In [None]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


#plt.style.use('seaborn')

# Data Preprocessing


## Loading data

Dokumentasi dataset dapat dilihat [di sini](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) 

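Deskripsi kolom adalah sebagai berikut (catatan: deskripsi ini mengikuti dokumentasi UCI; pada file `bank.csv` yang dipakai di bawah, beberapa kolom berbeda, misalnya kolom `day` dan `balance` yang digunakan dalam kode):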

Variabel input :
1. age (numeric)
2. job : type of job (categorical: `'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown'`)
3. marital : marital status (categorical: `'divorced','married','single','unknown'`; note: `'divorced'` means divorced or widowed)
4. education (categorical: `'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown'`)
5. default: has credit in default? (categorical: `'no','yes','unknown'`)
6. housing: has housing loan? (categorical: `'no','yes','unknown'`)
7. loan: has personal loan? (categorical: `'no','yes','unknown'`)
8. contact: contact communication type (categorical: `'cellular','telephone'`)
9. month: last contact month of year (categorical: `'jan', 'feb', 'mar', ..., 'nov', 'dec'`)
10. day_of_week: last contact day of the week (categorical: `'mon','tue','wed','thu','fri'`)
11. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14. previous: number of contacts performed before this campaign and for this client (numeric)
15. poutcome: outcome of the previous marketing campaign (categorical: `'failure','nonexistent','success'`)

Target :
16. deposit: has the client subscribed a term deposit? (binary: `'yes','no'`)

In [None]:
# Read the bank marketing CSV directly from its GitHub-hosted location.
data_url = 'https://raw.githubusercontent.com/urfie/DataAnalytics/main/bank.csv'
df_bank = pd.read_csv(data_url)

# Report the (rows, columns) shape and preview the first records.
print('Dataframe shape:', df_bank.shape)
df_bank.head()

## Data Cleansing

In [None]:
# Drop the call-duration column: per the dataset notes it is only known after
# the call has ended, so it should not feed a realistic predictive model.
df_bank = df_bank.drop(columns=['duration'])

# Report the new shape and preview the remaining columns.
print('Dataframe shape:', df_bank.shape)
df_bank.head()

## Transformasi variabel kategorik

### Transform input features

In [None]:
# One encoder instance, configured to ignore categories not seen during fit
# rather than raising on them.
enc = OneHotEncoder(handle_unknown='ignore')

# Columns that are treated as categorical and one-hot encoded.
cat_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'day', 'month', 'poutcome',
]

In [None]:
# Example: one-hot encode the marital-status column on its own.
marital_encoded = enc.fit_transform(df_bank[['marital']])
encoded = marital_encoded.toarray()  # convert the encoder output to a dense array

Ubah seluruh variabel kategorik : 

In [None]:
# Rebind instead of calling list.remove(): re-running this cell must not raise
# ValueError once 'marital' has already been taken out of the list.
cat_cols = [col for col in cat_cols if col != 'marital']

# One-hot encode each categorical column separately and join all pieces with a
# single concatenate call (no repeated re-copying of a growing array).
# 'marital' stays first so the column layout is the same as before, and
# building the matrix from scratch keeps a re-run of this cell from appending
# duplicate columns to `encoded`.
encoded = np.concatenate(
    [enc.fit_transform(df_bank[[col]]).toarray() for col in ['marital', *cat_cols]],
    axis=1,
)

Menggabungkan dengan variabel-variabel numerik

In [None]:
# Numeric features that are used as-is ('previous' is currently left out).
num_cols = ['age', 'balance', 'campaign', 'pdays']

# Pull the numeric columns out of the dataframe as a plain 2-D array.
numeric_columns = df_bank[num_cols].to_numpy()

# Append the numeric block to the right of the one-hot encoded columns.
encoded = np.concatenate((encoded, numeric_columns), axis=1)

### Transform target

Kita gunakan `LabelBinarizer` untuk mengubah kolom target dari `'yes'/'no'` menjadi `1/0`

In [None]:
from sklearn import preprocessing

# Fit the binarizer on the 'deposit' column, then transform that same column
# and flatten the result into a 1-D label vector.
lb = preprocessing.LabelBinarizer()
target_column = df_bank[['deposit']]
lb.fit(target_column)
encoded_target = lb.transform(target_column).ravel()

## Split dataset ke dalam train-test set



In [None]:
# Shuffle and hold out 20% of the rows as the test set; the fixed seed keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    encoded,
    encoded_target,
    shuffle=True,
    test_size=0.2,
    random_state=1,
)

# Show the shapes of the training and testing data
print('Dimensi training feature:', X_train.shape)
print('Dimensi testing feature:', X_test.shape)
print('Dimensi training target:', y_train.shape)
# Label corrected: this line reports the *testing* target, not the training one.
print('Dimensi testing target:', y_test.shape)

## Penskalaan variabel numerik

Decision tree dan random forest termasuk algoritma yang robust terhadap perbedaan skala fitur, sehingga kita tidak perlu melakukan penskalaan variabel numerik.

In [None]:
# Feature scaling is intentionally left disabled: the models trained below are
# tree-based, and the notebook text states they do not need scaled inputs.
# NOTE(review): the 75 offset presumably marks where the numeric columns start
# in the encoded matrix (after the one-hot columns) — confirm before re-enabling.
#scaler = StandardScaler().fit(X_train[:,75:]) 

#X_train[:,75:] = scaler.transform(X_train[:,75:])
#X_test[:,75:] = scaler.transform(X_test[:,75:])

## Melatih model : Decision Tree

Kita akan menggunakan model `decision tree` untuk melakukan prediksi. `DecisionTreeClassifier` ada dalam modul `sklearn.tree`

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split. The estimator is seeded (same
# seed as the train/test split) so the fitted tree, and therefore the metrics
# reported below, are reproducible from run to run.
tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

## Evaluasi Model



In [None]:
from sklearn import metrics

# Predict labels for the held-out test set
y_pred = tree.predict(X_test)

# Report accuracy, precision, recall and F1 score, in that order
for label, scorer in (
    ('Akurasi:', metrics.accuracy_score),
    ('Presisi:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
):
    print(label, scorer(y_test, y_pred))



In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Compute the confusion matrix for the current test-set predictions and print
# the raw counts.
cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)

# Draw the same matrix, labelling the axes with the tree's class labels.
disp = ConfusionMatrixDisplay(cm, display_labels=tree.classes_)
disp.plot()
plt.show()

Plot kurva ROC untuk melihat kinerja model pada tiap-tiap threshold, sebagai fungsi FPR-TPR

In [None]:
# Predicted class probabilities for the test set
y_proba = tree.predict_proba(X_test)

# ROC points and area under the curve, scored with the second probability
# column (index 1)
fpr, tpr, thresh = metrics.roc_curve(y_test, y_proba[:, 1])
roc_auc = metrics.auc(fpr, tpr)

# Named `roc_display` rather than `display` so this cell does not shadow the
# `display()` helper that IPython/Jupyter makes available in notebooks.
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                      estimator_name='Decision Tree')
roc_display.plot()
plt.plot([0, 1], [0, 1], color='g')  # diagonal reference line
plt.show()

## Melatih Model : Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with out-of-bag scoring enabled. The estimator is seeded (same
# seed as the train/test split) so the fitted forest, its OOB score and the
# metrics reported below are reproducible from run to run.
rf = RandomForestClassifier(oob_score=True, random_state=1)
rf.fit(X_train, y_train)

# Predict labels for the held-out test set
y_pred = rf.predict(X_test)

## Evaluasi Model

In [None]:
# Predict labels for the held-out test set
y_pred = rf.predict(X_test)

# Report accuracy, precision, recall and F1 score, in that order
for label, scorer in (
    ('Akurasi:', metrics.accuracy_score),
    ('Presisi:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
):
    print(label, scorer(y_test, y_pred))

# Out-of-bag score recorded by the forest during fitting
print('OOB Score:', rf.oob_score_)


In [None]:
# Compute the confusion matrix for the current test-set predictions and print
# the raw counts.
cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)

# Draw the same matrix, labelling the axes with the forest's class labels.
disp = ConfusionMatrixDisplay(cm, display_labels=rf.classes_)
disp.plot()
plt.show()

In [None]:
# Predicted class probabilities for the test set
y_proba = rf.predict_proba(X_test)

# ROC points and area under the curve, scored with the second probability
# column (index 1); plain `[:, 1]` slicing matches the decision-tree ROC cell.
fpr, tpr, thresh = metrics.roc_curve(y_test, y_proba[:, 1])
roc_auc = metrics.auc(fpr, tpr)

# AUC computed directly from the scores; kept under its original name.
auc = metrics.roc_auc_score(y_test, y_proba[:, 1])

# Named `roc_display` rather than `display` so this cell does not shadow the
# `display()` helper that IPython/Jupyter makes available in notebooks.
roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                      estimator_name='Random Forest')
roc_display.plot()
plt.plot([0, 1], [0, 1], color='g')  # diagonal reference line
plt.show()