In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def importData(filename, delimiter=';'):
    """Read a delimited text file into a DataFrame and drop duplicated rows.

    The shape before and after de-duplication is printed for inspection.

    Parameters
    ----------
    filename : str or path-like
        Location of the file, forwarded to ``pd.read_csv``.
    delimiter : str, default ';'
        Field separator forwarded to ``pd.read_csv``. The default keeps the
        previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The loaded data with fully duplicated rows removed.
    """
    # Read the raw file
    data = pd.read_csv(filename, delimiter=delimiter)
    print(f"shape awal                    : {data.shape}, (#observasi, #fitur)")

    # Drop fully duplicated rows
    data = data.drop_duplicates()
    print(f"shape setelah drop duplikat   : {data.shape}, (#observasi, #fitur)")

    return data

In [3]:
# Load the CSV at the relative path below through importData (which also
# drops duplicated rows), then preview the first rows.
filename = "data/bank-additional-full.csv"
data = importData(filename = filename)

data.head()

shape awal                    : (41188, 21), (#observasi, #fitur)
shape setelah drop duplikat   : (41176, 21), (#observasi, #fitur)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [18]:
# Row labels of every record where marital, job, housing, loan or default
# holds the literal string 'unknown'.
indexToDrop = data[
    data[['marital', 'job', 'housing', 'loan', 'default']]
    .eq('unknown')
    .any(axis=1)
].index

In [19]:
# Reassign instead of dropping in place: the in-place call on this frame
# triggers pandas' SettingWithCopyWarning, and errors="ignore" keeps the cell
# re-runnable once the labels have already been removed.
data = data.drop(index=indexToDrop, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(indexToDrop , inplace=True)


In [4]:
#split input output
def split_input_output(data, target_column):
    """Separate a DataFrame into its predictors and its target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the predictors and the target.
    target_column : label
        Name of the target column in ``data``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without ``target_column`` and ``y``
        is ``data[target_column]``. ``data`` itself is not modified.
    """
    features = data.drop(target_column, axis="columns")
    target = data[target_column]
    return features, target

In [5]:
# Separate the predictors (X) from the target column "y".
X, y = split_input_output(data = data,
                          target_column = "y")

In [6]:
#train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 123)

In [7]:
# Relative class frequencies of the training target
y_train.value_counts(normalize=True)

no     0.887341
yes    0.112659
Name: y, dtype: float64

In [8]:
# Relative class frequencies of the test target
y_test.value_counts(normalize=True)

no     0.887324
yes    0.112676
Name: y, dtype: float64

In [9]:
def handlingOutput(y_train, y_test, pos_value):
    """Convert the train and test targets into 0/1 integer labels.

    Parameters
    ----------
    y_train, y_test : pandas.Series
        Raw target values.
    pos_value : scalar
        Value that marks the positive class. Entries equal to it become 1;
        everything else (including missing values) becomes 0.

    Returns
    -------
    tuple of pandas.Series
        ``(y_train, y_test)`` as int64 Series that keep the original index
        and name.
    """
    # Vectorized comparison instead of a per-row Python lambda; the explicit
    # int64 cast keeps the dtype stable on every platform and for empty input.
    y_train = (y_train == pos_value).astype("int64")
    y_test = (y_test == pos_value).astype("int64")

    return y_train, y_test

In [10]:
# Encode the targets as 0/1 with "yes" as the positive class.
y_train, y_test = handlingOutput(y_train = y_train, 
                                 y_test = y_test, 
                                 pos_value = "yes")

In [11]:
# Relative class frequencies of the 0/1-encoded training target
y_train.value_counts(normalize=True)

0    0.887341
1    0.112659
Name: y, dtype: float64

In [12]:
# Relative class frequencies of the 0/1-encoded test target
y_test.value_counts(normalize=True)

0    0.887324
1    0.112676
Name: y, dtype: float64

In [13]:
# Display the encoded training target
y_train

3369     0
11480    0
23021    0
28565    0
2635     1
        ..
38071    1
33468    1
13610    0
3919     0
35049    1
Name: y, Length: 32940, dtype: int64

In [82]:
# List the predictor column names
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [14]:
# Column names routed through the categorical and the numerical preprocessing
# paths, respectively.
categorical_col = ["job","marital","education","default","housing","loan","contact", "month", "day_of_week", "poutcome"]
numerical_col = ["age","duration","campaign","pdays","previous","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed"]

In [15]:
def splitNumCat(data, num_col, cat_col):
    """Select the numerical and the categorical columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding all predictors.
    num_col : list
        Labels of the numerical columns.
    cat_col : list
        Labels of the categorical columns.

    Returns
    -------
    tuple
        ``(data[num_col], data[cat_col])``.
    """
    numeric_part = data[num_col]
    categorical_part = data[cat_col]
    return numeric_part, categorical_part

In [16]:
# Split the training predictors into numerical and categorical parts.
X_train_num, X_train_cat = splitNumCat(data = X_train,
                                       num_col = numerical_col,
                                       cat_col = categorical_col)

In [17]:
# Preview the numerical training features
X_train_num.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
3369,30,399,1,999,0,1.1,93.994,-36.4,4.86,5191.0
11480,34,45,4,999,0,1.4,94.465,-41.8,4.959,5228.1
23021,56,479,4,999,0,1.4,93.444,-36.1,4.965,5228.1
28565,41,301,1,999,1,-1.8,93.075,-47.1,1.415,5099.1
2635,31,945,2,999,0,1.1,93.994,-36.4,4.856,5191.0


In [18]:
# Build an imputer as a safeguard in case the test data contains missing values
from sklearn.impute import SimpleImputer

def imputerNum(data, imputer = None):
    """Impute missing values in numerical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Numerical features to impute.
    imputer : object with ``transform``, optional
        Imputer to apply. When omitted, a ``SimpleImputer`` with
        ``strategy="median"`` is created and fitted on ``data``.

    Returns
    -------
    data_imputed : pandas.DataFrame
        Imputed values carrying the index and columns of ``data``.
    imputer : object
        The imputer that was applied (newly fitted or the one passed in).
    """
    # Identity test: `== None` would defer to the object's own __eq__.
    if imputer is None:
        # Create and fit the imputer
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = "median")
        imputer.fit(data)

    # transform() yields a plain array, so re-attach the row and column labels
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)

    return data_imputed, imputer

In [19]:
# Fit the numerical imputer on the training data and keep it for later reuse
X_train_num_imputed, imputer_num = imputerNum(data = X_train_num)

In [20]:
def imputerCat(data, imputer = None):
    """Impute missing values in categorical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical features to impute.
    imputer : object with ``transform``, optional
        Imputer to apply. When omitted, a ``SimpleImputer`` that fills
        missing entries with the constant ``"UNKNOWN"`` is created and fitted
        on ``data``.

    Returns
    -------
    data_imputed : pandas.DataFrame
        Imputed values carrying the index and columns of ``data``.
    imputer : object
        The imputer that was applied (newly fitted or the one passed in).
    """
    # Identity test: `== None` would defer to the object's own __eq__.
    if imputer is None:
        # Create and fit the imputer
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = "constant",
                                fill_value = "UNKNOWN")
        imputer.fit(data)

    # transform() yields a plain array, so re-attach the row and column labels
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)

    return data_imputed, imputer

In [21]:
from sklearn.preprocessing import OneHotEncoder

def encoderCat(data, encoder_col = None, encoder = None):
    """One-hot encode categorical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical features to encode.
    encoder_col : sequence of str, optional
        Column labels for the encoded frame. Ignored when a new encoder is
        fitted. When an encoder is supplied without labels, they are taken
        from ``encoder.get_feature_names_out(data.columns)``.
    encoder : object with ``transform``, optional
        Encoder to apply. When omitted, a ``OneHotEncoder`` with
        ``handle_unknown="ignore"`` and ``drop="if_binary"`` is created and
        fitted on ``data``.

    Returns
    -------
    data_encoded : pandas.DataFrame
        Encoded values indexed like ``data`` with ``encoder_col`` as columns.
    encoder_col : sequence of str
        Column labels of the encoded frame.
    encoder : object
        The encoder that was applied (newly fitted or the one passed in).
    """
    # Identity test: `== None` would defer to the object's own __eq__.
    if encoder is None:
        # Create and fit the encoder, then record the generated column names
        encoder = OneHotEncoder(handle_unknown = "ignore",
                                drop = "if_binary")
        encoder.fit(data)
        encoder_col = encoder.get_feature_names_out(data.columns)
    elif encoder_col is None:
        # An encoder was supplied without its labels: ask the encoder rather
        # than silently falling back to integer column names.
        encoder_col = encoder.get_feature_names_out(data.columns)

    # Transform the data; densify only when the encoder returned a sparse
    # matrix, so encoders configured for dense output also work.
    data_encoded = encoder.transform(data)
    if hasattr(data_encoded, "toarray"):
        data_encoded = data_encoded.toarray()
    data_encoded = pd.DataFrame(data_encoded,
                                index = data.index,
                                columns = encoder_col)

    return data_encoded, encoder_col, encoder


In [22]:
# Fit the categorical imputer on the training data and keep it for later reuse
X_train_cat_imputed, imputer_cat = imputerCat(data = X_train_cat)

In [23]:
# Preview the imputed categorical training features
X_train_cat_imputed.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
3369,blue-collar,single,high.school,no,yes,no,telephone,may,thu,nonexistent
11480,admin.,divorced,basic.9y,no,yes,no,telephone,jun,fri,nonexistent
23021,admin.,divorced,university.degree,no,yes,no,cellular,aug,tue,nonexistent
28565,services,married,high.school,no,yes,yes,cellular,apr,wed,failure
2635,services,married,basic.9y,no,no,no,telephone,may,tue,nonexistent


In [24]:
# Fit the one-hot encoder on the training data; keep the encoder and its
# output column names for later reuse.
X_train_cat_encoded, encoder_col, encoder_OHE = encoderCat(data = X_train_cat_imputed)

In [31]:
# Display the one-hot encoded categorical training features
X_train_cat_encoded

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
3369,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
11480,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23021,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
28565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38071,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
33468,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
13610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# Join the imputed numerical features and the encoded categorical features
# column-wise, then preview the result.
X_train_concat = pd.concat([X_train_num_imputed,
                            X_train_cat_encoded],
                           axis = 1)
X_train_concat.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
3369,30.0,399.0,1.0,999.0,0.0,1.1,93.994,-36.4,4.86,5191.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
11480,34.0,45.0,4.0,999.0,0.0,1.4,94.465,-41.8,4.959,5228.1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23021,56.0,479.0,4.0,999.0,0.0,1.4,93.444,-36.1,4.965,5228.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
28565,41.0,301.0,1.0,999.0,1.0,-1.8,93.075,-47.1,1.415,5099.1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2635,31.0,945.0,2.0,999.0,0.0,1.1,93.994,-36.4,4.856,5191.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [27]:
!pip install imbalanced-learn



In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
# Class counts of the training target before resampling
y_train.value_counts()

0    29229
1     3711
Name: y, dtype: int64

In [34]:
# Oversample the minority class with SMOTE (fixed seed) on the imputed and
# one-hot encoded training features.
smt = SMOTE(sampling_strategy = "minority",
           random_state = 123)

# NOTE(review): y_train is overwritten with the resampled target, so re-running
# only this cell would pair X_train_concat with an already-resampled y_train;
# rerun from the train/test split instead.
# NOTE(review): the grid search further down is fitted on this resampled data,
# so its validation folds contain synthetic rows - confirm this is intended.
X_train_smote, y_train = smt.fit_resample(X_train_concat, y_train)

In [35]:
# Class counts of the training target after resampling
y_train.value_counts()

0    29229
1    29229
Name: y, dtype: int64

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
def standardizeData(data, scaler = None):
    """Scale the columns of ``data`` with a standard scaler.

    Parameters
    ----------
    data : pandas.DataFrame
        Features to scale.
    scaler : object with ``transform``, optional
        Scaler to apply. When omitted, a ``StandardScaler`` is created and
        fitted on ``data``.

    Returns
    -------
    data_scaled : pandas.DataFrame
        Scaled values carrying the index and columns of ``data``.
    scaler : object
        The scaler that was applied (newly fitted or the one passed in).
    """
    # Identity test: `== None` would defer to the object's own __eq__.
    if scaler is None:
        # Create and fit the scaler
        scaler = StandardScaler()
        scaler.fit(data)

    # transform() yields a plain array, so re-attach the row and column labels
    data_scaled = pd.DataFrame(scaler.transform(data),
                               index = data.index,
                               columns = data.columns)

    return data_scaled, scaler

In [38]:
# Fit the scaler on the resampled training features, keep it for later reuse,
# and preview the scaled data.
X_train_clean, scaler = standardizeData(data = X_train_smote)
X_train_clean.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,-0.898328,0.034338,-0.58696,0.356138,-0.467123,0.924169,0.832819,0.721207,0.99898,0.633836,...,-0.225661,-0.193069,-0.510312,-0.529678,2.056562,-0.550766,-0.538319,-0.387316,0.548212,-0.344014
1,-0.550428,-0.958918,0.74722,0.356138,-0.467123,1.098534,1.59292,-0.310275,1.051358,1.060731,...,-0.225661,-0.193069,2.274345,-0.529678,-0.571532,-0.550766,-0.538319,-0.387316,0.548212,-0.344014
2,1.36302,0.258802,0.74722,0.356138,-0.467123,1.098534,-0.054773,0.778511,1.054533,1.060731,...,-0.225661,-0.193069,-0.510312,-0.529678,-0.571532,2.132082,-0.538319,-0.387316,0.548212,-0.344014
3,0.058396,-0.240631,-0.58696,0.356138,1.012629,-0.761364,-0.650266,-1.322654,-0.82368,-0.42362,...,-0.225661,-0.193069,-0.510312,-0.529678,-0.571532,-0.550766,2.170012,3.002639,-1.96528,-0.344014
4,-0.811353,1.566308,-0.142233,0.356138,-0.467123,0.924169,0.832819,0.721207,0.996864,0.633836,...,-0.225661,-0.193069,-0.510312,-0.529678,-0.571532,2.132082,-0.538319,-0.387316,0.548212,-0.344014


In [39]:
def transformTestData(data, num_col, cat_col, encoder_col,
                      imputer_num, imputer_cat, encoder_cat,
                      scaler):
    """Run a feature frame through the preprocessing steps with the given objects.

    Steps, in order: split into numerical/categorical parts, impute each part,
    one-hot encode the categorical part, concatenate column-wise, and scale.

    Parameters
    ----------
    data : pandas.DataFrame
        Features to transform.
    num_col, cat_col : list
        Labels of the numerical and categorical columns.
    encoder_col : sequence of str
        Column labels for the encoded categorical frame.
    imputer_num, imputer_cat, encoder_cat, scaler : object
        Preprocessing objects forwarded to ``imputerNum``, ``imputerCat``,
        ``encoderCat`` and ``standardizeData`` respectively.

    Returns
    -------
    pandas.DataFrame
        The fully preprocessed features.
    """
    # Split, then impute each part with its own imputer
    numeric_part, categorical_part = splitNumCat(data = data,
                                                 num_col = num_col,
                                                 cat_col = cat_col)
    numeric_part = imputerNum(data = numeric_part,
                              imputer = imputer_num)[0]
    categorical_part = imputerCat(data = categorical_part,
                                  imputer = imputer_cat)[0]

    # One-hot encode the categorical part
    categorical_part = encoderCat(data = categorical_part,
                                  encoder_col = encoder_col,
                                  encoder = encoder_cat)[0]

    # Recombine and scale
    combined = pd.concat([numeric_part, categorical_part], axis = 1)
    return standardizeData(data = combined, scaler = scaler)[0]

In [40]:
# Apply the preprocessing objects obtained on the training data to the test
# features, then preview the result.
X_test_clean = transformTestData(data = X_test,
                                 num_col = numerical_col, 
                                 cat_col = categorical_col,
                                 encoder_col = encoder_col,
                                 imputer_num = imputer_num,
                                 imputer_cat = imputer_cat,
                                 encoder_cat = encoder_OHE,
                                 scaler = scaler)

X_test_clean.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
4317,1.102096,-0.563299,-0.58696,0.356138,-0.467123,0.924169,0.832819,0.721207,0.996864,0.633836,...,-0.225661,-0.193069,-0.510312,-0.529678,-0.571532,2.132082,-0.538319,-0.387316,0.548212,-0.344014
39179,-1.333203,-0.378116,0.302493,0.356138,1.012629,-0.761364,-0.175808,1.026831,-1.226835,-1.463815,...,-0.225661,-0.193069,2.274345,-0.529678,-0.571532,-0.550766,-0.538319,3.002639,-1.96528,-0.344014
7641,0.580246,-0.647473,1.191946,0.356138,-0.467123,0.924169,0.832819,0.721207,1.001096,0.633836,...,-0.225661,-0.193069,2.274345,-0.529678,-0.571532,-0.550766,-0.538319,-0.387316,0.548212,-0.344014
23175,-0.724378,-0.240631,3.415579,0.356138,-0.467123,1.098534,-0.054773,0.778511,1.054533,1.060731,...,-0.225661,-0.193069,-0.510312,-0.529678,-0.571532,2.132082,-0.538319,-0.387316,0.548212,-0.344014
22036,0.406296,-0.608192,0.302493,0.356138,-0.467123,1.098534,-0.054773,0.778511,1.054004,1.060731,...,-0.225661,-0.193069,-0.510312,-0.529678,-0.571532,-0.550766,2.170012,-0.387316,0.548212,-0.344014


In [41]:
# Column names of the preprocessed test features
X_test_clean.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_telephone', 'month_apr',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'da

In [42]:
# Column names of the preprocessed training features
X_train_clean.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_telephone', 'month_apr',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'da

In [43]:
# Import library
from sklearn.linear_model import LogisticRegression

In [44]:
# Build the class weights: n_samples / (n_classes * samples per class),
# computed from the current y_train.
# NOTE(review): the output displayed for this cell shows 1.0 for both classes,
# i.e. the weights have no effect on the resampled target.
n_samples = len(y_train)
n_classes = len(y_train.value_counts())
n_samples_j = y_train.value_counts()

class_weight = n_samples / (n_classes * n_samples_j)
class_weight

0    1.0
1    1.0
Name: y, dtype: float64

In [45]:
# Create the base logistic-regression estimator used by the grid search
logreg = LogisticRegression(solver = "liblinear",
                            random_state = 123)

In [46]:
# Run the hyperparameter search experiment
from sklearn.model_selection import GridSearchCV

# Search both penalties over 20 log-spaced C values from 1e-5 to 1e5,
# evaluated with 5-fold cross-validation.
search_params = {"penalty": ["l1", "l2"],
                 "C": np.logspace(-5, 5, 20)}

logreg_cv = GridSearchCV(estimator = logreg,
                         param_grid = search_params,
                         cv = 5)

In [47]:
# Fit the grid search on the preprocessed training data
logreg_cv.fit(X = X_train_clean,
              y = y_train)

GridSearchCV(cv=5,
             estimator=LogisticRegression(random_state=123, solver='liblinear'),
             param_grid={'C': array([1.00000000e-05, 3.35981829e-05, 1.12883789e-04, 3.79269019e-04,
       1.27427499e-03, 4.28133240e-03, 1.43844989e-02, 4.83293024e-02,
       1.62377674e-01, 5.45559478e-01, 1.83298071e+00, 6.15848211e+00,
       2.06913808e+01, 6.95192796e+01, 2.33572147e+02, 7.84759970e+02,
       2.63665090e+03, 8.85866790e+03, 2.97635144e+04, 1.00000000e+05]),
                         'penalty': ['l1', 'l2']})

In [48]:
# Hyperparameters selected by the grid search
logreg_cv.best_params_

{'C': 69.51927961775606, 'penalty': 'l1'}

In [50]:
# Build the final model from the best penalty and C found by the grid search,
# additionally passing the class weights computed earlier.
logreg = LogisticRegression(penalty = logreg_cv.best_params_["penalty"],
                            C = logreg_cv.best_params_["C"],
                            class_weight = dict(class_weight),
                            solver = "liblinear",
                            random_state = 123)

# Fit the model on the preprocessed training data
logreg.fit(X_train_clean, y_train)

LogisticRegression(C=69.51927961775606, class_weight={0: 1.0, 1: 1.0},
                   penalty='l1', random_state=123, solver='liblinear')

In [51]:
# Predicted class probabilities on the training data
y_pred_train_proba = logreg.predict_proba(X_train_clean)
y_pred_train_proba

array([[0.8992613 , 0.1007387 ],
       [0.98001908, 0.01998092],
       [0.59134974, 0.40865026],
       ...,
       [0.00777157, 0.99222843],
       [0.03920321, 0.96079679],
       [0.12353425, 0.87646575]])

In [53]:
# Predicted class labels on the training data
y_pred_train = logreg.predict(X_train_clean)
y_pred_train

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [54]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the training data (label 0 -> "no", 1 -> "yes")
print(classification_report(y_true = y_train,
                            y_pred = y_pred_train,
                            target_names = ["no", "yes"]))

              precision    recall  f1-score   support

          no       0.89      0.86      0.88     29229
         yes       0.87      0.90      0.88     29229

    accuracy                           0.88     58458
   macro avg       0.88      0.88      0.88     58458
weighted avg       0.88      0.88      0.88     58458



In [55]:
# Predicted class probabilities on the test data, stored under their own name
# so the training-set probabilities in y_pred_train_proba are not overwritten.
y_pred_test_proba = logreg.predict_proba(X_test_clean)
y_pred_test_proba

array([[0.97095513, 0.02904487],
       [0.11792845, 0.88207155],
       [0.98059862, 0.01940138],
       ...,
       [0.78162178, 0.21837822],
       [0.92762241, 0.07237759],
       [0.94429175, 0.05570825]])

In [56]:
# Predicted class labels on the test data
y_pred_test = logreg.predict(X_test_clean)
y_pred_test

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [57]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the test data (label 0 -> "no", 1 -> "yes")
print(classification_report(y_true = y_test,
                            y_pred = y_pred_test,
                            target_names = ["no", "yes"]))

              precision    recall  f1-score   support

          no       0.98      0.86      0.92      7308
         yes       0.45      0.88      0.59       928

    accuracy                           0.86      8236
   macro avg       0.71      0.87      0.76      8236
weighted avg       0.92      0.86      0.88      8236

