In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def importData(filename, delimiter=';'):
    """Load a delimited text file and remove exact duplicate rows.

    Parameters
    ----------
    filename : str or path-like
        Location of the file, passed straight to ``pd.read_csv``.
    delimiter : str, default ';'
        Field separator. The default keeps the behaviour this notebook
        relies on (semicolon-separated input).

    Returns
    -------
    pandas.DataFrame
        The de-duplicated data. The index is not reset, so the row labels
        of the surviving rows are the ones assigned when the file was read.

    Notes
    -----
    Prints the frame's shape before and after de-duplication.
    """
    # Read data
    raw = pd.read_csv(filename, delimiter=delimiter)
    print(f"shape awal                    : {raw.shape}, (#observasi, #fitur)")

    # Drop fully duplicated rows
    data = raw.drop_duplicates()
    print(f"shape setelah drop duplikat   : {data.shape}, (#observasi, #fitur)")

    return data

In [3]:
filename = "data/bank-additional-full.csv"
data = importData(filename = filename)

data.head()

shape awal                    : (41188, 21), (#observasi, #fitur)
shape setelah drop duplikat   : (41176, 21), (#observasi, #fitur)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
#split input output
def split_input_output(data, target_column):
    """Separate a frame into predictors and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the predictors and the target.
    target_column : label
        Name of the column to split off as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without ``target_column`` and
        ``y`` is ``data[target_column]``.
    """
    features = data.drop(target_column, axis="columns")
    return features, data[target_column]

In [5]:
X, y = split_input_output(data = data,
                          target_column = "y")

In [6]:
#train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 123)

In [7]:
y_train.value_counts(normalize=True)

no     0.887341
yes    0.112659
Name: y, dtype: float64

In [8]:
y_test.value_counts(normalize=True)

no     0.887324
yes    0.112676
Name: y, dtype: float64

In [9]:
def handlingOutput(y_train, y_test, pos_value):
    """Binarize the train and test targets.

    Parameters
    ----------
    y_train, y_test : pandas.Series
        Raw target labels.
    pos_value : scalar
        Label treated as the positive class.

    Returns
    -------
    tuple of pandas.Series
        ``(y_train, y_test)`` with 1 where the label equals ``pos_value``
        and 0 everywhere else. Index and name are preserved.
    """
    # Vectorized comparison instead of a per-element lambda; the explicit
    # int64 cast keeps an integer dtype even when a Series is empty.
    y_train = (y_train == pos_value).astype("int64")
    y_test = (y_test == pos_value).astype("int64")

    return y_train, y_test

In [10]:
y_train, y_test = handlingOutput(y_train = y_train, 
                                 y_test = y_test, 
                                 pos_value = "yes")

In [11]:
y_train.value_counts(normalize=True)

0    0.887341
1    0.112659
Name: y, dtype: float64

In [12]:
y_test.value_counts(normalize=True)

0    0.887324
1    0.112676
Name: y, dtype: float64

In [13]:
def convertPdaysGroup(df):
    """Replace the numeric ``pdays`` column with a categorical ``pdays_group``.

    Binning of ``pdays``:

    * 0-7   -> ``'1w'``
    * 8-14  -> ``'2w'``
    * 15-30 -> ``'>2w'``
    * anything outside 0-30 (or missing) -> ``'Not contacted'``
      (presumably this is how the 999 placeholder seen in the data is meant
      to be handled -- confirm against the data dictionary)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``pdays`` column. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``pdays`` and with ``pdays_group`` (object dtype)
        appended as the last column.
    """
    bins = [0, 7, 14, 30]
    labels = ['1w', '2w', '>2w']

    # include_lowest=True so that pdays == 0 lands in the first bin instead of
    # falling outside every interval and being labelled 'Not contacted'.
    pdays_group = pd.cut(df['pdays'], bins=bins, labels=labels, include_lowest=True)

    # Cast to object first: the categorical produced by pd.cut would reject a
    # fill value that is not one of its categories.
    pdays_group = pdays_group.astype('O').fillna('Not contacted')

    # drop() returns a new frame, so the caller's frame is left untouched and
    # the assignment below cannot trigger chained-assignment problems.
    out = df.drop(columns=['pdays'])
    out['pdays_group'] = pdays_group

    return out

X_train = convertPdaysGroup(X_train)
X_train

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,pdays_group
3369,30,blue-collar,single,high.school,no,yes,no,telephone,may,thu,399,1,0,nonexistent,1.1,93.994,-36.4,4.860,5191.0,Not contacted
11480,34,admin.,divorced,basic.9y,no,yes,no,telephone,jun,fri,45,4,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,Not contacted
23021,56,admin.,divorced,university.degree,no,yes,no,cellular,aug,tue,479,4,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1,Not contacted
28565,41,services,married,high.school,no,yes,yes,cellular,apr,wed,301,1,1,failure,-1.8,93.075,-47.1,1.415,5099.1,Not contacted
2635,31,services,married,basic.9y,no,no,no,telephone,may,tue,945,2,0,nonexistent,1.1,93.994,-36.4,4.856,5191.0,Not contacted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38071,70,retired,married,basic.4y,no,no,no,cellular,sep,tue,150,1,2,success,-3.4,92.379,-29.8,0.770,5017.5,1w
33468,28,blue-collar,single,basic.9y,no,yes,yes,cellular,may,tue,637,3,0,nonexistent,-1.8,92.893,-46.2,1.291,5099.1,Not contacted
13610,32,technician,single,university.degree,no,yes,no,cellular,jul,thu,100,1,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,Not contacted
3919,29,technician,married,professional.course,no,yes,no,telephone,may,mon,35,1,0,nonexistent,1.1,93.994,-36.4,4.858,5191.0,Not contacted


In [14]:
def convertAgeGroup(df):
    """Replace the numeric ``age`` column with a categorical ``age_group``.

    Binning of ``age`` (upper edges are inclusive, and so is the lowest edge):

    * 16-30  -> ``'<30'``  (note: the label reads "<30" but 30 is included)
    * 31-40  -> ``'31-40'``
    * 41-50  -> ``'41-50'``
    * 51-60  -> ``'51-60'``
    * 61-100 -> ``'>60'``

    Ages outside 16-100 fall in no bin and become missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``age`` column. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``age`` and with the categorical ``age_group``
        appended as the last column.
    """
    bins = [16, 30, 40, 50, 60, 100]
    labels = ['<30', '31-40', '41-50', '51-60', '>60']
    age_group = pd.cut(df['age'], bins=bins, labels=labels, include_lowest=True)

    # Build the result on a new frame so the caller's data is not mutated and
    # the function can be re-run on the same input.
    out = df.drop(columns=['age'])
    out['age_group'] = age_group
    return out

In [15]:
X_train = convertAgeGroup(X_train)
X_train['age_group'].value_counts()

31-40    13120
41-50     8160
<30       5887
51-60     5039
>60        734
Name: age_group, dtype: int64

In [16]:
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [17]:
categorical_col = ["job","marital","education","default","housing","loan","contact", "month", "day_of_week", "poutcome", "pdays_group","age_group"]
numerical_col = ["duration","campaign","previous","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed"]

In [18]:
def splitNumCat(data, num_col, cat_col):
    """Select the numerical and the categorical columns of a frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to select from.
    num_col : list
        Labels of the numerical columns.
    cat_col : list
        Labels of the categorical columns.

    Returns
    -------
    tuple
        ``(data[num_col], data[cat_col])``.
    """
    numeric_part, categorical_part = (data[cols] for cols in (num_col, cat_col))
    return numeric_part, categorical_part

In [19]:
X_train_num, X_train_cat = splitNumCat(data = X_train,
                                       num_col = numerical_col,
                                       cat_col = categorical_col)

In [20]:
X_train_num.head()

Unnamed: 0,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
3369,399,1,0,1.1,93.994,-36.4,4.86,5191.0
11480,45,4,0,1.4,94.465,-41.8,4.959,5228.1
23021,479,4,0,1.4,93.444,-36.1,4.965,5228.1
28565,301,1,1,-1.8,93.075,-47.1,1.415,5099.1
2635,945,2,0,1.1,93.994,-36.4,4.856,5191.0


In [21]:
# Build an imputer as a safeguard in case the test data contains missing values
from sklearn.impute import SimpleImputer

def imputerNum(data, imputer = None):
    """Impute missing values in the numerical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Numerical columns to impute.
    imputer : fitted imputer or None, default None
        When None, a median ``SimpleImputer`` is created and fitted on
        ``data`` (training use). When given, it is only used to transform
        ``data`` (test use), so training statistics are reused.

    Returns
    -------
    tuple
        ``(data_imputed, imputer)``: the imputed values as a DataFrame with
        the same index and columns as ``data``, and the imputer that was used.
    """
    # Identity check: `== None` would go through the object's __eq__
    if imputer is None:
        # Create and fit the imputer
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = "median")
        imputer.fit(data)

    # Transform the data, then restore the row/column labels that are lost
    # when the imputer hands back a plain array
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)

    return data_imputed, imputer

In [22]:
X_train_num_imputed, imputer_num = imputerNum(data = X_train_num)

In [23]:
def imputerCat(data, imputer = None):
    """Impute missing values in the categorical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to impute.
    imputer : fitted imputer or None, default None
        When None, a constant-fill ``SimpleImputer`` (fill value
        ``"UNKNOWN"``) is created and fitted on ``data`` (training use).
        When given, it is only used to transform ``data`` (test use).

    Returns
    -------
    tuple
        ``(data_imputed, imputer)``: the imputed values as a DataFrame with
        the same index and columns as ``data``, and the imputer that was used.
    """
    # Identity check: `== None` would go through the object's __eq__
    if imputer is None:
        # Create and fit the imputer
        imputer = SimpleImputer(missing_values = np.nan,
                                strategy = "constant",
                                fill_value = "UNKNOWN")
        imputer.fit(data)

    # Transform the data, then restore the row/column labels that are lost
    # when the imputer hands back a plain array
    data_imputed = pd.DataFrame(imputer.transform(data),
                                index = data.index,
                                columns = data.columns)

    return data_imputed, imputer

In [24]:
from sklearn.preprocessing import OneHotEncoder

def encoderCat(data, encoder_col = None, encoder = None):
    """One-hot encode the categorical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to encode.
    encoder_col : sequence of str or None, default None
        Column names for the encoded output. Ignored (recomputed) when a new
        encoder is fitted. When an encoder is supplied without names, the
        names are taken from ``encoder.get_feature_names_out``.
    encoder : fitted encoder or None, default None
        When None, a ``OneHotEncoder(handle_unknown="ignore",
        drop="if_binary")`` is created and fitted on ``data`` (training use).
        When given, it is only used to transform ``data`` (test use).

    Returns
    -------
    tuple
        ``(data_encoded, encoder_col, encoder)``: the encoded DataFrame
        (same index as ``data``), the output column names, and the encoder
        that was used.
    """
    if encoder is None:
        # Create and fit the encoder
        encoder = OneHotEncoder(handle_unknown = "ignore",
                                drop = "if_binary")
        encoder.fit(data)
        encoder_col = encoder.get_feature_names_out(data.columns)
    elif encoder_col is None:
        # A fitted encoder was supplied without names: derive them instead of
        # silently producing integer column labels
        encoder_col = encoder.get_feature_names_out(data.columns)

    # Transform the data; densify only when the encoder returned a sparse
    # matrix, so an encoder configured for dense output also works
    encoded = encoder.transform(data)
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    data_encoded = pd.DataFrame(encoded,
                                index = data.index,
                                columns = encoder_col)

    return data_encoded, encoder_col, encoder


In [25]:
X_train_cat_imputed, imputer_cat = imputerCat(data = X_train_cat)

In [26]:
X_train_cat_imputed.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,pdays_group,age_group
3369,blue-collar,single,high.school,no,yes,no,telephone,may,thu,nonexistent,Not contacted,<30
11480,admin.,divorced,basic.9y,no,yes,no,telephone,jun,fri,nonexistent,Not contacted,31-40
23021,admin.,divorced,university.degree,no,yes,no,cellular,aug,tue,nonexistent,Not contacted,51-60
28565,services,married,high.school,no,yes,yes,cellular,apr,wed,failure,Not contacted,41-50
2635,services,married,basic.9y,no,no,no,telephone,may,tue,nonexistent,Not contacted,31-40


In [27]:
X_train_cat_encoded, encoder_col, encoder_OHE = encoderCat(data = X_train_cat_imputed)

In [28]:
X_train_cat_encoded

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,pdays_group_1w,pdays_group_2w,pdays_group_>2w,pdays_group_Not contacted,age_group_31-40,age_group_41-50,age_group_51-60,age_group_<30,age_group_>60
3369,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
11480,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
23021,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
28565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38071,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
33468,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
13610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [29]:
X_train_concat = pd.concat([X_train_num_imputed,
                            X_train_cat_encoded],
                           axis = 1)
X_train_concat.head()

Unnamed: 0,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,job_blue-collar,...,poutcome_success,pdays_group_1w,pdays_group_2w,pdays_group_>2w,pdays_group_Not contacted,age_group_31-40,age_group_41-50,age_group_51-60,age_group_<30,age_group_>60
3369,399.0,1.0,0.0,1.1,93.994,-36.4,4.86,5191.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
11480,45.0,4.0,0.0,1.4,94.465,-41.8,4.959,5228.1,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
23021,479.0,4.0,0.0,1.4,93.444,-36.1,4.965,5228.1,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
28565,301.0,1.0,1.0,-1.8,93.075,-47.1,1.415,5099.1,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2635,945.0,2.0,0.0,1.1,93.994,-36.4,4.856,5191.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [30]:
from imblearn.over_sampling import SMOTE

In [31]:
y_train.value_counts()

0    29229
1     3711
Name: y, dtype: int64

In [34]:
# Oversample the encoded training data with SMOTE (fixed random seed).
# NOTE(review): `y_train` is rebound to the resampled labels here, so this cell
# is not idempotent -- re-running it would pair the un-resampled
# `X_train_concat` with the already-resampled `y_train`. Consider a separate
# name for the resampled labels (later cells would need the same rename).
# NOTE(review): resampling runs on unscaled features that include one-hot
# columns; presumably neighbour distances are dominated by the large-scale
# numeric columns and synthetic rows get fractional dummy values -- confirm
# this is intended.
sm = SMOTE(sampling_strategy = 'not majority', k_neighbors = 50, random_state = 123)

X_train_smote, y_train = sm.fit_resample(X_train_concat, y_train)

In [27]:
# NOTE(review): this cell's execution count (In [27]) is out of sequence, and it
# repeats the resampling of the preceding cell with different settings,
# overwriting `X_train_smote` and `y_train` again. It looks like a leftover
# from an earlier session -- confirm which SMOTE configuration is intended and
# remove the other cell so the notebook survives Restart & Run All unchanged.
smt = SMOTE(sampling_strategy = "minority",
           random_state = 123)

X_train_smote, y_train = smt.fit_resample(X_train_concat, y_train)

In [35]:
y_train.value_counts()

0    29229
1    29229
Name: y, dtype: int64

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
def standardizeData(data, scaler = None):
    """Standardize every column of a frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to scale.
    scaler : fitted scaler or None, default None
        When None, a ``StandardScaler`` is created and fitted on ``data``
        (training use). When given, it is only used to transform ``data``
        (test use), so training statistics are reused.

    Returns
    -------
    tuple
        ``(data_scaled, scaler)``: the scaled values as a DataFrame with the
        same index and columns as ``data``, and the scaler that was used.
    """
    # Identity check: `== None` would go through the object's __eq__
    if scaler is None:
        # Create and fit the scaler
        scaler = StandardScaler()
        scaler.fit(data)

    # Transform the data, then restore the row/column labels that are lost
    # when the scaler hands back a plain array
    data_scaled = pd.DataFrame(scaler.transform(data),
                               index = data.index,
                               columns = data.columns)

    return data_scaled, scaler

In [38]:
X_train_clean, scaler = standardizeData(data = X_train_smote)
X_train_clean.head()

Unnamed: 0,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,job_blue-collar,...,poutcome_success,pdays_group_1w,pdays_group_2w,pdays_group_>2w,pdays_group_Not contacted,age_group_31-40,age_group_41-50,age_group_51-60,age_group_<30,age_group_>60
0,0.035716,-0.591842,-0.487257,0.919762,0.842803,0.740703,0.99611,0.628723,-0.660505,2.208401,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,-0.567175,-0.454204,2.145009,-0.265275
1,-0.965261,0.728977,-0.487257,1.094948,1.615969,-0.309178,1.048538,1.056442,1.783943,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,1.394762,-0.567175,-0.454204,-0.553795,-0.265275
2,0.261925,0.728977,-0.487257,1.094948,-0.060043,0.79903,1.051715,1.056442,1.783943,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,-0.567175,2.57684,-0.553795,-0.265275
3,-0.241391,-0.591842,1.079125,-0.773699,-0.665772,-1.339616,-0.828287,-0.430774,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,2.015531,-0.454204,-0.553795,-0.265275
4,1.579595,-0.151569,-0.487257,0.919762,0.842803,0.740703,0.993991,0.628723,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,1.394762,-0.567175,-0.454204,-0.553795,-0.265275


In [39]:
def transformTestData(data, num_col, cat_col, encoder_col,
                      imputer_num, imputer_cat, encoder_cat,
                      scaler):
    """Apply the fitted training preprocessing to new (test) data.

    Nothing is re-fitted here: every transformer passed in is used as-is.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw predictors, still containing the ``pdays`` and ``age`` columns.
        It is NOT modified.
    num_col, cat_col : list
        Labels of the numerical / categorical columns after the ``pdays`` and
        ``age`` conversions.
    encoder_col : sequence of str
        Column names of the one-hot encoded output.
    imputer_num, imputer_cat : fitted imputers
        Imputers for the numerical / categorical columns.
    encoder_cat : fitted encoder
        One-hot encoder for the categorical columns.
    scaler : fitted scaler
        Scaler applied to the concatenated numerical + encoded columns.

    Returns
    -------
    pandas.DataFrame
        The fully preprocessed data, indexed like ``data``.
    """
    # 0. Derive the grouped features. Work on a copy: the convert helpers may
    #    add/drop columns on the frame they receive, and mutating the caller's
    #    frame would make a second call fail once `pdays`/`age` are gone.
    data_conv_pdays = convertPdaysGroup(df=data.copy())

    data_conv_age = convertAgeGroup(df=data_conv_pdays)

    # 1. Split num-cat
    data_num, data_cat = splitNumCat(data = data_conv_age,
                                     num_col = num_col,
                                     cat_col = cat_col)

    # 2. Handling num
    data_num_imputed, _ = imputerNum(data = data_num,
                                     imputer = imputer_num)

    # 3. Handling cat
    data_cat_imputed, _ = imputerCat(data = data_cat,
                                     imputer = imputer_cat)
    data_cat_encoded, _, _ = encoderCat(data = data_cat_imputed,
                                        encoder_col = encoder_col,
                                        encoder = encoder_cat)

    # 4. Concat data (same column order as at training time: num, then cat)
    data_concat = pd.concat([data_num_imputed, data_cat_encoded],
                            axis = 1)

    # 5. Scale data
    data_clean, _ = standardizeData(data = data_concat,
                                    scaler = scaler)

    return data_clean

In [40]:
X_test_clean = transformTestData(data = X_test,
                                 num_col = numerical_col, 
                                 cat_col = categorical_col,
                                 encoder_col = encoder_col,
                                 imputer_num = imputer_num,
                                 imputer_cat = imputer_cat,
                                 encoder_cat = encoder_OHE,
                                 scaler = scaler)

X_test_clean.head()

Unnamed: 0,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,job_blue-collar,...,poutcome_success,pdays_group_1w,pdays_group_2w,pdays_group_>2w,pdays_group_Not contacted,age_group_31-40,age_group_41-50,age_group_51-60,age_group_<30,age_group_>60
4317,-0.566567,-0.591842,-0.487257,0.919762,0.842803,0.740703,0.993991,0.628723,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,-0.567175,2.57684,-0.553795,-0.265275
39179,-0.379944,0.288704,1.079125,-0.773699,-0.183159,1.051779,-1.231826,-1.472977,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,-0.567175,-0.454204,2.145009,-0.265275
7641,-0.651395,1.169251,-0.487257,0.919762,0.842803,0.740703,0.998228,0.628723,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,2.015531,-0.454204,-0.553795,-0.265275
23175,-0.241391,3.370617,-0.487257,1.094948,-0.060043,0.79903,1.051715,1.056442,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,1.394762,-0.567175,-0.454204,-0.553795,-0.265275
22036,-0.611809,0.288704,-0.487257,1.094948,-0.060043,0.79903,1.051186,1.056442,-0.660505,-0.508188,...,-0.373752,-0.343104,-0.159796,-0.075757,0.387096,-0.831975,2.015531,-0.454204,-0.553795,-0.265275


In [41]:
X_test_clean.columns

Index(['duration', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_telephone', 'month_apr',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',


In [42]:
X_train_clean.columns

Index(['duration', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_telephone', 'month_apr',
       'month_aug', 'month_dec', 'month_jul', 'month_jun', 'month_mar',
       'month_may', 'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',


In [43]:
# Import library
from sklearn.linear_model import LogisticRegression

In [46]:
# Create the base estimator (liblinear solver, fixed random seed)
logreg = LogisticRegression(solver = "liblinear",
                            random_state = 123)

In [49]:
# Run the hyperparameter search
from sklearn.model_selection import GridSearchCV

# Grid: both penalty types x 20 values of C, log-spaced from 1e-5 to 1e5
search_params = {"penalty": ["l1", "l2"],
                 "C": np.logspace(-5, 5, 20)}

# 5-fold cross-validated grid search over the base estimator.
# NOTE(review): no `scoring` argument is passed, so the estimator's default
# scorer is used -- confirm that is the metric this search should optimise.
# NOTE(review): the search is fitted on the SMOTE-resampled training data, so
# validation folds presumably contain synthetic rows; resampling inside each
# fold (e.g. via an imblearn Pipeline) would avoid that -- confirm.
logreg_cv = GridSearchCV(estimator = logreg,
                         param_grid = search_params,
                         cv = 5)

In [50]:
# Fit the grid search on the preprocessed training data
logreg_cv.fit(X = X_train_clean,
              y = y_train)

GridSearchCV(cv=5,
             estimator=LogisticRegression(random_state=123, solver='liblinear'),
             param_grid={'C': array([1.00000000e-05, 3.35981829e-05, 1.12883789e-04, 3.79269019e-04,
       1.27427499e-03, 4.28133240e-03, 1.43844989e-02, 4.83293024e-02,
       1.62377674e-01, 5.45559478e-01, 1.83298071e+00, 6.15848211e+00,
       2.06913808e+01, 6.95192796e+01, 2.33572147e+02, 7.84759970e+02,
       2.63665090e+03, 8.85866790e+03, 2.97635144e+04, 1.00000000e+05]),
                         'penalty': ['l1', 'l2']})

In [51]:
logreg_cv.best_params_

{'C': 0.5455594781168515, 'penalty': 'l1'}

In [52]:
# Build the final model from the best hyperparameters found by the search.
# NOTE(review): this rebinds `logreg`, replacing the base estimator created
# earlier. With GridSearchCV's default refit behaviour,
# `logreg_cv.best_estimator_` should already hold an equivalent fitted model --
# confirm before dropping this refit.
logreg = LogisticRegression(penalty = logreg_cv.best_params_["penalty"],
                            C = logreg_cv.best_params_["C"],
                            solver = "liblinear",
                            random_state = 123)

# Fit the model on the preprocessed training data
logreg.fit(X_train_clean, y_train)

LogisticRegression(C=0.5455594781168515, penalty='l1', random_state=123,
                   solver='liblinear')

In [53]:
y_pred_train_proba = logreg.predict_proba(X_train_clean)
y_pred_train_proba

array([[0.88987886, 0.11012114],
       [0.9796978 , 0.0203022 ],
       [0.51226011, 0.48773989],
       ...,
       [0.51190677, 0.48809323],
       [0.20238941, 0.79761059],
       [0.62949238, 0.37050762]])

In [54]:
y_pred_train = logreg.predict(X_train_clean)
y_pred_train

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [55]:
from sklearn.metrics import classification_report

print(classification_report(y_true = y_train,
                            y_pred = y_pred_train,
                            target_names = ["no", "yes"]))

              precision    recall  f1-score   support

          no       0.89      0.86      0.88     29229
         yes       0.87      0.90      0.88     29229

    accuracy                           0.88     58458
   macro avg       0.88      0.88      0.88     58458
weighted avg       0.88      0.88      0.88     58458



In [56]:
# Class probabilities predicted for the held-out test set. They get their own
# name so the training-set probabilities computed in the earlier cell are not
# overwritten by (and mislabelled as) test results.
y_pred_test_proba = logreg.predict_proba(X_test_clean)
y_pred_test_proba

array([[0.97212466, 0.02787534],
       [0.09247092, 0.90752908],
       [0.98395184, 0.01604816],
       ...,
       [0.77919427, 0.22080573],
       [0.93766424, 0.06233576],
       [0.94497687, 0.05502313]])

In [57]:
y_pred_test = logreg.predict(X_test_clean)
y_pred_test

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [58]:
from sklearn.metrics import classification_report

print(classification_report(y_true = y_test,
                            y_pred = y_pred_test,
                            target_names = ["no", "yes"]))

              precision    recall  f1-score   support

          no       0.98      0.87      0.92      7308
         yes       0.45      0.87      0.60       928

    accuracy                           0.87      8236
   macro avg       0.72      0.87      0.76      8236
weighted avg       0.92      0.87      0.88      8236

