In [1]:
# I am importing the dataset from the UC Irvine Machine Learning Repository
# See https://archive.ics.uci.edu/dataset/222/bank+marketing
# Run 'pip install ucimlrepo' on the command line first to install the package used to fetch the dataset

import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo 
from sklearn.linear_model import LogisticRegression

# Download dataset 222 from the UCI repository via ucimlrepo
bank_marketing = fetch_ucirepo(id=222)

# Keep the feature table and the target table under separate names
data_x = bank_marketing.data.features
data_y = bank_marketing.data.targets

In [2]:
# Recode the target label 'no'/'yes' as the integers 0/1.
# Series.map performs an explicit lookup, so the kernel-wide
# 'future.no_silent_downcasting' option does not have to be switched on for
# this single step. Any label other than 'no'/'yes' maps to NaN, and the
# integer cast below then raises instead of letting it through unnoticed.
y = data_y['y'].map({'no': 0, 'yes': 1}).astype('int')

In [3]:
# One-hot encode the listed categorical columns; each one is replaced by a set
# of '<column>_<value>' indicator columns. Columns that are not listed here
# (for example 'contact' and 'poutcome') are passed through unchanged.
data_encoded = pd.get_dummies(
    data=data_x,
    columns=[
        'job', 'marital', 'education', 'default',
        'housing', 'loan', 'month',
    ],
)

In [4]:
# Show the column names available after the encoding step
print(data_encoded.columns)

Index(['age', 'balance', 'contact', 'day_of_week', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'marital_divorced', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep'],
      dtype='object')


In [5]:
# Predictors used in the model: account balance plus three indicator columns
# produced by the one-hot encoding
X = data_encoded.loc[
    :,
    ['balance', 'default_yes', 'housing_yes', 'loan_yes'],
]

In [6]:
# Fit a logistic regression of y on X using every row (no train/test split).
# NOTE(review): max_iter=4000 is presumably there so the solver converges on
# the unscaled 'balance' column -- confirm.
model = LogisticRegression(max_iter=4000)
model.fit(X, y.to_numpy().ravel())

In [7]:
# Tabulate the fitted coefficients together with the intercept.
# scikit-learn does not report p-values; the statsmodels package does.
coefficients = model.coef_[0]
intercept = model.intercept_[0]
feature_rows = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients
})
intercept_row = pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [intercept]})
# ignore_index=True renumbers the rows 0..n-1. Without it the intercept row
# keeps its own label 0, which duplicates the label of the first feature row
# and makes coef_df.loc[0] return two rows.
coef_df = pd.concat([feature_rows, intercept_row], ignore_index=True)
print(coef_df)

       Feature  Coefficient
0      balance     0.000027
1  default_yes    -0.514384
2  housing_yes    -0.851010
3     loan_yes    -0.629064
0    Intercept    -1.572991


In [9]:
# In-sample predicted probabilities: keep the second column that
# predict_proba returns (index 1) for every row of X
y_pred = model.predict_proba(X)[:, 1]

In [10]:
# Compare the predicted probabilities between the two actual outcome groups
print(y_pred)

# Split the predictions by the true label (1 = 'yes', 0 = 'no')
prob_true_actuals = y_pred[y.to_numpy() == 1]
prob_false_actuals = y_pred[y.to_numpy() == 0]

# Mean predicted probability within each group
average_prob_true = prob_true_actuals.mean()
average_prob_false = prob_false_actuals.mean()

print(f"Average predicted probability for true actuals (label = 1): {average_prob_true:.4f}")
print(f"Average predicted probability for false actuals (label = 0): {average_prob_false:.4f}")

[0.08580238 0.08141944 0.0450877  ... 0.19492605 0.17437808 0.18353474]
Average predicted probability for true actuals (label = 1): 0.1399
Average predicted probability for false actuals (label = 0): 0.1140
