In [1]:
import pandas as pd

In [3]:
# Load the Default dataset from CSV. The path is relative to the notebook's
# working directory, so the kernel must be started from this folder.
default_data = pd.read_csv('../Data Sets/Default.csv')

In [4]:
# Preview the first rows to check the columns and values loaded as expected.
default_data.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


$$
\log\left(\frac{p(X)}{1 - p(X)}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p
$$

$$
p(X) = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}
$$


In [6]:
# One-hot encode the categorical 'student' column. drop_first=True keeps only
# the 'student_Yes' indicator, so 'No' becomes the reference level.
default_data_encoded = pd.get_dummies(
    default_data,
    columns=['student'],
    drop_first=True,
)

In [7]:
# Confirm 'student' was replaced by a single 'student_Yes' indicator column.
default_data_encoded.head()

Unnamed: 0,default,balance,income,student_Yes
0,No,729.526495,44361.625074,False
1,No,817.180407,12106.1347,True
2,No,1073.549164,31767.138947,False
3,No,529.250605,35704.493935,False
4,No,785.655883,38463.495879,False


In [8]:
# Feature matrix for scikit-learn: every encoded column except the response.
X = default_data_encoded.drop(columns='default')

In [9]:
# Check the feature matrix: 'default' should be gone, leaving
# balance, income and student_Yes (in that order).
X.head()

Unnamed: 0,balance,income,student_Yes
0,729.526495,44361.625074,False
1,817.180407,12106.1347,True
2,1073.549164,31767.138947,False
3,529.250605,35704.493935,False
4,785.655883,38463.495879,False


In [10]:
# Response vector: the 'default' column, kept as its original 'No'/'Yes' labels.
y = default_data_encoded['default']

In [11]:
# Preview the response; it is a string-labelled (object dtype) Series.
y.head()

0    No
1    No
2    No
3    No
4    No
Name: default, dtype: object

In [12]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Logistic regression classifier using the Newton-CG solver.
# NOTE(review): scikit-learn applies an L2 penalty (C=1.0) by default, so the
# fitted coefficients are slightly shrunk relative to an unpenalised
# maximum-likelihood fit. This is the likely reason they do not exactly match
# the statsmodels estimates later in the notebook. If an exact match is
# intended, disable the penalty (e.g. penalty=None on recent scikit-learn
# versions) -- confirm against the installed version.
model = LogisticRegression(solver='newton-cg')

In [41]:
# Fit on the full dataset (no train/test split is made in this notebook).
# The string labels in y are accepted directly and become model.classes_.
model.fit(X, y)



In [42]:
# Fitted intercept (beta_0) on the log-odds scale.
model.intercept_

array([-10.86390203])

In [46]:
# Fitted slopes on the log-odds scale, in the column order of X:
# balance, income, student_Yes.
model.coef_

array([[ 5.71857291e-03,  3.51656179e-06, -6.22207599e-01]])

In [17]:
# Two hypothetical customers to score. The column names and their order mirror
# the training matrix X (balance, income, student_Yes) so the fitted model can
# consume this frame directly.
unseen_X = pd.DataFrame({
    'balance': [1000, 2000],
    'income': [40000, 50000],
    'student_Yes': [True, False]
})

In [18]:
# Display the hand-built rows that will be scored.
unseen_X.head()

Unnamed: 0,balance,income,student_Yes
0,1000,40000,True
1,2000,50000,False


In [19]:
# Hard class labels ('No'/'Yes') for the unseen rows.
predictions = model.predict(unseen_X)

In [20]:
# Show the predicted labels.
predictions

array(['No', 'No'], dtype=object)

In [21]:
# Per-class probabilities for the unseen rows: one row per customer, one column
# per class, with columns ordered as in model.classes_.
prediction_probabilities = model.predict_proba(unseen_X)

In [22]:
# Show the probability matrix; each row sums to 1.
prediction_probabilities

array([[9.99700970e-01, 2.99029740e-04],
       [8.14799409e-01, 1.85200591e-01]])

In [23]:
# Class labels in the order used for the predict_proba columns.
model.classes_

array(['No', 'Yes'], dtype=object)

In [25]:
# Side-by-side view of the inputs, the per-class probabilities and the predicted
# label for each unseen row.
# The probability columns are labelled from model.classes_ rather than a
# hard-coded ['No', 'Yes'] list, so the headers cannot drift out of sync with
# the column order predict_proba actually returns. The helper frames reuse
# unseen_X's index so the column-wise concat lines rows up even if unseen_X is
# later given a non-default index.
pd.concat([
    unseen_X,
    pd.DataFrame(prediction_probabilities, columns=model.classes_, index=unseen_X.index),
    pd.DataFrame(predictions, columns=['Predictions'], index=unseen_X.index)
], axis=1)

Unnamed: 0,balance,income,student_Yes,No,Yes,Predictions
0,1000,40000,True,0.999701,0.000299,No
1,2000,50000,False,0.814799,0.185201,No


In [26]:
import statsmodels.api as sm

In [27]:
# Re-display the scikit-learn feature matrix for reference before building the
# statsmodels design matrix.
X.head()

Unnamed: 0,balance,income,student_Yes
0,729.526495,44361.625074,False
1,817.180407,12106.1347,True
2,1073.549164,31767.138947,False
3,529.250605,35704.493935,False
4,785.655883,38463.495879,False


In [28]:
# Re-display the response; it still holds 'No'/'Yes' strings at this point.
y.head()

0    No
1    No
2    No
3    No
4    No
Name: default, dtype: object

In [29]:
# Start the statsmodels design matrix from the raw (un-encoded) data, removing
# only the response column.
X_sm = default_data.drop(columns='default')

In [30]:
# 'student' is still a 'No'/'Yes' string column at this stage.
X_sm.head()

Unnamed: 0,student,balance,income
0,No,729.526495,44361.625074
1,Yes,817.180407,12106.1347
2,No,1073.549164,31767.138947
3,No,529.250605,35704.493935
4,No,785.655883,38463.495879


In [31]:
# Encode the student flag as 0/1 for statsmodels.
# The mapping reads from the untouched raw frame (default_data) instead of from
# X_sm itself, which makes this cell idempotent: mapping X_sm['student'] onto
# itself would, on a second run, push the already-encoded 0/1 values through
# the {'Yes', 'No'} dict and turn the whole column into NaN. X_sm was derived
# from default_data, so both share the same row index and the assignment aligns.
X_sm['student'] = default_data['student'].map({'Yes': 1, 'No': 0})

In [32]:
# Confirm 'student' is now numeric 0/1.
X_sm.head()

Unnamed: 0,student,balance,income
0,0,729.526495,44361.625074
1,1,817.180407,12106.1347
2,0,1073.549164,31767.138947
3,0,529.250605,35704.493935
4,0,785.655883,38463.495879


In [33]:
# Prepend a 'const' column of ones so the statsmodels fit includes an intercept
# term (sm.Logit does not add one on its own).
X_sm = sm.add_constant(X_sm)

In [34]:
# Final design matrix: const, student, balance, income.
X_sm.head()

Unnamed: 0,const,student,balance,income
0,1.0,0,729.526495,44361.625074
1,1.0,1,817.180407,12106.1347
2,1.0,0,1073.549164,31767.138947
3,1.0,0,529.250605,35704.493935
4,1.0,0,785.655883,38463.495879


In [35]:
# Recode the 'No'/'Yes' response to 0/1 for the statsmodels fit. This builds a
# new Series and leaves y (used by the scikit-learn model) unchanged.
y_sm = y.map({'Yes': 1, 'No': 0})

In [36]:
# Confirm the response is now integer 0/1.
y_sm.head()

0    0
1    0
2    0
3    0
4    0
Name: default, dtype: int64

In [37]:
# Fit the logistic regression with statsmodels. Logit takes the response first,
# then the design matrix (which must already contain the constant column).
# fit() prints the optimizer's convergence report.
model_sm = sm.Logit(y_sm, X_sm).fit()

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10


In [38]:
# Estimated coefficients, labelled by the design-matrix column names.
model_sm.params

const     -10.869045
student    -0.646776
balance     0.005737
income      0.000003
dtype: float64

In [45]:
# scikit-learn coefficients again, for comparison with the statsmodels params.
# Compare by feature, not by position: coef_ is ordered balance, income,
# student_Yes, whereas the statsmodels params are const, student, balance, income.
model.coef_

array([[ 5.71857291e-03,  3.51656179e-06, -6.22207599e-01]])

In [47]:
# Full regression table: coefficients, standard errors, z-statistics, p-values
# and confidence intervals, plus overall fit statistics.
print(model_sm.summary())

                           Logit Regression Results                           
Dep. Variable:                default   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9996
Method:                           MLE   Df Model:                            3
Date:                Fri, 18 Apr 2025   Pseudo R-squ.:                  0.4619
Time:                        18:23:20   Log-Likelihood:                -785.77
converged:                       True   LL-Null:                       -1460.3
Covariance Type:            nonrobust   LLR p-value:                3.257e-292
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -10.8690      0.492    -22.079      0.000     -11.834      -9.904
student       -0.6468      0.236     -2.738      0.006      -1.110      -0.184
balance        0.0057      0.000     24.737      0.0