# Logistic model

## Import libraries and load data

In [None]:
import pandas as pd
import numpy as np
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression

# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.metrics import confusion_matrix, classification_report, precision_score
# from sklearn import preprocessing
# from sklearn import neighbors

from statsmodels.formula.api import logit

%matplotlib inline

In [None]:
# Download the Default dataset as a DataFrame.
data_url = "https://github.com/pykale/transparentML/raw/main/data/Default.csv"
df = pd.read_csv(data_url)

# pd.factorize() yields a (codes, uniques) pair. Only the integer codes
# (index 0) are kept, giving numeric versions of the two label columns;
# codes are assigned in order of first appearance.
df["default2"] = pd.factorize(df["default"])[0]
df["student2"] = pd.factorize(df["student"])[0]

# Preview the first three rows, including the two new encoded columns.
df.head(3)

### Logistic model

Logistic regression models the probability that `y` belongs to a particular category rather than modelling this response `y` directly. For the `Default` data, logistic regression models the probability of default. For example, the probability of default given balance can be written as

$$
\mathbb{P}(\text{default} = \text{Yes} \mid \text{balance}).
$$

Example using `scikit-learn`:

In [None]:
# Fit a single-feature logistic regression of default status on balance.
clf = LogisticRegression(solver="newton-cg")

# scikit-learn expects a 2-D feature matrix, so the balance values are
# reshaped into a single column.
X_train = df.balance.values.reshape(-1, 1)

# Target vector: the integer-coded default indicator built with factorize().
# It has to be defined here; without it, fit() fails with a NameError on `y`.
y = df.default2

clf.fit(X_train, y)

# Report the fitted estimator, its class labels and the learned parameters.
# NOTE(review): LogisticRegression is regularised by default, so these values
# may differ slightly from the statsmodels estimates - confirm if an exact
# match is expected.
print(clf)
print("classes: ", clf.classes_)
print("coefficients: ", clf.coef_)
print("intercept :", clf.intercept_)

Example using `statsmodels`:

In [None]:
# Specify the logit model with an R-style formula: the integer-coded default
# indicator regressed on account balance.
balance_model = logit(formula="default2 ~ balance", data=df)

# Estimate the model, then display the coefficient table (second table of
# the summary2() report).
est = balance_model.fit()
est.summary2().tables[1]

In [None]:
# Specify the logit model with student status as the only predictor. The
# formula refers to the original `student` column, not the factorized
# `student2` column.
student_model = logit(formula="default2 ~ student", data=df)

# Estimate the model (rebinding `est`), then display the coefficient table
# (second table of the summary2() report).
est = student_model.fit()
est.summary2().tables[1]