Imports

In [1]:
import warnings
# Installs a blanket "ignore" filter: warnings of every category are suppressed
# from this point on. NOTE(review): this presumably also hides diagnostics from
# the libraries imported below (e.g. solver convergence warnings) — confirm
# that is intended, or narrow the filter to specific categories.
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import os


Load credit-g dataset

In [2]:
# Fetch version 1 of the "credit-g" dataset from OpenML; as_frame=True asks for
# pandas containers rather than NumPy arrays.
credit = fetch_openml(name="credit-g", version=1, as_frame=True)

# Feature table and target labels, unpacked from the returned bunch.
X, y = credit.data, credit.target

# Preview the first rows of the feature table.
X.head()


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,4,real estate,67,none,own,2,skilled,1,yes,yes
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,2,real estate,22,none,own,1,skilled,1,none,yes
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,3,real estate,49,none,own,1,unskilled resident,2,none,yes
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,4,life insurance,45,none,for free,1,skilled,2,none,yes
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,4,no known property,53,none,for free,2,skilled,2,none,yes


Preprocessing

In [3]:
# Partition the feature columns by dtype so each group gets its own transformer.
categorical = X.select_dtypes(include=["object", "category"]).columns
# "number" matches every numeric dtype, not only the 32/64-bit int and float
# variants, so narrower or unsigned numeric columns are scaled rather than
# falling through both selectors.
numeric = X.select_dtypes(include="number").columns

# ColumnTransformer drops any column that is not listed (remainder="drop" is the
# default), so a column matching neither selector would silently vanish from
# the model. Fail loudly instead of training on a reduced feature set.
assigned = set(numeric) | set(categorical)
unassigned = [col for col in X.columns if col not in assigned]
if unassigned:
    raise ValueError(
        "Columns not covered by the numeric/categorical selectors: "
        f"{unassigned}"
    )

# Standardise numeric features; one-hot encode categorical ones.
# handle_unknown="ignore" keeps transform from raising on categories that were
# not seen during fit.
preprocess = ColumnTransformer([
    ("num", StandardScaler(), numeric),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
])


Train-test split

In [4]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and seeded (random_state=42) so it is repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


Baseline Logistic Regression

In [5]:
# Baseline: the shared preprocessing step followed by a logistic regression
# classifier capped at 200 solver iterations.
baseline_steps = [
    ("pre", preprocess),
    ("clf", LogisticRegression(max_iter=200)),
]
baseline_model = Pipeline(steps=baseline_steps)

# fit() returns the fitted pipeline, so training and prediction can be chained.
baseline_pred = baseline_model.fit(X_train, y_train).predict(X_test)

# Macro-averaged F1 on the held-out split (unweighted mean of per-class F1).
baseline_f1 = f1_score(y_true=y_test, y_pred=baseline_pred, average="macro")
baseline_f1


0.6361505966513521

Save baseline results

In [6]:
# Make sure the output directory exists; exist_ok=True makes this a no-op when
# it is already there. The path is relative to the current working directory.
os.makedirs("../results", exist_ok=True)

# One-row summary table describing the baseline run.
baseline_row = {
    "dataset": "credit-g",
    "model": "logistic_regression",
    "baseline_f1_macro": baseline_f1,
}
baseline_df = pd.DataFrame([baseline_row])

# Persist without the index column so the CSV holds only the three fields.
baseline_df.to_csv("../results/lr_creditg_baseline.csv", index=False)

baseline_df


Unnamed: 0,dataset,model,baseline_f1_macro
0,credit-g,logistic_regression,0.636151
