In [1]:
import os
import requests
import numpy as np
import polars as pl
from io import BytesIO
from scipy.io import arff
from category_encoders import TargetEncoder, LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Notebook-wide constants.
# File name of the ARFF dataset; it serves both as the local file path and as
# the final segment of the download URL.
ARFF_DATASET = 'dataset_31_credit-g.arff'
# Base URL the dataset file name is appended to (note the trailing slash).
TUNGUZ_DATASET_REPO_URL = (
    'https://raw.githubusercontent.com/tunguz/TabularBenchmarks/main/datasets/credit-g/input/'
)

In [3]:
# Load the ARFF dataset into `data` / `meta`, downloading it on first use,
# then wrap the records in a polars DataFrame (`arff_data`).
url = f'{TUNGUZ_DATASET_REPO_URL}{ARFF_DATASET}'

# Download only when there is no local copy yet, so later runs reuse the file
# saved next to the notebook instead of hitting the network again.
if not os.path.exists(ARFF_DATASET):
    # A timeout keeps "Run All" from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Fail loudly here: merely printing and carrying on would leave `data`
        # undefined and surface as an unrelated NameError further down.
        raise RuntimeError(
            f"Couldn't download the file: {url} (HTTP {response.status_code})"
        )
    with open(ARFF_DATASET, 'wb') as f:
        f.write(response.content)

# Single load path for both the cached and the freshly downloaded file; the
# file is opened in text mode for reading with arff.loadarff.
with open(ARFF_DATASET, 'rt') as f:
    data, meta = arff.loadarff(f)

arff_data = pl.DataFrame(data)

In [4]:
# Sanity check: show the first raw record returned by arff.loadarff.
data[0]

(b'<0', 6., b'critical/other existing credit', b'radio/tv', 1169., b'no known savings', b'>=7', 4., b'male single', b'none', 4., b'real estate', 67., b'none', b'own', 2., b'skilled', 1., b'yes', b'yes', b'good')

In [5]:
# Display the metadata object returned by arff.loadarff alongside the records.
meta

Dataset: german_credit
	checking_status's type is nominal, range is ('<0', '0<=X<200', '>=200', 'no checking')
	duration's type is numeric
	credit_history's type is nominal, range is ('no credits/all paid', 'all paid', 'existing paid', 'delayed previously', 'critical/other existing credit')
	purpose's type is nominal, range is ('new car', 'used car', 'furniture/equipment', 'radio/tv', 'domestic appliance', 'repairs', 'education', 'vacation', 'retraining', 'business', 'other')
	credit_amount's type is numeric
	savings_status's type is nominal, range is ('<100', '100<=X<500', '500<=X<1000', '>=1000', 'no known savings')
	employment's type is nominal, range is ('unemployed', '<1', '1<=X<4', '4<=X<7', '>=7')
	installment_commitment's type is numeric
	personal_status's type is nominal, range is ('male div/sep', 'female div/dep/mar', 'male single', 'male mar/wid', 'female single')
	other_parties's type is nominal, range is ('none', 'co applicant', 'guarantor')
	residence_since's type is nume

In [6]:
# Preview the first rows of the polars DataFrame built from the ARFF records.
arff_data.head()

checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
binary,f64,binary,binary,f64,binary,binary,f64,binary,binary,f64,binary,f64,binary,binary,f64,binary,f64,binary,binary,binary
[binary data],6.0,[binary data],[binary data],1169.0,[binary data],[binary data],4.0,[binary data],[binary data],4.0,[binary data],67.0,[binary data],[binary data],2.0,[binary data],1.0,[binary data],[binary data],[binary data]
[binary data],48.0,[binary data],[binary data],5951.0,[binary data],[binary data],2.0,[binary data],[binary data],2.0,[binary data],22.0,[binary data],[binary data],1.0,[binary data],1.0,[binary data],[binary data],[binary data]
[binary data],12.0,[binary data],[binary data],2096.0,[binary data],[binary data],2.0,[binary data],[binary data],3.0,[binary data],49.0,[binary data],[binary data],1.0,[binary data],2.0,[binary data],[binary data],[binary data]
[binary data],42.0,[binary data],[binary data],7882.0,[binary data],[binary data],2.0,[binary data],[binary data],4.0,[binary data],45.0,[binary data],[binary data],1.0,[binary data],2.0,[binary data],[binary data],[binary data]
[binary data],24.0,[binary data],[binary data],4870.0,[binary data],[binary data],3.0,[binary data],[binary data],4.0,[binary data],53.0,[binary data],[binary data],2.0,[binary data],2.0,[binary data],[binary data],[binary data]
