<p style="page-break-after:always;"></p>

# Regresión logística aplicada a OpenML

In [1]:
import warnings; warnings.filterwarnings("ignore"); import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
def err_eval(data_id, min_samples=10, test_size=0.2, random_state=23):
    """Estimate the hold-out error of logistic regression on one OpenML dataset.

    The dataset is downloaded with ``fetch_openml``, every row that contains a
    NaN feature is discarded, the remaining rows are shuffled and split into
    train/test partitions, and a ``LogisticRegression`` (default
    hyper-parameters apart from the seed) is fitted on the training part.

    Parameters
    ----------
    data_id : int
        OpenML dataset identifier, forwarded to ``fetch_openml``.
    min_samples : int, default=10
        Minimum number of NaN-free rows required to attempt training.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split`` as the size of the test partition.
    random_state : int, default=23
        Seed used for both the train/test split and the classifier.

    Returns
    -------
    float
        ``1 - accuracy`` measured on the test partition. ``1.0`` is also
        returned as a sentinel when fewer than ``min_samples`` rows survive
        the NaN filter, so a reported error of 100% does not necessarily mean
        that a model was trained and misclassified every sample.
    """
    X, y = fetch_openml(data_id=data_id, return_X_y=True, as_frame=False, parser="liac-arff")

    # Keep only the rows whose features are all present (no NaN anywhere).
    complete_rows = ~np.isnan(X).any(axis=1)
    X = X[complete_rows, :]
    y = y[complete_rows]

    # Too few usable samples: report the worst-case error instead of fitting.
    if X.shape[0] < min_samples:
        return 1.0

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )
    clf = LogisticRegression(random_state=random_state).fit(X_train, y_train)
    return 1 - accuracy_score(y_test, clf.predict(X_test))

In [3]:
import openml

# For each listed OpenML suite, evaluate every dataset it contains with
# err_eval and print one report line per dataset.
for sid in (99, 334):
    benchmark_suite = openml.study.get_suite(suite_id=sid)
    df = openml.datasets.list_datasets(
        data_id=benchmark_suite.data,
        output_format='dataframe',
    )
    # Walk the id, name and class-count columns in lockstep.
    dataset_rows = zip(df['did'], df['name'], df['NumberOfClasses'])
    for did, name, C in dataset_rows:
        err = err_eval(did)
        print(f"sid: {sid:5d}  did: {did:5d}  C: {C:5.0f}  err: {err:7.1%}  name: {name:s}")

sid:    99  did:     3  C:     2  err:    5.5%  name: kr-vs-kp
sid:    99  did:     6  C:    26  err:   23.7%  name: letter
sid:    99  did:    11  C:     3  err:    8.0%  name: balance-scale
sid:    99  did:    12  C:    10  err:    4.2%  name: mfeat-factors
sid:    99  did:    14  C:    10  err:   21.0%  name: mfeat-fourier
sid:    99  did:    15  C:     2  err:    2.2%  name: breast-w
sid:    99  did:    16  C:    10  err:    4.5%  name: mfeat-karhunen
sid:    99  did:    18  C:    10  err:   46.8%  name: mfeat-morphological
sid:    99  did:    22  C:    10  err:   18.2%  name: mfeat-zernike
sid:    99  did:    23  C:     3  err:   51.2%  name: cmc
sid:    99  did:    28  C:    10  err:    3.3%  name: optdigits
sid:    99  did:    29  C:     2  err:   13.7%  name: credit-approval
sid:    99  did:    31  C:     2  err:   25.5%  name: credit-g
sid:    99  did:    32  C:    10  err:    6.0%  name: pendigits
sid:    99  did:    37  C:     2  err:   17.5%  name: diabetes
sid:    99  did:

<p style="page-break-after:always;"></p>

In [4]:
import openml

# Same per-dataset report as for the other suites, here for suite 271 only.
for sid in (271, ):
    benchmark_suite = openml.study.get_suite(suite_id=sid)
    df = openml.datasets.list_datasets(
        data_id=benchmark_suite.data,
        output_format='dataframe',
    )
    dataset_rows = zip(df['did'], df['name'], df['NumberOfClasses'])
    for did, name, C in dataset_rows:
        # NOTE(review): dataset 41147 is deliberately left out; the notebook
        # does not state the reason — confirm and document it.
        if did != 41147:
            err = err_eval(did)
            print(f"sid: {sid:5d}  did: {did:5d}  C: {C:5.0f}  err: {err:7.1%}  name: {name:s}")

sid:   271  did:     3  C:     2  err:    5.5%  name: kr-vs-kp
sid:   271  did:    12  C:    10  err:    4.2%  name: mfeat-factors
sid:   271  did:    23  C:     3  err:   51.2%  name: cmc
sid:   271  did:    31  C:     2  err:   25.5%  name: credit-g
sid:   271  did:    54  C:     4  err:   29.4%  name: vehicle
sid:   271  did:   181  C:    10  err:   45.1%  name: yeast
sid:   271  did:   188  C:     5  err:   60.5%  name: eucalyptus
sid:   271  did:  1049  C:     2  err:   12.0%  name: pc4
sid:   271  did:  1067  C:     2  err:   17.5%  name: kc1
sid:   271  did:  1111  C:     2  err:  100.0%  name: KDDCup09_appetency
sid:   271  did:  1169  C:     2  err:   42.1%  name: airlines
sid:   271  did:  1457  C:    50  err:   37.3%  name: amazon-commerce-reviews
sid:   271  did:  1461  C:     2  err:   11.5%  name: bank-marketing
sid:   271  did:  1464  C:     2  err:   27.3%  name: blood-transfusion-service-center
sid:   271  did:  1468  C:     9  err:    6.0%  name: cnae-9
sid:   271  di