In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import requests
import yaml

import pandas as pd

In [16]:
def load_metadata(dataset_name, timeout=30):
    """Download and parse the PMLB ``metadata.yaml`` of one dataset.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset directory in the PMLB GitHub repository,
        e.g. ``"irish"``.
    timeout : float, optional
        Seconds to wait for the server before the request is aborted.

    Returns
    -------
    object
        The result of ``yaml.safe_load`` on the downloaded text (a dict with
        keys such as ``"features"`` for the metadata files used here).

    Raises
    ------
    requests.HTTPError
        If the server replies with an error status, e.g. for an unknown
        dataset name.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    link = f"https://raw.githubusercontent.com/EpistasisLab/pmlb/master/datasets/{dataset_name}/metadata.yaml"
    response = requests.get(link, timeout=timeout)
    # Fail loudly on HTTP errors: otherwise the body of an error page would be
    # handed to the YAML parser and returned as if it were dataset metadata.
    response.raise_for_status()
    return yaml.safe_load(response.text)

In [42]:
# Load the PMLB overview table and keep only the datasets of interest:
#   * the feynman datasets (119) and the fri_c datasets (60) are filtered out,
#   * the Metadata column is dropped,
#   * the index is renumbered from 0 after filtering.
pmlb_data = (
    pd.read_csv("../metadata/Penn Machine Learning Benchmarks.csv")
    .loc[lambda frame: ~frame["Dataset"].str.contains("feynman")]
    .loc[lambda frame: ~frame["Dataset"].str.contains("fri_c")]
    .drop(columns="Metadata")
    .reset_index(drop=True)
)

# Endpoints with exactly two classes are relabelled as "binary".
has_two_classes = pmlb_data["n_classes"] == 2
pmlb_data.loc[has_two_classes, "Endpoint"] = "binary"

In [43]:
pmlb_data

Unnamed: 0,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task
0,adult,48842,14,2.0,binary,0.27,classification
1,agaricus_lepiota,8145,22,2.0,binary,0.00,classification
2,allbp,3772,29,3.0,categorical,0.88,classification
3,allhyper,3771,29,4.0,categorical,0.93,classification
4,allhypo,3770,29,3.0,categorical,0.78,classification
...,...,...,...,...,...,...,...
231,strogatz_shearflow1,400,2,,continuous,0.00,regression
232,strogatz_shearflow2,400,2,,continuous,0.00,regression
233,strogatz_vdp1,400,2,,continuous,0.00,regression
234,strogatz_vdp2,400,2,,continuous,0.00,regression


In [44]:
# Count, for every dataset, how many features of each type its metadata lists.
# One column per feature type; these five always exist, even when no dataset
# has a feature of that type.
feature_count_columns = [
    "feat_n_continuous",
    "feat_n_categorical",
    "feat_n_nominal",
    "feat_n_binary",
    "feat_n_ordinal",
]

# Build one record per dataset and create the frame once, instead of assigning
# row by row with positional indices that only match a 0..n-1 index.
feature_records = []
for dset in pmlb_data["Dataset"]:
    counts = dict.fromkeys(feature_count_columns, 0)
    for feature in load_metadata(dset)["features"]:
        column = f"feat_n_{feature['type']}"
        # A type outside the five known ones gets its own column.
        counts[column] = counts.get(column, 0) + 1
    feature_records.append(counts)

# Rows are aligned to pmlb_data by its own index labels; a type missing for a
# dataset shows up as NaN after construction and is counted as 0.
feature_counts = (
    pd.DataFrame(feature_records, index=pmlb_data.index)
    .fillna(0)
    .astype(int)
)

# add nominal to categorical and remove nominal
feature_counts["feat_n_categorical"] += feature_counts["feat_n_nominal"]
feature_counts = feature_counts.drop(columns="feat_n_nominal")

# Drop count columns left over from an earlier run so the cell can be re-run.
pmlb_data = pmlb_data.drop(columns=feature_counts.columns, errors="ignore").join(
    feature_counts
)

In [45]:
pmlb_data

Unnamed: 0,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task,feat_n_continuous,feat_n_categorical,feat_n_binary,feat_n_ordinal
0,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
1,agaricus_lepiota,8145,22,2.0,binary,0.00,classification,1,16,5,0
2,allbp,3772,29,3.0,categorical,0.88,classification,6,4,19,0
3,allhyper,3771,29,4.0,categorical,0.93,classification,6,4,19,0
4,allhypo,3770,29,3.0,categorical,0.78,classification,6,4,19,0
...,...,...,...,...,...,...,...,...,...,...,...
231,strogatz_shearflow1,400,2,,continuous,0.00,regression,2,0,0,0
232,strogatz_shearflow2,400,2,,continuous,0.00,regression,2,0,0,0
233,strogatz_vdp1,400,2,,continuous,0.00,regression,2,0,0,0
234,strogatz_vdp2,400,2,,continuous,0.00,regression,2,0,0,0


In [37]:
# Fetch the metadata of a single dataset ("irish") to look at its structure.
m = load_metadata("irish")

In [38]:
m

{'dataset': 'irish',
 'description': 'Educational transitions of Irish schoolchildren, aged 11 in 1967.',
 'source': 'https://www.openml.org/d/451',
 'publication': 'Monitoring the Learning Outcomes of Education Systems by Vincent Greaney and Thomas Kelleghan',
 'task': 'classification',
 'keywords': ['education', 'ses'],
 'target': {'type': 'binary',
  'description': 'Leaving certificate taken',
  'code': "'not_taken'=1, 'taken'=2\n"},
 'features': [{'name': 'Sex',
   'type': 'binary',
   'description': 'Sex of child',
   'code': "'male'=1, 'female'=0\n"},
  {'name': 'DVRT',
   'type': 'continuous',
   'description': 'Drumcondra Verbal Reasoning Test score'},
  {'name': 'Educational_level',
   'type': 'nominal',
   'description': 'attained education level',
   'code': "'3rd_level_complete'=0, '3rd_level_incomplete'=1, '?'=2, 'Junior_cycle_incomplete-secondary_school'=3,  'Junior_cycle_incomplete-vocational_school'=4, 'Junior_cycle_terminal_leaver-secondary_school'=5, 'Junior_cycle_ter

In [None]:
# Persist the processed table in the same directory as the raw overview CSV.
# The pandas default is kept, so the row index is written as the first column.
pmlb_data.to_csv("../metadata/pmlb_data_processed.csv")