Basically, there are 3 different types of tasks:

1. Regression
2. Binary Classification
3. Multiclass Classification

However, we can split them further:

3. Multiclass Classification into ...
    - 3-class
    - 4-class
    - ...

In [1]:
import openml

import pandas as pd

from sklearn.datasets._openml import _get_data_description_by_id

## Filtering available datasets

- Active datasets
- No missing values
    - If the original dataset already contains missing values, we cannot trust the changes in downstream task performance
- 3k to 100k instances
- 5 to 25 features
- Drop duplicated datasets
- Drop datasets with the same name
- Drop datasets with no information about the number of classes
- Drop datasets where number of features, number of instances, class distribution are the same (high probability to be duplicated)
- Remove some remaining duplicates by hand
- (At the end) we only use 50 datasets for each task (regression, binary, multiclass)

In [2]:
# First get all available datasets.
# Note: this calls `list_tasks`, so the frame is a task listing; later cells
# read dataset-level columns from it (e.g. "did", "name", "NumberOfInstances")
# and drop task-level ones (e.g. "tid", "ttid", "task_type").
all_datasets = openml.tasks.list_tasks(output_format="dataframe")

In [3]:
# Start from a copy so the listing in `all_datasets` stays untouched and this
# cell can be re-run on its own.
datasets = all_datasets.copy()

# Datasets without missing values
datasets = datasets[datasets["NumberOfInstancesWithMissingValues"] == 0]

# Active datasets
datasets = datasets[datasets["status"] == "active"]

# Rename the "symbolic" feature count to "categorical"
datasets = datasets.rename(columns={"NumberOfSymbolicFeatures": "NumberOfCategoricalFeatures"})

# Only look at datasets with at least 3000 instances and at least 5 features
datasets = datasets[datasets["NumberOfInstances"] >= 3000]
datasets = datasets[datasets["NumberOfFeatures"] >= 5]

# Datasets with at most 100k instances and 25 features
datasets = datasets[datasets["NumberOfInstances"] <= 100000]
datasets = datasets[datasets["NumberOfFeatures"] <= 25]

# Drop datasets with no information about the number of classes
datasets = datasets[~datasets["NumberOfClasses"].isna()]

# Can't work with sparse data.
# The description lookup is done once per distinct dataset id instead of once
# per row: rows are not de-duplicated yet at this point, so the same "did" may
# occur several times and every lookup is a separate call.
format_by_did = {
    did: _get_data_description_by_id(did, None)["format"]
    for did in datasets["did"].drop_duplicates()
}
# `.assign` builds a new frame, which avoids writing a column into a filtered
# slice (SettingWithCopyWarning).
datasets = datasets.assign(format=datasets["did"].map(format_by_did))
datasets = datasets[datasets["format"] != "Sparse_ARFF"]

# Drop columns that are not used in the rest of the notebook
datasets = datasets.drop(columns=[
    "tid", "ttid", "task_type", "estimation_procedure", "evaluation_measures",
    "cost_matrix", "MaxNominalAttDistinctValues", "status", "target_value",
    "NumberOfMissingValues", "target_feature", "source_data", "number_samples",
    "source_data_labeled", "target_feature_event", "target_feature_left",
    "target_feature_right", "quality_measure", "NumberOfInstancesWithMissingValues", "format"
])

# Drop fully duplicated rows, and explicitly drop rows that share the same name
datasets = datasets.drop_duplicates()
datasets = datasets.drop_duplicates(["name"])

### Regression datasets

Regression datasets are datasets with `0` classes.

In [4]:
# Regression datasets: rows whose class count is 0.
# Steps, in order:
#   1. keep one row per (features, instances, numeric, categorical) signature,
#   2. order by total number of values (features * instances), smallest first,
#   3. drop the class-related columns and the temporary sort key.
regression = (
    datasets.loc[datasets["NumberOfClasses"] == 0]
    .drop_duplicates(
        subset=[
            "NumberOfFeatures",
            "NumberOfInstances",
            "NumberOfNumericFeatures",
            "NumberOfCategoricalFeatures",
        ]
    )
    .assign(
        number_of_values=lambda frame: frame["NumberOfFeatures"] * frame["NumberOfInstances"]
    )
    .sort_values("number_of_values")
    .drop(
        columns=[
            "MajorityClassSize",
            "MinorityClassSize",
            "NumberOfClasses",
            "number_of_values",
        ]
    )
)

In [5]:
# Dataset ids ("did") of duplicates that were identified by hand;
# keep only the rows whose id is not in this list.
drop = [227, 42635, 1414, 42092]
regression = regression.loc[~regression["did"].isin(drop)]

In [6]:
# Renumber the rows from 0 after all the filtering, then show at most the first 50.
regression = regression.reset_index(drop=True)
regression.head(50)

Unnamed: 0,did,name,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,42545,stock_fardamento02,7.0,6277.0,6.0,1.0
1,42675,auml_eml_1_d,11.0,4585.0,11.0,0.0
2,198,delta_elevators,7.0,9517.0,7.0,0.0
3,23515,sulfur,7.0,10081.0,7.0,0.0
4,189,kin8nm,9.0,8192.0,9.0,0.0
5,287,wine_quality,12.0,6497.0,12.0,0.0
6,42636,Long,20.0,4477.0,20.0,0.0
7,42688,Brazilian_houses,13.0,10692.0,9.0,4.0
8,42183,dataset_sales,15.0,10738.0,15.0,0.0
9,1199,BNG(echoMonths),10.0,17496.0,7.0,3.0


### Classification datasets

#### Binary Classification datasets

Binary Classification datasets are datasets with `2` classes.

In [7]:
# Binary classification datasets: rows whose class count is 2.
# Steps, in order:
#   1. keep one row per (class sizes, features, instances, numeric, categorical) signature,
#   2. order by total number of values (features * instances), smallest first,
#   3. drop the class count and the temporary sort key.
binary_classification = (
    datasets.loc[datasets["NumberOfClasses"] == 2]
    .drop_duplicates(
        subset=[
            "MajorityClassSize",
            "MinorityClassSize",
            "NumberOfFeatures",
            "NumberOfInstances",
            "NumberOfNumericFeatures",
            "NumberOfCategoricalFeatures",
        ]
    )
    .assign(
        number_of_values=lambda frame: frame["NumberOfFeatures"] * frame["NumberOfInstances"]
    )
    .sort_values("number_of_values")
    .drop(columns=["NumberOfClasses", "number_of_values"])
)

In [8]:
# filter out duplicates by hand
# Dataset ids ("did") of duplicates that were identified by hand;
# keep only the rows whose id is not in this list.
drop = [
    41865, 41862, 41859, 41856, 41851, 41842, 41870, 41838,
    41835, 41833, 41832, 41843, 41871, 41878, 41877, 41831,
    41879, 41881, 41883, 41884, 41885, 41886, 41888, 41889,
    41891, 41893, 41896, 41898, 41873, 41828, 41825, 41767,
    41709, 41712, 41715, 41718, 41723, 41727, 41734, 41736,
    41739, 41742, 41758, 41759, 41762, 41763, 41827, 41780,
    41773, 41824, 41820, 41816, 41806, 41805, 41804, 41768,
    41799, 41787, 41782, 41781, 41779, 41777, 41792, 821, 42178, 41860
]
binary_classification = binary_classification.loc[
    ~binary_classification["did"].isin(drop)
]

In [9]:
# Renumber the rows from 0 after all the filtering, then show at most the first 50.
binary_classification = binary_classification.reset_index(drop=True)
binary_classification.head(50)

Unnamed: 0,did,name,MajorityClassSize,MinorityClassSize,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,737,space_ga,1566.0,1541.0,7.0,3107.0,6.0,1.0
1,871,pollen,1924.0,1924.0,6.0,3848.0,5.0,1.0
2,40983,wilt,4578.0,261.0,6.0,4839.0,5.0,1.0
3,728,analcatdata_supreme,3081.0,971.0,8.0,4052.0,7.0,1.0
4,1489,phoneme,3818.0,1586.0,6.0,5404.0,5.0,1.0
5,803,delta_ailerons,3783.0,3346.0,6.0,7129.0,5.0,1.0
6,923,visualizing_soil,4753.0,3888.0,5.0,8641.0,3.0,2.0
7,725,bank8FM,4885.0,3307.0,9.0,8192.0,8.0,1.0
8,42192,compas-two-years,2795.0,2483.0,14.0,5278.0,7.0,7.0
9,1558,bank-marketing,4000.0,521.0,17.0,4521.0,7.0,10.0


#### Multiclass Classification datasets

Multiclass Classification datasets are datasets with more than `2` classes.

In [10]:
# Multiclass classification datasets: rows with more than 2 classes.
# Steps, in order:
#   1. keep one row per (class sizes, features, instances, numeric, categorical) signature,
#   2. order by total number of values (features * instances), smallest first,
#   3. drop the temporary sort key (the class count is kept here).
multiclass_classification = (
    datasets.loc[datasets["NumberOfClasses"] > 2]
    .drop_duplicates(
        subset=[
            "MajorityClassSize",
            "MinorityClassSize",
            "NumberOfFeatures",
            "NumberOfInstances",
            "NumberOfNumericFeatures",
            "NumberOfCategoricalFeatures",
        ]
    )
    .assign(
        number_of_values=lambda frame: frame["NumberOfFeatures"] * frame["NumberOfInstances"]
    )
    .sort_values("number_of_values")
    .drop(columns=["number_of_values"])
)

In [11]:
# filter out duplicates by hand
# Dataset ids ("did") of duplicates that were identified by hand;
# keep only the rows whose id is not in this list.
drop = [119, 40678, 1222, 255]
multiclass_classification = multiclass_classification.loc[
    ~multiclass_classification["did"].isin(drop)
]

In [12]:
# Renumber the rows from 0 after all the filtering, then show at most the first 50.
multiclass_classification = multiclass_classification.reset_index(drop=True)
multiclass_classification.head(50)

Unnamed: 0,did,name,MajorityClassSize,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfNumericFeatures,NumberOfCategoricalFeatures
0,1526,wall-robot-navigation,2205.0,328.0,4.0,5.0,5456.0,4.0,1.0
1,183,abalone,689.0,1.0,28.0,9.0,4177.0,7.0,2.0
2,40498,wine-quality-white,2198.0,5.0,7.0,12.0,4898.0,11.0,1.0
3,30,page-blocks,4913.0,28.0,5.0,11.0,5473.0,10.0,1.0
4,40677,led24,337.0,296.0,10.0,25.0,3200.0,0.0,25.0
5,1459,artificial-characters,1416.0,600.0,10.0,8.0,10218.0,7.0,1.0
6,40497,thyroid-ann,3488.0,93.0,3.0,22.0,3772.0,21.0,1.0
7,4552,BachChoralHarmony,503.0,1.0,102.0,17.0,5665.0,2.0,15.0
8,26,nursery,4320.0,2.0,5.0,9.0,12960.0,0.0,9.0
9,375,JapaneseVowels,1614.0,782.0,9.0,15.0,9961.0,14.0,1.0


In [14]:
from pathlib import Path

# Sanity check on the combined size of the three selections.
# NOTE(review): 69 is the hard-coded total from the original notebook; confirm
# it is still the intended number whenever the filters above are changed.
EXPECTED_TOTAL_DATASETS = 69

dataset_counts = {
    "regression": len(regression),
    "multiclass": len(multiclass_classification),
    "binary": len(binary_classification),
}

# Include the per-task counts in the failure message so a mismatch can be
# traced to the selection that changed.
assert sum(dataset_counts.values()) == EXPECTED_TOTAL_DATASETS, (
    f"expected {EXPECTED_TOTAL_DATASETS} datasets in total, got "
    f"{sum(dataset_counts.values())}: {dataset_counts}"
)

In [15]:
# Build the paper version of the regression table: select the reported columns,
# cast the counts to integers and give the columns human-readable headers.
regression_for_paper = (
    regression[["did", "name", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures"]]
    # The counts are displayed as floats above (e.g. 6277.0); cast them so the
    # table shows plain integers. A column -> dtype mapping returns a new frame
    # instead of assigning back into a column subset.
    .astype({
        "NumberOfInstances": int,
        "NumberOfNumericFeatures": int,
        "NumberOfCategoricalFeatures": int,
    })
    .rename(columns={
        "did": "OpenML ID",
        "name": "Name",
        "NumberOfInstances": "# Instances",
        "NumberOfNumericFeatures": "# Num. Features",
        "NumberOfCategoricalFeatures": "# Cat. Features"
    })
)

regression_table = regression_for_paper.to_latex(
    index=False,
    caption="Regression datasets.",
    label="tab:regression_data"
)

regression_table_path = Path("../paper/tables/regression_table.tex")
# `write_text` does not create missing directories, so make sure the target
# folder exists before writing.
regression_table_path.parent.mkdir(parents=True, exist_ok=True)
# Last expression: the number of characters written is shown as the cell output.
regression_table_path.write_text(regression_table)

2140

In [16]:
# Build the paper version of the binary classification table: select the
# reported columns, cast the counts to integers and give the columns
# human-readable headers.
binary_classification_for_paper = (
    binary_classification[["did", "name", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures"]]
    # The counts are displayed as floats above (e.g. 3107.0); cast them so the
    # table shows plain integers. A column -> dtype mapping returns a new frame
    # instead of assigning back into a column subset.
    .astype({
        "NumberOfInstances": int,
        "NumberOfNumericFeatures": int,
        "NumberOfCategoricalFeatures": int,
    })
    .rename(columns={
        "did": "OpenML ID",
        "name": "Name",
        "NumberOfInstances": "# Instances",
        "NumberOfNumericFeatures": "# Num. Features",
        "NumberOfCategoricalFeatures": "# Cat. Features"
    })
)

binary_classification_table = binary_classification_for_paper.to_latex(
    index=False,
    caption="Binary classification datasets.",
    label="tab:binary_data"
)
# Backward-compatible alias: the original cell used this misspelled name.
bibinary_classification_table = binary_classification_table

binary_table_path = Path("../paper/tables/binary_table.tex")
# `write_text` does not create missing directories, so make sure the target
# folder exists before writing.
binary_table_path.parent.mkdir(parents=True, exist_ok=True)
# Last expression: the number of characters written is shown as the cell output.
binary_table_path.write_text(binary_classification_table)

3384

In [17]:
# Build the paper version of the multiclass classification table: select the
# reported columns, cast the counts to integers and give the columns
# human-readable headers.
multiclass_classification_for_paper = (
    multiclass_classification[["did", "name", "NumberOfInstances", "NumberOfNumericFeatures", "NumberOfCategoricalFeatures"]]
    # The counts are displayed as floats above (e.g. 5456.0); cast them so the
    # table shows plain integers. A column -> dtype mapping returns a new frame
    # instead of assigning back into a column subset.
    .astype({
        "NumberOfInstances": int,
        "NumberOfNumericFeatures": int,
        "NumberOfCategoricalFeatures": int,
    })
    .rename(columns={
        "did": "OpenML ID",
        "name": "Name",
        "NumberOfInstances": "# Instances",
        "NumberOfNumericFeatures": "# Num. Features",
        "NumberOfCategoricalFeatures": "# Cat. Features"
    })
)

multiclass_classification_table = multiclass_classification_for_paper.to_latex(
    index=False,
    caption="Multiclass classification datasets.",
    label="tab:multiclass_data"
)

multiclass_table_path = Path("../paper/tables/multiclass_table.tex")
# `write_text` does not create missing directories, so make sure the target
# folder exists before writing.
multiclass_table_path.parent.mkdir(parents=True, exist_ok=True)
# Last expression: the number of characters written is shown as the cell output.
multiclass_table_path.write_text(multiclass_classification_table)

2130