In [9]:
import openml
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [10]:
# Set to True to rebuild the dataset catalogue by fetching every candidate
# dataset from OpenML; when False the catalogue cached in ./input is loaded.
REFRESH_CATALOG = False

if REFRESH_CATALOG:
    warnings.filterwarnings("ignore")

    # List all datasets and their properties
    datasets_openml = openml.datasets.list_datasets(output_format="dataframe")
    print(datasets_openml.shape)
    # Keep datasets with 1000 < instances < 30000 and 10 < features < 500
    selec_datasets = datasets_openml.query(
        'NumberOfInstances>1000 and NumberOfInstances<30000 '
        'and NumberOfFeatures>10 and NumberOfFeatures<500')
    print(selec_datasets.shape)

    res_datasets = []
    failed_datasets = []  # (name, error) pairs for datasets that could not be fetched
    for name_ds in tqdm(selec_datasets.name):
        try:
            # Get dataset by name
            dataset = openml.datasets.get_dataset(name_ds)
            # Get the data itself as a dataframe, split into features and target
            X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
            if y is not None:
                res_datasets.append(dict(id_ds=dataset.dataset_id,
                                         name_ds=name_ds,
                                         nrows=X.shape[0],
                                         len_y=len(y),
                                         # Stored as text so the in-memory frame matches
                                         # what read_csv returns in the cached branch.
                                         type_y=str(y.dtypes),
                                         num_classes=len(np.unique(y)),
                                         target_name=dataset.default_target_attribute))
        except Exception as err:
            # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the loop; failures are recorded instead of vanishing.
            failed_datasets.append((name_ds, repr(err)))
            continue
    print(len(failed_datasets), 'datasets skipped because of errors')
    res_datasets = pd.DataFrame(res_datasets)
    res_datasets.to_csv('./input/res_datasets.csv', index=False)
else:
    res_datasets = pd.read_csv('./input/res_datasets.csv')

print(res_datasets.shape)

# Drop targets typed object / sparse float / int64 and keep datasets with
# more than 2000 target values and fewer than 40000 rows.
res_selec = res_datasets.query('type_y!="object" and type_y!="Sparse[float64, 0]" and len_y>2000 and type_y!="int64" and nrows<40000')
print(res_selec.shape)

res_selec = res_selec.drop_duplicates()
print(res_selec.shape)

# Remove datasets with similar name: two names are treated as similar when
# their first three characters match; the first occurrence is kept.
name_prefix = res_selec.name_ds.str[:3]
res_selec = res_selec.loc[~name_prefix.duplicated()]
print(res_selec.shape)
print(res_selec.head())

(911, 7)
(300, 7)
(158, 7)
(117, 7)
   id_ds      name_ds  nrows  len_y    type_y  num_classes target_name
0      3     kr-vs-kp   3196   3196  category            2       class
1      6       letter  20000  20000  category           26       class
7     24     mushroom   8124   8124  category            2       class
8     28    optdigits   5620   5620  category           10       class
9     30  page-blocks   5473   5473  category            5       class


In [11]:
warnings.filterwarnings("ignore")
# Feature dtypes noted for the selected datasets:
#   bool, category, float64, int64, object, uint8
# Target dtypes noted for the selected datasets:
#   bool, category, float64, uint8

scaler = StandardScaler()
label_enc = LabelEncoder()

# Numeric feature dtypes that are cast to float64, mean-imputed and standardised.
NUMERIC_DTYPES = (np.uint8, np.int64, np.float64, bool)


def encode_features(X):
    """Turn a raw feature frame into an all-numeric frame.

    Columns whose dtype is in NUMERIC_DTYPES are cast to float64, have missing
    values replaced by the column mean and are standardised with `scaler`.
    Columns of dtype category/object are one-hot encoded (float32 dummies,
    prefixed with the column name). Columns of any other dtype are left out.

    Returns a tuple (encoded_frame, skipped_column_names). `X` is not modified.
    """
    encoded_cols = []
    skipped_cols = []
    for namecol in X.columns:
        col = X[namecol]
        if col.dtype in NUMERIC_DTYPES:
            col = col.astype(np.float64)
            col = col.fillna(col.mean())
            scaled = scaler.fit_transform(col.values.reshape(-1, 1)).flatten()
            encoded_cols.append(pd.Series(scaled, index=X.index, name=namecol))
        elif str(col.dtype) in ('category', 'object'):
            encoded_cols.append(pd.get_dummies(col, prefix=namecol).astype(np.float32))
        else:
            skipped_cols.append(namecol)
    return pd.concat(encoded_cols, axis=1), skipped_cols


def encode_target(y, type_y, num_classes):
    """Encode the target column and name the kind of problem.

    float64 targets are mean-imputed and standardised ('regression');
    uint8/bool targets are cast to int32 and category targets are label-encoded,
    both reported as 'binary' when num_classes is 2 and 'multiclass' otherwise.

    Returns a tuple (y_values, type_prob).
    Raises ValueError for any other target type, so a dataset can never
    inherit the encoded target of the dataset processed before it.
    """
    type_y = str(type_y)
    if type_y == 'float64':
        y = y.fillna(y.mean())
        y_values = scaler.fit_transform(y.values.reshape(-1, 1)).flatten()
        return y_values, 'regression'
    if type_y in ('uint8', 'bool'):
        y_values = y.astype(np.int32).values.flatten()
    elif type_y == 'category':
        y_values = label_enc.fit_transform(y)
    else:
        raise ValueError('Unsupported target type: ' + type_y)
    return y_values, ('binary' if num_classes == 2 else 'multiclass')


res_basedata = []
for _, row in tqdm(res_selec.iterrows(), total=len(res_selec)):
    # Get dataset by name
    dataset = openml.datasets.get_dataset(row['name_ds'])
    # Get the data itself as a dataframe, split into features and target
    X, y, _, _ = dataset.get_data(dataset_format="dataframe", target=dataset.default_target_attribute)
    assert y.isna().sum()==0, 'target contains missing values'

    # Transform inputs and target
    X_prim, skipped_cols = encode_features(X)
    y_values, type_prob = encode_target(y, row['type_y'], row['num_classes'])

    # Output file name: the part of the dataset name before the first '-' or '_'
    name_file = row['name_ds'].replace('-','_').split('_')[0]
    print(name_file)
    if skipped_cols:
        print(row['name_ds'], 'skipped columns with unsupported dtypes:', skipped_cols)

    # Extend the catalogue row; NFs is the feature count before the target is appended
    row['name_file'] = name_file + '.csv'
    row['type_prob'] = type_prob
    row['NFs'] = X_prim.shape[1]
    X_prim['target_end'] = y_values
    res_basedata.append(row)

    if type_prob == 'regression':
        print(row['name_ds'], X.shape, X_prim.shape, type_prob, X_prim['target_end'].mean(), X_prim['target_end'].std())
    else:
        print(row['name_ds'], X.shape, X_prim.shape, type_prob, X_prim['target_end'].nunique())
    X_prim.to_csv('./datasets/' + row['name_file'], index=False)

res_basedata = pd.DataFrame(res_basedata)
res_basedata.to_csv('./input/res_basedata.csv', index=False)

  0%|          | 0/117 [00:00<?, ?it/s]

kr
kr-vs-kp (3196, 36) (3196, 75) binary 2
letter
letter (20000, 16) (20000, 17) multiclass 26
mushroom
mushroom (8124, 22) (8124, 126) binary 2
optdigits
optdigits (5620, 64) (5620, 65) multiclass 10
page
page-blocks (5473, 10) (5473, 11) multiclass 5
pendigits
pendigits (10992, 16) (10992, 17) multiclass 10
segment
segment (2310, 19) (2310, 20) multiclass 7
sick
sick (3772, 29) (3772, 53) binary 2
spambase
spambase (4601, 57) (4601, 58) binary 2




splice
splice (3190, 60) (3190, 288) multiclass 3
hypothyroid
hypothyroid (3772, 29) (3772, 53) multiclass 4
waveform
waveform-5000 (5000, 40) (5000, 41) multiclass 3
satimage
satimage (6430, 36) (6430, 37) multiclass 6
cpu
cpu_act (8192, 21) (8192, 22) multiclass 56
pol
pol (15000, 48) (15000, 49) multiclass 11
elevators
elevators (16599, 18) (16599, 19) regression -9.237612047534769e-16 1.0000301236576215
wine
wine_quality (6497, 11) (6497, 12) multiclass 7
Ailerons
Ailerons (13750, 40) (13750, 41) regression -1.0748573748225515e-16 1.000036365619955
coil2000
coil2000 (9822, 85) (9822, 86) binary 2
ozone
ozone_level (2536, 72) (2536, 22709) binary 2
puma32H
puma32H (8192, 32) (8192, 33) regression 8.673617379884035e-18 1.0000610407447539
scene
scene (2407, 299) (2407, 305) binary 2
yeast
yeast_ml8 (2417, 116) (2417, 130) binary 2
internet
internet_usage (10108, 71) (10108, 10653) multiclass 46
JapaneseVowels
JapaneseVowels (9961, 14) (9961, 15) multiclass 9
ipums
ipums_la_99-small (8



MagicTelescope
MagicTelescope (19020, 10) (19020, 11) binary 2
Kaggle
Kaggle_bike_sharing_demand_challange (10886, 10) (10886, 48) regression -6.266039190976448e-17 1.0000459337176697
cardiotocography
cardiotocography (2126, 35) (2126, 36) multiclass 10
eeg
eeg-eye-state (14980, 14) (14980, 15) binary 2
first
first-order-theorem-proving (6118, 51) (6118, 52) multiclass 6
gas
gas-drift (13910, 128) (13910, 129) multiclass 6
ringnorm
ringnorm (7400, 20) (7400, 21) binary 2
wall
wall-robot-navigation (5456, 24) (5456, 25) multiclass 4
twonorm
twonorm (7400, 20) (7400, 21) binary 2
autoUniv
autoUniv-au4-2500 (2500, 100) (2500, 227) multiclass 3
CreditCardSubset
CreditCardSubset (14240, 30) (14240, 31) binary 2
PhishingWebsites
PhishingWebsites (11055, 30) (11055, 69) binary 2
GesturePhaseSegmentationProcessed
GesturePhaseSegmentationProcessed (9873, 32) (9873, 33) multiclass 5
BachChoralHarmony
BachChoralHarmony (5665, 16) (5665, 105) multiclass 102
thyroid
thyroid-allbp (2800, 26) (2800, 

In [12]:
# Display the per-dataset summary table (catalogue columns plus name_file, type_prob, NFs).
res_basedata

Unnamed: 0,id_ds,name_ds,nrows,len_y,type_y,num_classes,target_name,name_file,type_prob,NFs
0,3,kr-vs-kp,3196,3196,category,2,class,kr.csv,binary,74
1,6,letter,20000,20000,category,26,class,letter.csv,multiclass,16
7,24,mushroom,8124,8124,category,2,class,mushroom.csv,binary,125
8,28,optdigits,5620,5620,category,10,class,optdigits.csv,multiclass,64
9,30,page-blocks,5473,5473,category,5,class,page.csv,multiclass,10
...,...,...,...,...,...,...,...,...,...,...
806,45062,shrutime,10000,10000,uint8,2,class,shrutime.csv,binary,28
807,45064,Insurance,23548,23548,uint8,2,class,Insurance.csv,binary,73
900,45536,Contaminant-detection-in-packaged-cocoa-hazeln...,2400,2400,uint8,2,class,Contaminant.csv,binary,30
905,45553,FICO-HELOC-cleaned,9871,9871,category,2,RiskPerformance,FICO.csv,binary,39
