In [47]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
import sys

# Put the parent directory on the module search path so that packages living
# next to this notebook's folder can be imported. The membership check keeps
# the cell idempotent: re-running it no longer piles up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [268]:
import json

import pandas as pd
from apricot import FeatureBasedSelection
from pmlb import fetch_data
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split

from src.utils import *

In [42]:
# Load the PMLB dataset catalogue.
pmlb_data = pd.read_csv("../metadata/Penn Machine Learning Benchmarks.csv")

# Remove the feynman datasets (119) and the fri_c datasets (60).
for family in ("feynman", "fri_c"):
    in_family = pmlb_data["Dataset"].str.contains(family)
    pmlb_data = pmlb_data[~in_family]

# Drop the Metadata column and renumber the remaining rows from 0.
pmlb_data = pmlb_data.drop(columns="Metadata").reset_index(drop=True)

# Relabel the endpoint of every two-class dataset as "binary".
two_class = pmlb_data["n_classes"] == 2
pmlb_data.loc[two_class, "Endpoint"] = "binary"

In [43]:
# Preview the filtered catalogue; the rendered table elides the middle rows.
pmlb_data

Unnamed: 0,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task
0,adult,48842,14,2.0,binary,0.27,classification
1,agaricus_lepiota,8145,22,2.0,binary,0.00,classification
2,allbp,3772,29,3.0,categorical,0.88,classification
3,allhyper,3771,29,4.0,categorical,0.93,classification
4,allhypo,3770,29,3.0,categorical,0.78,classification
...,...,...,...,...,...,...,...
231,strogatz_shearflow1,400,2,,continuous,0.00,regression
232,strogatz_shearflow2,400,2,,continuous,0.00,regression
233,strogatz_vdp1,400,2,,continuous,0.00,regression
234,strogatz_vdp2,400,2,,continuous,0.00,regression


In [44]:
# One counter column per feature type, initialised to 0 so that types a
# dataset does not contain stay at zero.
feature_count_columns = [
    "feat_n_continuous",
    "feat_n_categorical",
    "feat_n_nominal",
    "feat_n_binary",
    "feat_n_ordinal",
]
for column in feature_count_columns:
    pmlb_data[column] = 0

# Walk the frame by its real index labels (not by enumerate position) so the
# .loc writes below hit the right row even if the index is not 0..n-1.
for row_label, dset in pmlb_data["Dataset"].items():
    m = load_metadata(dset)
    feats = {}
    for f in m["features"]:
        column = f"feat_n_{f['type']}"
        feats[column] = feats.get(column, 0) + 1
    # Pass plain lists to .loc: dict views are not a documented indexer/value
    # type. Skip datasets whose metadata lists no features.
    if feats:
        pmlb_data.loc[row_label, list(feats)] = list(feats.values())

# Fold the nominal counts into categorical, then drop the nominal column.
pmlb_data["feat_n_categorical"] += pmlb_data["feat_n_nominal"]
pmlb_data = pmlb_data.drop(columns=["feat_n_nominal"])

In [273]:
# The 20 datasets with the most observations, in ascending order of size.
pmlb_data.sort_values(by="n_observations").iloc[-20:]

Unnamed: 0,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task,feat_n_continuous,feat_n_categorical,feat_n_binary,feat_n_ordinal
209,574_house_16H,22784,16,,continuous,0.02,regression,16,0,0,0
182,218_house_8L,22784,8,,continuous,0.02,regression,8,0,0,0
91,krkopt,28056,6,18.0,categorical,0.05,classification,0,6,0,0
169,1193_BNG_lowbwt,31104,9,,continuous,0.0,regression,2,3,4,0
181,215_2dplanes,40768,10,,continuous,0.0,regression,0,9,1,0
189,344_mv,40768,10,,continuous,0.0,regression,7,1,2,0
207,564_fried,40768,10,,continuous,0.0,regression,10,0,0,0
0,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
135,shuttle,58000,9,7.0,categorical,0.59,classification,9,0,0,0
51,connect_4,67557,42,3.0,categorical,0.25,classification,0,42,0,0


In [274]:
# Display the experiment results stored in ../results.json.
pd.read_json("../results.json")

Unnamed: 0,dataset,fraction,score,most_prevalent_cls,least_prevalent_cls,optimizer
0,adult,1.0,0.844403,0.760718,0.239282,
1,adult,0.001,0.793629,0.638889,0.361111,naive
2,adult,0.001,0.793629,0.638889,0.361111,lazy
3,adult,0.001,0.781918,0.805556,0.194444,stochastic
4,connect_4,1.0,0.758792,0.658303,0.09546,
5,connect_4,0.001,0.608822,0.62,0.08,naive
6,connect_4,0.001,0.594375,0.64,0.14,lazy
7,connect_4,0.001,0.643991,0.72,0.1,stochastic
8,fars,1.0,0.78405,0.417122,8.9e-05,
9,fars,0.001,0.481103,0.666667,0.026667,naive


In [249]:
# Dataset under study: its metadata and its raw table (cached under ../datasets).
dataset = "shuttle"
m = load_metadata(dataset)
X = fetch_data(dataset, local_cache_dir="../datasets")

# Sort the feature names into categorical and continuous by declared type.
# NOTE(review): features of any other type end up in neither list — confirm
# that this is intended for datasets that have them.
categorical_fs, continuous_fs = [], []
for feature in m["features"]:
    if feature["type"] == "categorical":
        categorical_fs.append(feature["name"])
    elif feature["type"] == "continuous":
        continuous_fs.append(feature["name"])


In [250]:
# OHE categorical features
X = one_hot_encode_df(X, columns=categorical_fs)
# Standardize continuous features
X.loc[:, continuous_fs] = normalize_df(X, columns=continuous_fs)

In [251]:
# Separate the label column from the feature table.
y = X.loc[:, "target"]
X = X.drop(columns=["target"])

In [252]:
# Quick look at the feature table after encoding, scaling and removing the target.
X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9
0,0.232323,0.489289,0.4375,0.507015,0.346154,0.638117,0.490196,0.64366,0.607717
1,0.282828,0.487167,0.554688,0.507015,0.301282,0.638737,0.54902,0.714286,0.662379
2,0.262626,0.487167,0.476562,0.507015,0.384615,0.637997,0.503268,0.614767,0.575563
3,0.10101,0.487167,0.429688,0.507015,0.346154,0.638546,0.575163,0.64366,0.585209
4,0.10101,0.487167,0.453125,0.507015,0.355769,0.637496,0.594771,0.640449,0.575563


In [253]:
# Hold out a test split as plain NumPy arrays. The fixed random_state makes
# the split, and therefore every score computed from it, reproducible
# across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(
    X.values, y.values, random_state=42
)

In [254]:
# Select 10% of the training rows with apricot's feature-based submodular
# selection. The sample count is computed with integer division so that
# n_samples is an int rather than the float produced by `len(...) * 0.1`.
n_selected = len(train_X) // 10
fb_select1 = FeatureBasedSelection(n_samples=n_selected, optimizer="naive", verbose=True)
fb_select1.fit(train_X)

100%|██████████| 4.35k/4.35k [00:19<00:00, 222it/s]


<apricot.functions.featureBased.FeatureBasedSelection at 0x1501e10a0>

In [230]:
# Row indices chosen by the selector, used to pull the matching rows out of
# both the training features and the training labels.
selected_rows = fb_select1.ranking
train_X_subset = train_X[selected_rows]
train_y_subset = train_y[selected_rows]

In [232]:
# Unfitted template estimator; change it here to compare a different model.
model = LogisticRegression(max_iter=1000)

# Each run gets its own clone of the template. Previously model_f and model_s
# were both aliases of `model`, so fitting on the subset silently overwrote
# the estimator that had been fitted on the full training split.

# Baseline: train on the full training split.
model_f = clone(model)
model_f.fit(train_X, train_y)
s_f = model_f.score(test_X, test_y)

# Comparison: train only on the selected subset, evaluate on the same test split.
model_s = clone(model)
model_s.fit(train_X_subset, train_y_subset)
s_s = model_s.score(test_X, test_y)

In [233]:
# (test score when trained on the full training split, test score when trained on the subset)
# NOTE(review): the stored output contains a negative value, which an accuracy
# score cannot be — the output is probably stale; re-run to refresh it.
s_f, s_s

(0.09147355099967025, -0.03792401286276714)

In [10]:
# Persist the enriched catalogue. `index` is a boolean option, so pass False
# explicitly (instead of None) to leave the row index out of the file.
pmlb_data.to_csv("../metadata/pmlb_data_processed.csv", index=False)