In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append('..')

In [4]:
import json

import pandas as pd
from apricot import FeatureBasedSelection
from pmlb import fetch_data
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split

from src.utils import *

In [11]:
# Let pandas print up to 100 rows before it truncates DataFrame output.
pd.set_option("display.max_rows", 100)

In [5]:
# Load the PMLB overview table.
pmlb_raw = pd.read_csv("../metadata/Penn Machine Learning Benchmarks.csv")

# Exclude the feynman (119) and fri_c (60) dataset families in one pass.
is_excluded = pmlb_raw["Dataset"].str.contains("feynman|fri_c")

# Keep the remaining rows, discard the Metadata column and renumber the index.
pmlb_data = pmlb_raw[~is_excluded].drop(columns="Metadata").reset_index(drop=True)

# Two-class targets are relabelled from "categorical" to "binary".
is_two_class = pmlb_data["n_classes"] == 2
pmlb_data.loc[is_two_class, "Endpoint"] = "binary"

In [6]:
# Inspect the filtered benchmark table (pandas truncates the middle rows in the display).
pmlb_data

Unnamed: 0,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task
0,adult,48842,14,2.0,binary,0.27,classification
1,agaricus_lepiota,8145,22,2.0,binary,0.00,classification
2,allbp,3772,29,3.0,categorical,0.88,classification
3,allhyper,3771,29,4.0,categorical,0.93,classification
4,allhypo,3770,29,3.0,categorical,0.78,classification
...,...,...,...,...,...,...,...
231,strogatz_shearflow1,400,2,,continuous,0.00,regression
232,strogatz_shearflow2,400,2,,continuous,0.00,regression
233,strogatz_vdp1,400,2,,continuous,0.00,regression
234,strogatz_vdp2,400,2,,continuous,0.00,regression


In [7]:
# Count, for every dataset, how many features of each type its metadata declares.
# The counts are collected into a separate frame that shares pmlb_data's index, so the
# result does not depend on the index being exactly 0..n-1 (the previous version used
# the enumerate() position as a .loc label).
feat_count_columns = [
    "feat_n_continuous",
    "feat_n_categorical",
    "feat_n_nominal",
    "feat_n_binary",
    "feat_n_ordinal",
]

feat_count_rows = []
for dset in pmlb_data["Dataset"]:
    meta = load_metadata(dset)
    # Start every known type at 0 so the column order is stable and no cell is missing.
    counts = dict.fromkeys(feat_count_columns, 0)
    for feat in meta["features"]:
        col = f"feat_n_{feat['type']}"
        counts[col] = counts.get(col, 0) + 1
    feat_count_rows.append(counts)

# A type seen in only some datasets would leave NaN in the other rows; treat that as 0.
feat_counts = (
    pd.DataFrame(feat_count_rows, index=pmlb_data.index).fillna(0).astype(int)
)
for col in feat_counts.columns:
    pmlb_data[col] = feat_counts[col]

# Fold the nominal counts into categorical, then drop the now-redundant nominal column.
pmlb_data["feat_n_categorical"] += pmlb_data["feat_n_nominal"]
pmlb_data = pmlb_data.drop(columns="feat_n_nominal")

In [18]:
# Classification datasets with more than 10k and fewer than 500k observations.
n_obs = pmlb_data["n_observations"]
is_mid_sized = (n_obs > 10_000) & (n_obs < 500_000)
is_classification = pmlb_data["Task"] == "classification"
pmlb_data[is_mid_sized & is_classification]

Unnamed: 0,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task,feat_n_continuous,feat_n_categorical,feat_n_binary,feat_n_ordinal
0,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
51,connect_4,67557,42,3.0,categorical,0.25,classification,0,42,0,0
62,fars,100968,29,8.0,categorical,0.16,classification,14,15,0,0
89,kddcup,494020,41,23.0,categorical,0.38,classification,28,9,4,0
91,krkopt,28056,6,18.0,categorical,0.05,classification,0,6,0,0
95,letter,20000,16,26.0,categorical,0.0,classification,16,0,0,0
98,magic,19020,10,2.0,binary,0.09,classification,10,0,0,0
105,mnist,70000,784,10.0,categorical,0.0,classification,654,111,19,0
115,nursery,12958,8,4.0,categorical,0.09,classification,0,7,1,0
120,pendigits,10992,16,10.0,categorical,0.0,classification,16,0,0,0


In [19]:
# Load the stored experiment results from ../results/results.json into a DataFrame.
result_df = pd.read_json("../results/results.json")

In [28]:
# Attach the dataset-level metadata to every result row (inner join on the dataset name).
result_df.merge(
    right=pmlb_data,
    how="inner",
    left_on="dataset",
    right_on="Dataset",
)

Unnamed: 0,dataset,fraction,score,most_prevalent_cls,least_prevalent_cls,function,optimizer,Dataset,n_observations,n_features,n_classes,Endpoint,Imbalance,Task,feat_n_continuous,feat_n_categorical,feat_n_binary,feat_n_ordinal
0,adult,1.0,0.800262,0.760718,0.239282,,,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
1,adult,0.001,0.259029,0.583333,0.416667,featurebased,naive,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
2,adult,0.001,0.721235,0.722222,0.277778,facilitylocation,naive,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
3,adult,0.0025,0.785767,0.516484,0.483516,featurebased,naive,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
4,adult,0.0025,0.799116,0.67033,0.32967,facilitylocation,naive,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
5,adult,0.001,0.259029,0.583333,0.416667,featurebased,lazy,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
6,adult,0.001,0.721235,0.722222,0.277778,facilitylocation,lazy,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
7,adult,0.0025,0.785767,0.516484,0.483516,featurebased,lazy,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
8,adult,0.0025,0.799116,0.67033,0.32967,facilitylocation,lazy,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1
9,adult,0.001,0.382524,0.638889,0.361111,featurebased,stochastic,adult,48842,14,2.0,binary,0.27,classification,6,6,1,1


In [29]:
# adult    : some continuous, some categorical --> big difference
# connect_4: all categorical                   --> small difference
# krkopt   : all categorical                   --> small difference
# letter   : all continuous                    --> pretty big difference
# nursery  : all categorical                   --> not that big of a difference
# pendigits: all continuous                    --> pretty big difference
# shuttle  : all continuous                    --> pretty big difference
# sleep    : all continuous                    --> not that big of a difference

In [None]:
# time it takes to create the similarity matrix vs time it takes to execute the facility location
# what types of preprocessing work well to get the feature based function to work? for people who are new to submodular optimization
# run for all datasets, look at the biggest difference between feature based and facility location and pick these to try out different preprocessing steps (e.g. GMMs)
#     - mix of features
#     - pixels
#     - tabular data where feature distribution is meaningful

# take a random subset with the same size as the apricot selection

# 1. two toy data sets: images (mnist, fashion mnist), other type of data
# old CV ways of feature extraction, use pretrained network to help (we can't guarantee that their output is non-negative)
# 2. apply ideas to large corpus of datasets
# 3. look at different optimizers

# Stratified K-fold: split dataset by classes, run SO on each class 

In [249]:
# Pick the dataset to experiment with and load its metadata and data frame.
dataset = "shuttle"
m = load_metadata(dataset)
X = fetch_data(dataset, local_cache_dir="../datasets")

# Collect the feature names by declared type, in metadata order.
categorical_fs = []
continuous_fs = []
for feature in m["features"]:
    if feature["type"] == "categorical":
        categorical_fs.append(feature["name"])
    elif feature["type"] == "continuous":
        continuous_fs.append(feature["name"])


In [250]:
# One-hot encode the categorical features
X = one_hot_encode_df(X, columns=categorical_fs)
# Rescale the continuous features with normalize_df
# NOTE(review): the X.head() output further down shows values within [0, 1], which looks
# like min-max scaling rather than standardization — confirm against src.utils.normalize_df.
X.loc[:, continuous_fs] = normalize_df(X, columns=continuous_fs)

In [251]:
# Separate the label column from the feature columns.
# Both right-hand sides are evaluated before either name is rebound.
y, X = X["target"], X.drop(columns="target")

In [252]:
# Preview the first rows of the preprocessed feature frame (target column already removed).
X.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9
0,0.232323,0.489289,0.4375,0.507015,0.346154,0.638117,0.490196,0.64366,0.607717
1,0.282828,0.487167,0.554688,0.507015,0.301282,0.638737,0.54902,0.714286,0.662379
2,0.262626,0.487167,0.476562,0.507015,0.384615,0.637997,0.503268,0.614767,0.575563
3,0.10101,0.487167,0.429688,0.507015,0.346154,0.638546,0.575163,0.64366,0.585209
4,0.10101,0.487167,0.453125,0.507015,0.355769,0.637496,0.594771,0.640449,0.575563


In [253]:
# Hold out a test set. The seed is fixed so the split, and every score derived from it,
# is identical across kernel restarts; without it each run compares different rows.
train_X, test_X, train_y, test_y = train_test_split(X.values, y.values, random_state=42)

In [254]:
# Select 10% of the training rows with the feature-based submodular function.
# The budget is converted to an int explicitly: len(...) * 0.1 is a float whose rounding
# should not decide how many rows get selected.
n_selected = int(len(train_X) * 0.1)
fb_select1 = FeatureBasedSelection(n_samples=n_selected, optimizer="naive", verbose=True)
fb_select1.fit(train_X)

100%|██████████| 4.35k/4.35k [00:19<00:00, 222it/s]


<apricot.functions.featureBased.FeatureBasedSelection at 0x1501e10a0>

In [230]:
# Restrict the training data to the rows chosen by the selector, in ranking order.
selected_rows = fb_select1.ranking
train_X_subset, train_y_subset = train_X[selected_rows], train_y[selected_rows]

In [232]:
# Compare a model trained on the full training set with one trained on the selected subset.
# Each fit gets its own estimator instance: previously model_f and model_s were two names
# for the same object, so fitting on the subset silently overwrote the "full data" model.
# (Use LinearRegression() in both places instead when the target is continuous.)
model_params = {"max_iter": 1000}

# Baseline: all training rows.
model_f = LogisticRegression(**model_params)
model_f.fit(train_X, train_y)
s_f = model_f.score(test_X, test_y)

# Candidate: only the rows picked by the submodular selection.
model_s = LogisticRegression(**model_params)
model_s.fit(train_X_subset, train_y_subset)
s_s = model_s.score(test_X, test_y)

In [233]:
# Test-set scores: s_f from the fit on the full training set, s_s from the fit on the subset.
# NOTE(review): the stored output below contains a negative value, which an accuracy score
# cannot produce — it looks stale (possibly from a LinearRegression run); re-run to confirm.
s_f, s_s

(0.09147355099967025, -0.03792401286276714)

In [10]:
# Persist the enriched metadata table without the row index.
# `index` is documented as a bool, so pass False rather than relying on None being falsy.
pmlb_data.to_csv("../metadata/pmlb_data_processed.csv", index=False)