In [1]:
import pandas as pd
import numpy as np

# get data metafeatures
metafeatures = pd.read_csv('../metafeatures/data_metafeatures.csv',sep=',',index_col=0)
print('loaded ', metafeatures.shape[1]-1, ' metafeatures for ', metafeatures.shape[0], ' datasets')
# get ML results
data = pd.read_csv('sklearn-benchmark5-data.tsv.gz', sep='\t', names=['dataset',
                                                                     'classifier',
                                                                     'parameters',
                                                                     'accuracy', 
                                                                     'macrof1',
                                                                     'bal_accuracy']).fillna('')

data['accuracy'] = data['accuracy'].apply(lambda x: round(x, 3))
print('loaded ',data['dataset'].unique().shape[0],'datasets and ', data['classifier'].unique().shape[0],'classifiers')
# subset data to classifiers used in PennAI
pennai_classifiers = ['LogisticRegression', 'RandomForestClassifier', 'SVC', 
                      'KNeighborsClassifier', 'DecisionTreeClassifier', 'GradientBoostingClassifier']
mask = np.array([c in pennai_classifiers for c in data['classifier'].values])
data = data.loc[mask,:]
print('classifiers:',data['classifier'].unique())

loaded  44  metafeatures for  166  datasets
loaded  166 datasets and  14 classifiers
classifiers: ['RandomForestClassifier' 'SVC' 'KNeighborsClassifier'
 'GradientBoostingClassifier' 'LogisticRegression' 'DecisionTreeClassifier']


# can we predict which learner will be best on a dataset from dataset properties?

In [None]:
# get best classifier for each dataset
from tqdm import tqdm 
best_method = dict()
for i,(dataset, group_data) in enumerate(tqdm(data.groupby('dataset'))):
    best_method[dataset] = group_data['classifier'][np.argmax(group_data['accuracy'])]

# print(best_method)

# make new dataset combining metafeatures and best methods
y = np.empty(metafeatures.shape[0])
methods = data['classifier'].unique()
print('methods:',methods)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# le.fit(methods)

print('metafeatures[''dataset''].shape:',metafeatures['dataset'].shape)
y_str = [best_method[ds] for ds in metafeatures['dataset'].values]

y = le.fit_transform(y_str)

metaf = metafeatures.dropna(axis=1,how='all')
metaf.fillna(value=0,axis=1,inplace=True)
print(metafeatures.shape[1]-metaf.shape[1],' features dropped due to missing values')
# print(metaf[:10])
from sklearn.preprocessing import StandardScaler, Normalizer

X = Normalizer().fit_transform(metaf.drop('dataset',axis=1).values)
print('X shape:',X.shape)
print('y shape:',y.shape)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, LeaveOneOut
from sklearn.model_selection import LeaveOneOut
# X_t,X_v,y_t,y_v = train_test_split(X,y)

# dtc = DecisionTreeClassifier()
dtc = RandomForestClassifier(n_estimators=1000)
# dtc = KNeighborsClassifier(n_neighbors=1)
# dtc.fit(X_t,y_t)
cv = StratifiedShuffleSplit(n_splits=50,test_size=0.1)
print('fitting model...')
# print('mean CV score:',np.mean(cross_val_score(dtc,X,y,cv=LeaveOneOut())))
print('mean CV score:',np.mean(cross_val_score(dtc,X,y,cv=cv)))

100%|██████████| 166/166 [00:00<00:00, 1109.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


methods: ['RandomForestClassifier' 'SVC' 'KNeighborsClassifier'
 'GradientBoostingClassifier' 'LogisticRegression' 'DecisionTreeClassifier']
metafeatures[dataset].shape: (166,)
9  features dropped due to missing values
X shape: (166, 35)
y shape: (166,)
fitting model...


# can we predict the best score achievable on a dataset from dataset properties?

In [14]:
# get best classifier for each dataset
from tqdm import tqdm 
best_score = dict()


# print(best_method)

# make new dataset combining metafeatures and best methods
y = np.empty(metafeatures.shape[0])
for i,(dataset, group_data) in enumerate(tqdm(data.groupby('dataset'))):
   y[i] = group_data['bal_accuracy'].max()

print('metafeatures[''dataset''].shape:',metafeatures['dataset'].shape)


metaf = metafeatures.dropna(axis=1,how='all')
metaf.fillna(value=0,axis=1,inplace=True)

print(metafeatures.shape[1]-metaf.shape[1],' features dropped due to missing values')
# print(metaf[:10])

from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(metaf.drop('dataset',axis=1).values)
print('X shape:',X.shape)
print('y shape:',y.shape)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import cross_val_score

# X_t,X_v,y_t,y_v = train_test_split(X,y)

# est = DecisionTreeClassifier()
est = RandomForestRegressor(n_estimators=100)
# est = LassoLarsCV()
# dtc.fit(X_t,y_t)
print('fitting model...')
print('mean CV score:',np.mean(cross_val_score(est,X,y,cv=5)))

100%|██████████| 166/166 [00:00<00:00, 366.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


metafeatures[dataset].shape: (166,)
9  features dropped due to missing values
X shape: (166, 35)
y shape: (166,)
fitting model...
mean CV score: -0.194007105864


