In [57]:
import yaml
import pandas as pd

In [58]:
with open('../configs/rumc_hyperopt.yaml', 'r') as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

In [59]:
config

{'dataset': {'name': 'rumc',
  'target': 'CORADS',
  'evaluation': {'n_folds': 5, 'metrics': ['corads_roc_auc']}},
 'preprocessing': {'imputation': {'categorical': 'CONSTANT',
   'numeric': 'MEAN'},
  'feature_selection': {'fs_type': 'SFM', 'num_features': 1100}},
 'model_parameters': {'boosting_type': {'distribution': 'CATEGORICAL',
   'values': ['dart', 'gbdt']},
  'n_estimators': {'distribution': 'RANDINT', 'vmin': 2, 'vmax': 60},
  'learning_rate': {'distribution': 'LOG_UNIFORM',
   'vmin': '1e-5',
   'vmax': '5e-1'},
  'max_depth': 4}}

In [81]:
from marshmallow import Schema, fields, validate
from dataclasses import field
from marshmallow_dataclass import dataclass, NewType
from typing import Any, Dict, Optional, List, Union
from enum import Enum
import sklearn
from sklearn import metrics

class DatasetTarget(Enum):
    PCR = 'pcr'
    CORADS = 'corads'
    DIAGNOSIS = 'diagnosis'

    @property
    def is_classification(self):
        if self == DatasetTarget.PCR or self == DatasetTarget.DIAGNOSIS:
            return True
        return False

class FeatureSelectionType(Enum):
    SFM = 'sfm'
    RFE = 'rfe'


class ImputationType(Enum):
    MEAN = 'mean'
    MEDIAN = 'median'
    CONSTANT = 'constant'

    def generate_imputer(self):
        from sklearn.impute import SimpleImputer
        if self == ImputationType.MEAN:
            return SimpleImputer(strategy='mean')
        elif self == ImputationType.MEDIAN:
            return SimpleImputer(strategy='median')
        elif self == ImputationType.CONSTANT:
            return SimpleImputer(strategy='constant', fill_value='missing')
        raise ValueError(f'Invalid config: {self.value}')


Metric = NewType('Metric', str, fields.String, required=True,
                 validate=validate.OneOf(['thresh_accuracy', 'corads_roc_auc'] + list(metrics.SCORERS.keys())))


@dataclass
class EvaluationConfig:
    n_folds: int = field(metadata={'validate': validate.Range(min=2)})
    metrics: List[Metric]


@dataclass
class DatasetConfig(Schema):
    name: str
    target: DatasetTarget
    evaluation: EvaluationConfig


@dataclass
class ImputationConfig(Schema):
    categorical: ImputationType
    numeric: ImputationType


@dataclass
class FeatureSelectionConfig(Schema):
    fs_type: FeatureSelectionType
    num_features: Optional[int]
    use_clinical_features: bool = field(default=True)
    use_visual_features: bool = field(default=True)

    def generate_selector(self):
        from lightgbm import LGBMClassifier
        if self.fs_type == FeatureSelectionType.RFE:
            from sklearn.feature_selection import RFE
            return RFE(LGBMClassifier(n_jobs=-1), n_features_to_select=self.num_features)
        elif self.fs_type == FeatureSelectionType.SFM:
            from sklearn.feature_selection import SelectFromModel
            return SelectFromModel(LGBMClassifier(n_jobs=-1), max_features=self.num_features)
        raise ValueError(f'Invalid config: {self.fs_type}')


@dataclass
class PreprocessingConfig(Schema):
    imputation: ImputationConfig
    feature_selection: FeatureSelectionConfig

Categorical = NewType('Categorical', str, fields.String, required=True, validate=validate.Equal('CATEGORICAL'))
        
@dataclass
class CategoricalDistribution:
    distribution: Categorical
    values: Union[List[str], List[int], List[float]]
        
RandInt = NewType('Categorical', str, fields.String, required=True, validate=validate.Equal('RANDINT'))
        
@dataclass
class RandIntDistribution:
    distribution: RandInt
    vmin: int
    vmax: int
        
LogUniform = NewType('Categorical', str, fields.String, required=True, validate=validate.Equal('LOG_UNIFORM'))
        
@dataclass
class LogUniformDistribution:
    distribution: LogUniform
    vmin: float
    vmax: float

@dataclass
class Config:
    dataset: DatasetConfig
    preprocessing: PreprocessingConfig
    model_parameters: Dict[str, Union[str, int, float, CategoricalDistribution, RandIntDistribution, LogUniformDistribution]]


ConfigSchema = Config.Schema()

In [82]:
config_parsed = Config.Schema().load(config)

In [83]:
config

{'dataset': {'name': 'rumc',
  'target': 'CORADS',
  'evaluation': {'n_folds': 5, 'metrics': ['corads_roc_auc']}},
 'preprocessing': {'imputation': {'categorical': 'CONSTANT',
   'numeric': 'MEAN'},
  'feature_selection': {'fs_type': 'SFM', 'num_features': 1100}},
 'model_parameters': {'boosting_type': {'distribution': 'CATEGORICAL',
   'values': ['dart', 'gbdt']},
  'n_estimators': {'distribution': 'RANDINT', 'vmin': 2, 'vmax': 60},
  'learning_rate': {'distribution': 'LOG_UNIFORM',
   'vmin': '1e-5',
   'vmax': '5e-1'},
  'max_depth': 4}}

In [84]:
config_parsed

Config(dataset=DatasetConfig(name='rumc', target=<DatasetTarget.CORADS: 'corads'>, evaluation=EvaluationConfig(n_folds=5, metrics=['corads_roc_auc'])), preprocessing=PreprocessingConfig(imputation=ImputationConfig(categorical=<ImputationType.CONSTANT: 'constant'>, numeric=<ImputationType.MEAN: 'mean'>), feature_selection=FeatureSelectionConfig(fs_type=<FeatureSelectionType.SFM: 'sfm'>, num_features=1100, use_clinical_features=True, use_visual_features=True)), model_parameters={'boosting_type': CategoricalDistribution(distribution='CATEGORICAL', values=['dart', 'gbdt']), 'n_estimators': RandIntDistribution(distribution='RANDINT', vmin=2, vmax=60), 'learning_rate': LogUniformDistribution(distribution='LOG_UNIFORM', vmin=1e-05, vmax=0.5), 'max_depth': 4})

In [89]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.impute import SimpleImputer
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
import os
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, get_scorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
import coolname
import numpy as np
import random

from scipy.stats import randint
from sklearn.utils.fixes import loguniform

from typing import Tuple

def config_params_to_search_space(config: Config):
    search_space = {}
    for k, v in config.model_parameters.items():
        value_space = None
        if isinstance(v, str) or isinstance(v, float) or isinstance(v, int):
            value_space = [v]
        elif isinstance(v, CategoricalDistribution):
            value_space = v.values
        elif isinstance(v, RandIntDistribution):
            value_space = randint(v.vmin, v.vmax+1)
        elif isinstance(v, LogUniformDistribution):
            value_space = loguniform(v.vmin, v.vmax)
        
        search_space.update({f'classifier__{k}': value_space})
    return search_space
        

def generate_pipeline_search(X: pd.DataFrame, config: Config):
    categorical_columns = list(X.select_dtypes(include=['category']).columns)
    numerical_columns = list(X.select_dtypes(exclude=['category']).columns)

    cateogrical_preprocessor = make_pipeline(
        config.preprocessing.imputation.categorical.generate_imputer(),
        OrdinalEncoder(categories=[list(X[col].cat.categories) for col in categorical_columns])
    )
    numerical_preprocessor = config.preprocessing.imputation.numeric.generate_imputer()

    imputation = ColumnTransformer([
        ('categorical', cateogrical_preprocessor, categorical_columns),
        ('numerical', numerical_preprocessor, numerical_columns)
    ])

    clf = Pipeline([
        ('imputation', imputation),
        ('feature_selection', config.preprocessing.feature_selection.generate_selector()),
        ('classifier', LGBMClassifier(n_jobs=-1) if config.dataset.target.is_classification else LGBMRegressor(
            n_jobs=-1))
    ])
    
    search = RandomizedSearchCV(clf, config_params_to_search_space(config), n_iter=4, n_jobs=-1)

    return search


def load_data(config: Config) -> Tuple[pd.DataFrame, pd.Series]:
    if config.dataset.name == 'ictcf':
        return load_ictcf()
    elif config.dataset.name == 'rumc':
        X, y = load_rumc(config.dataset.target)
        not_nan = (y != 'nan') & ~y.isna()
        return X[not_nan], y[not_nan]


def thresh_accuracy(y, y_pred):
    y_thresh = y >= 2
    y_pred_thresh = y_pred >= 2
    return accuracy_score(y_thresh, y_pred_thresh)


def corads_roc_auc(y, y_pred):
    y_thresh = np.round(y / 5)
    y_pred_thresh = y_pred / 5
    return roc_auc_score(y_thresh, y_pred_thresh)

@dataclass
class Features:
    categorical_features: List[str]
    numeric_features: List[str]
    output_feature: str
    visual_features: List[str] = field(default_factory=lambda: [])

    @property
    def input_features(self) -> List[str]:
        return self.categorical_features + self.numeric_features + self.visual_features


def load_rumc(target: DatasetTarget) -> Tuple[pd.DataFrame, pd.Series]:
    df = pd.read_pickle('../data/processed/rumc.pkl')
    features = get_rumc_features(df, target)
    return df[features.input_features], df[features.output_feature]

def get_rumc_features(df: pd.DataFrame, target: DatasetTarget) -> Features:
    non_visual_features = list(df.columns[~df.columns.isin(['pcr', 'corads', 'diagnosis', 'part']) & ~df.columns.str.startswith('vis_')])
    visual_features = list(df.columns[df.columns.str.startswith('vis_')])
    output_feature = {
        DatasetTarget.PCR: 'pcr',
        DatasetTarget.CORADS: 'corads',
        DatasetTarget.DIAGNOSIS: 'diagnosis'
    }[target]
    numerical_features = list(df[non_visual_features].select_dtypes(exclude='category').columns)
    categorical_features = list(df[non_visual_features].select_dtypes(include='category').columns)

    return Features(categorical_features, numerical_features, output_feature, visual_features)



def evaluate(config: Config):
    print('Loading data...')
    X, y = load_data(config)

    print('Generating pipeline...')
    clf = generate_pipeline_search(X, config)

    print('Evaluating pipeline on data...')
    categorical_columns = list(X.select_dtypes(include=['category']).columns)
    custom_metrics = {
        'thresh_accuracy': make_scorer(thresh_accuracy),
        'corads_roc_auc': make_scorer(corads_roc_auc)
    }
    metrics = {metric: (custom_metrics[metric] if metric in custom_metrics else get_scorer(metric)) for metric in
               config.dataset.evaluation.metrics}
    results = cross_validate(clf, X, y, scoring=metrics,
                             cv=config.dataset.evaluation.n_folds)

    model_name = coolname.generate_slug(random.choice([2, 2, 2, 2, 2, 2, 3]))

    print(f'Saving results (model name: "{model_name}")...')
    df_results = pd.DataFrame(results)
    df_results.columns = 'value_' + df_results.columns
    df_results['fold'] = df_results.index
    df_results = pd.wide_to_long(df_results, stubnames=['value'], sep='_', suffix='.+', i='fold', j='metric')
    df_results = df_results.reset_index()

    df_results['model'] = model_name
    df_results['timestamp'] = pd.Timestamp.now()

    config_dict = ConfigSchema.dump(config)
    df_config = pd.json_normalize(config_dict)
    df_config.columns = 'config.' + df_config.columns
    df_full = pd.concat([df_results, df_config.loc[[0] * len(df_results)].reset_index()], axis=1)
    return df_full
#     df_full.to_pickle(os.path.join(DATA_RESULTS_PATH, f'{model_name}.pkl'))


In [90]:
evaluate(config_parsed)

Loading data...
Generating pipeline...
Evaluating pipeline on data...
Saving results (model name: "mature-ringtail")...


Unnamed: 0,fold,metric,value,model,timestamp,index,config.preprocessing.imputation.categorical,config.preprocessing.imputation.numeric,config.preprocessing.feature_selection.use_clinical_features,config.preprocessing.feature_selection.fs_type,...,config.dataset.target,config.model_parameters.boosting_type.values,config.model_parameters.boosting_type.distribution,config.model_parameters.n_estimators.vmax,config.model_parameters.n_estimators.distribution,config.model_parameters.n_estimators.vmin,config.model_parameters.learning_rate.vmax,config.model_parameters.learning_rate.distribution,config.model_parameters.learning_rate.vmin,config.model_parameters.max_depth
0,0,fit_time,16.792036,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
1,1,fit_time,11.775002,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
2,2,fit_time,11.854,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
3,3,fit_time,12.383067,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
4,4,fit_time,13.418703,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
5,0,score_time,0.151999,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
6,1,score_time,0.145999,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
7,2,score_time,0.151002,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
8,3,score_time,0.169,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4
9,4,score_time,0.149,mature-ringtail,2021-04-05 22:26:35.258688,0,CONSTANT,MEAN,True,SFM,...,CORADS,"[dart, gbdt]",CATEGORICAL,60,RANDINT,2,0.5,LOG_UNIFORM,1e-05,4


In [93]:
print('Loading data...')
X, y = load_data(config_parsed)

print('Generating pipeline...')
clf = generate_pipeline_search(X, config_parsed)

print('Evaluating pipeline on data...')
categorical_columns = list(X.select_dtypes(include=['category']).columns)
custom_metrics = {
    'thresh_accuracy': make_scorer(thresh_accuracy),
    'corads_roc_auc': make_scorer(corads_roc_auc)
}
metrics = {metric: (custom_metrics[metric] if metric in custom_metrics else get_scorer(metric)) for metric in
           config_parsed.dataset.evaluation.metrics}
results = cross_validate(clf, X, y, scoring=metrics,
                         cv=config_parsed.dataset.evaluation.n_folds, return_estimator=True)

Loading data...
Generating pipeline...
Evaluating pipeline on data...


In [97]:
est = results['estimator'][0]

In [100]:
pd.DataFrame(est.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__boosting_type,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.207401,0.582323,0.278199,0.059788,dart,0.009518,4,15,"{'classifier__boosting_type': 'dart', 'classif...",-0.037763,0.115218,0.156725,0.056016,-0.52796,-0.047553,0.248933,4
1,4.5332,0.256253,0.2924,0.058253,gbdt,0.047156,4,33,"{'classifier__boosting_type': 'gbdt', 'classif...",0.740112,0.737782,0.749739,0.59869,0.219309,0.609126,0.202755,1
2,4.239999,0.403432,0.2262,0.018818,gbdt,0.008924,4,46,"{'classifier__boosting_type': 'gbdt', 'classif...",0.317763,0.397152,0.422385,0.313964,-0.156656,0.258922,0.212144,3
3,3.7876,0.074637,0.169999,0.027864,gbdt,0.05517,4,12,"{'classifier__boosting_type': 'gbdt', 'classif...",0.509742,0.553279,0.564098,0.443911,0.039099,0.422026,0.19607,2


In [30]:
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [180]:
from sklearn.model_selection import cross_validate
results = cross_validate(clf, X, y, scoring=config_parsed.dataset.evaluation.metrics, cv=config_parsed.dataset.evaluation.n_folds)

In [194]:
df_results = pd.DataFrame(results)
df_results.columns = 'value_' + df_results.columns
df_results['fold'] = df_results.index
df_results = pd.wide_to_long(df_results, stubnames=['value'], sep='_', suffix='.+', i='fold', j='metric')
df_results = df_results.reset_index()

In [264]:
import coolname
import random
model_name = coolname.generate_slug(random.choice([2, 2, 2, 2, 2, 2, 3]))

In [265]:
df_results['model'] = model_name
df_results['timestamp'] = pd.Timestamp.now()
df_results

Unnamed: 0,fold,metric,value,model,timestamp
0,0,fit_time,1.108999,quantum-crayfish,2021-04-02 13:59:09.187006
1,1,fit_time,1.058001,quantum-crayfish,2021-04-02 13:59:09.187006
2,2,fit_time,1.035001,quantum-crayfish,2021-04-02 13:59:09.187006
3,3,fit_time,1.151002,quantum-crayfish,2021-04-02 13:59:09.187006
4,4,fit_time,0.992002,quantum-crayfish,2021-04-02 13:59:09.187006
5,0,score_time,0.007998,quantum-crayfish,2021-04-02 13:59:09.187006
6,1,score_time,0.008,quantum-crayfish,2021-04-02 13:59:09.187006
7,2,score_time,0.009,quantum-crayfish,2021-04-02 13:59:09.187006
8,3,score_time,0.006998,quantum-crayfish,2021-04-02 13:59:09.187006
9,4,score_time,0.007998,quantum-crayfish,2021-04-02 13:59:09.187006


In [282]:
df_config = pd.json_normalize(config)
df_config.columns = 'config.' + df_config.columns
df_full = pd.concat([df_results, df_config.loc[[0]*len(df_results)].reset_index()], axis=1)
df_full

Unnamed: 0,fold,metric,value,model,timestamp,index,config.dataset.name,config.dataset.target,config.dataset.evaluation.n_folds,config.dataset.evaluation.metrics,config.preprocessing.imputation.categorical,config.preprocessing.imputation.numeric,config.preprocessing.feature_selection.fs_type,config.model_parameters.boosting_type,config.model_parameters.n_estimators,config.model_parameters.learning_rate
0,0,fit_time,1.108999,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
1,1,fit_time,1.058001,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
2,2,fit_time,1.035001,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
3,3,fit_time,1.151002,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
4,4,fit_time,0.992002,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
5,0,score_time,0.007998,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
6,1,score_time,0.008,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
7,2,score_time,0.009,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
8,3,score_time,0.006998,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01
9,4,score_time,0.007998,quantum-crayfish,2021-04-02 13:59:09.187006,0,ictcf,PCR,5,"[roc_auc, accuracy, f1_weighted]",CONSTANT,MEAN,RFE,dart,40,0.01


In [283]:
df_full.to_pickle(f'../data/results/{model_name}.pkl')

In [285]:
schema = Config.Schema()

In [286]:
schema.dump(config_parsed)

{'dataset': {'target': 'PCR',
  'name': 'ictcf',
  'evaluation': {'metrics': ['roc_auc', 'accuracy', 'f1_weighted'],
   'n_folds': 5}},
 'model_parameters': {'boosting_type': 'dart',
  'n_estimators': 40,
  'learning_rate': 0.01},
 'preprocessing': {'imputation': {'numeric': 'MEAN',
   'categorical': 'CONSTANT'},
  'feature_selection': {'fs_type': 'RFE', 'num_features': None}}}

In [232]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [138]:
!pip install coolname

Collecting coolname
  Downloading coolname-1.1.0-py2.py3-none-any.whl (35 kB)
Installing collected packages: coolname
Successfully installed coolname-1.1.0


In [130]:
!pip install "marshmallow-dataclass[enum,union]"



In [None]:
class DatasetConfig(Schema):
    name = fields.String()
    target = fields.String(validate=validate.OneOf(['pcr', 'corads']))
    

class ImputationConfig(Schema):
    categorical = fields.String(required=True)
    numeric = fields.String(required=True)

class FeatureSelectionConfig(Schema):
    fs_type = fields.String(validate=validate.OneOf(['rfe', 'sfm']))
    num_features = fields.Int(strict=True, validate=validate.Range(min=1))
    
class PreprocessingConfig(Schema):
    imputation = fields.Nested(ImputationConfig)
    feature_selection = fields.Nested(FeatureSelectionConfig)

class Config(Schema):
    dataset = fields.Nested(DatasetConfig)
    preprocessing = fields.Nested(PreprocessingConfig)
    model_parameters = fields.Dict()

In [46]:
schema = Config()
config_dumped = schema.load(config)

TypeError: __init__() got an unexpected keyword argument 'dataset'

In [49]:
from marshmallow import Schema, fields, post_load


class UserSchema(Schema):
    name = fields.Str()
    email = fields.Email()
    created_at = fields.DateTime()

    @post_load
    def make_user(self, data, **kwargs):
        return User(**data)
    
user_data = {"name": "Ronnie", "email": "ronnie@stones.com"}
schema = UserSchema()
result = schema.load(user_data)
print(result)  # => <User(name='Ronnie')>

NameError: name 'User' is not defined

In [4]:
!pip install marshmallow

Collecting marshmallow
  Downloading marshmallow-3.11.1-py2.py3-none-any.whl (46 kB)
Installing collected packages: marshmallow
Successfully installed marshmallow-3.11.1
