In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
# Select the 'holoviews' plotting backend for pandas `.plot` calls in this notebook.
pd.options.plotting.backend = 'holoviews'

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import prepare_data
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target
from plots import scatter_chart
from geo import get_wwtp_influence

## Data preparation

In [2]:
%%capture
# cell magic to supress output. Comment it out to see output of this cell.

# What happened so far: DB extract and blank procedure. Now import resulting MP data from csv
mp_pdd = prepare_data.get_pdd()

# Also import sediment data (sediment frequencies per size bin from master sizer export)
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow = PCOA(grainsize_iow, 2)[0]
scor_cau = PCOA(grainsize_cau, 2)[0]

# ...some data wrangling to prepare particle domain data and sample domain data for MP and combine with certain sediment aggregates.
sdd_iow = prepare_data.aggregate_SDD(mp_pdd)
sdd_iow = prepare_data.additional_sdd_merging(sdd_iow, how='outer')
sdd_iow = sdd_iow.merge(scor_iow, right_index=True, left_on='Sample', how='outer')
sdd_iow = sdd_iow.replace({'Sample': shortnames}).sort_values(by='Sample')

sdd_cau = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0).join(pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0), how='outer')
sdd_cau = sdd_cau.merge(scor_cau, right_index=True, left_on='Sample', how='outer').reset_index()

## Additional variable generation (e.g. predictor derivatives)

# Earlier variants of this derived predictor, kept for reference.
# Note that all three use a cubic exponent, not a square:
# sdd_iow['Dist_WWTP_revsq'] = ((1/sdd_iow['Dist_WWTP'])**3)*10000000000  # scaled cube of the reciprocal distance
# sdd_iow['Dist_WWTP_revsq'] = (((sdd_iow['Dist_WWTP'].max()-sdd_iow['Dist_WWTP'])+1)**3)/100000000000  # scaled cube of (max distance - distance + 1)
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**3)/100  # scaled cube of (max distance / distance)

# Active definition: square of (largest WWTP distance / sample's WWTP distance).
# The sample at the maximum distance gets 1; samples closer to the WWTP get larger values.
max_dist_wwtp = sdd_iow['Dist_WWTP'].max()
sdd_iow['Dist_WWTP_revsq'] = (max_dist_wwtp / sdd_iow['Dist_WWTP']) ** 2

## Split data into samples used for building the model and samples used for predicting.

# Rows with a Concentration value are used to build the model; rows without one can
# only be predicted.
has_concentration = sdd_iow.Concentration.notna()
model_data = sdd_iow.loc[has_concentration].set_index('Sample')

# Prediction set: the IOW rows lacking a Concentration plus all CAU rows
# (without their 'Date' column), indexed by sample name.
pred_data = pd.concat([
    sdd_iow.loc[~has_concentration],
    sdd_cau.drop(columns='Date'),
]).set_index('Sample')

## Potential outlier exclusion
#model_data = model_data.drop(['S08','S10d','S05','S32'])

## List of features (predictors) to be used in the model. Beware: depending on the preprocessing steps not all features might be used.
# Toggle a predictor by (un)commenting its line.

featurelist = [
    'Depth',
    # 'LON', 'LAT',
    'Dist_Land',
    'Dist_Marina',
    'Dist_WWTP',
    'WWTP_influence_as_tracer_mean_dist',
    'WWTP_influence_as_cumulated_residence',
    # 'WWTP_influence_as_mean_time_travelled',
    # 'Dist_WWTP2',
    #'Dist_WWTP_revsq',
    # 'MODE 1 (µm)',
    # 'D10 (µm)',
    'D50 (µm)',
    # 'D90 (µm)',
    # 'perc GRAVEL',
    # 'perc SAND',
    'perc MUD',
    # 'perc CLAY',
    # 'OM_D50',
    'TOC',
    # 'Hg',
    # 'TIC',
    # 'regio_sep',
    'PC1', 'PC2'
    ]

# Predictors / response for the samples used to build the model, and predictors for
# the samples that are only predicted.
model_X = model_data[featurelist]
model_y = model_data[target]
pred_X = pred_data[featurelist]

# Hold out 25 % of the model samples for testing. The seed is fixed so that a
# "Restart & Run All" reproduces the same split (and therefore comparable scores);
# without it every run evaluates on a different random subset.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    model_X, model_y, train_size=0.75, test_size=0.25, random_state=SPLIT_SEED
)

In [3]:
import shutil
from pathlib import Path

from autosklearn.regression import AutoSklearnRegressor

# Working directory for auto-sklearn. It is defined once and used both for the cleanup
# and for the regressor, so the two can never point at different folders.
tmp_folder = Path("/tmp/autosklearn_resampling_example_tmp")

# Remove a leftover working directory from a previous run so the search starts clean.
if tmp_folder.exists():
    shutil.rmtree(tmp_folder)

automl = AutoSklearnRegressor(
    time_left_for_this_task=24000,  # overall search budget, in seconds
    per_run_time_limit=600,         # budget for a single candidate pipeline, in seconds
    tmp_folder=str(tmp_folder),
    disable_evaluator_output=False,
    n_jobs=-1,
    # Candidates are scored by 10-fold cross-validation on the training split.
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 10},
)
automl.fit(X_train, y_train, dataset_name="Schlei_sediments")

AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     n_jobs=-1, per_run_time_limit=600,
                     resampling_strategy='cv',
                     resampling_strategy_arguments={'folds': 10},
                     time_left_for_this_task=24000,
                     tmp_folder='/tmp/autosklearn_resampling_example_tmp')

In [5]:
# Hold-out evaluation. With resampling_strategy="cv" the ensemble members have to be
# refit on the training split before predict() can be used. "Before re-fit" here means
# before the final refit on *all* model samples.
print("Before re-fit")
# Pass copies (as done for the refit on all data) so that X_train / y_train, which are
# reused further down, cannot be altered by the refit.
automl.refit(X_train.copy(), y_train.copy())
predictions = automl.predict(X_test)
print("R2 score:", r2_score(y_test, predictions))


Before re-fit
R2 score: -0.23227335636538582


In [6]:

# Final model: refit the ensemble on every sample that has a measured response.
print("After re-fit")
automl.refit(model_X.copy(), model_y.copy())

# NOTE(review): the score below is computed on the very samples the ensemble was just
# fitted on (model_X includes the former test split). It is an in-sample fit measure,
# not an estimate of predictive performance.
predictions_all = automl.predict(model_X)
r2_all = r2_score(model_y, predictions_all)
print("R2 score:", r2_all)

After re-fit
R2 score: 0.9554340735373072


In [7]:

# Plain-text dump of the models that make up the final ensemble.
ensemble_models = automl.show_models()
print(ensemble_models)

{136: {'model_id': 136, 'rank': 1, 'cost': 33.7452815634619, 'ensemble_weight': 0.14, 'voting_model': VotingRegressor(estimators=None), 'estimators': [{'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f66d5b6fd60>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f66d5a6d370>, 'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0x7f666e9c29a0>, 'sklearn_regressor': MLPRegressor(alpha=0.0015901429576245345, beta_1=0.999, beta_2=0.9,
             hidden_layer_sizes=(57, 57, 57),
             learning_rate_init=0.030415503815134217, max_iter=64,
             n_iter_no_change=32, random_state=1, validation_fraction=0.0,
             verbose=0, warm_start=True)}, {'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f666ecf5160>, 'feature_preprocessor': <autosklearn.pipeline.com

In [8]:

# Summary of the search: metric, best validation score, run/crash/timeout counts.
search_summary = automl.sprint_statistics()
print(search_summary)

# One row per evaluated configuration (all runs, not only the best model).
cv_results = pd.DataFrame(automl.cv_results_)
cv_results

auto-sklearn results:
  Dataset name: Schlei_sediments
  Metric: r2
  Best validation score: -0.236316
  Number of target algorithm runs: 3974
  Number of successful target algorithm runs: 2055
  Number of crashed target algorithm runs: 1890
  Number of target algorithms that exceeded the time limit: 29
  Number of target algorithms that exceeded the memory limit: 0



Unnamed: 0,mean_test_score,rank_test_scores,mean_fit_time,params,status,budgets,param_data_preprocessor:__choice__,param_feature_preprocessor:__choice__,param_regressor:__choice__,param_data_preprocessor:feature_type:numerical_transformer:imputation:strategy,...,param_regressor:gradient_boosting:n_iter_no_change,param_regressor:gradient_boosting:validation_fraction,param_regressor:libsvm_svr:coef0,param_regressor:libsvm_svr:degree,param_regressor:libsvm_svr:gamma,param_regressor:mlp:validation_fraction,param_regressor:sgd:epsilon,param_regressor:sgd:eta0,param_regressor:sgd:l1_ratio,param_regressor:sgd:power_t
0,-22.259696,2239,10.963022,{'data_preprocessor:__choice__': 'feature_type...,Success,0.0,feature_type,no_preprocessing,random_forest,mean,...,,,,,,,,,,
1,-45.604559,2625,5.225044,{'data_preprocessor:__choice__': 'feature_type...,Success,0.0,feature_type,no_preprocessing,gaussian_process,median,...,,,,,,,,,,
2,-11.347622,2062,4.801776,{'data_preprocessor:__choice__': 'feature_type...,Success,0.0,feature_type,polynomial,gaussian_process,most_frequent,...,,,,,,,,,,
3,-8.076222,2018,4.992872,{'data_preprocessor:__choice__': 'feature_type...,Success,0.0,feature_type,polynomial,gaussian_process,most_frequent,...,,,,,,,,,,
4,-5.010014,1959,5.654037,{'data_preprocessor:__choice__': 'feature_type...,Success,0.0,feature_type,polynomial,gaussian_process,most_frequent,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3969,0.000000,1,0.761444,{'data_preprocessor:__choice__': 'feature_type...,Crash,0.0,feature_type,select_rates_regression,sgd,mean,...,,,,,,,0.055052,0.010000,,0.25
3970,0.000000,1,28.098570,{'data_preprocessor:__choice__': 'feature_type...,Timeout,0.0,feature_type,select_percentile_regression,libsvm_svr,most_frequent,...,,,0.277936,4.0,0.016437,,,,,
3971,0.000000,1,2.643786,{'data_preprocessor:__choice__': 'feature_type...,Crash,0.0,feature_type,select_rates_regression,sgd,most_frequent,...,,,,,,,0.067388,0.005043,,0.25
3972,-30.907470,2382,3.732392,{'data_preprocessor:__choice__': 'feature_type...,Success,0.0,feature_type,no_preprocessing,liblinear_svr,most_frequent,...,,,,,,,,,,


In [9]:
# Detailed leaderboard of the models in the final ensemble.
leaderboard = automl.leaderboard(detailed=True)
leaderboard

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration,config_id,train_loss,seed,start_time,end_time,budget,status,data_preprocessors,feature_preprocessors,balancing_strategy,config_origin
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2486,1,0.54,mlp,1.236316,5.000706,2485,0.016215,0,1665244000.0,1665244000.0,0.0,StatusType.SUCCESS,[],[extra_trees_preproc_for_regression],,Random Search
3019,2,0.2,libsvm_svr,2.871434,4.798249,3018,0.123338,0,1665247000.0,1665247000.0,0.0,StatusType.SUCCESS,[],[extra_trees_preproc_for_regression],,Random Search
136,3,0.14,mlp,33.745282,4.726459,135,0.277917,0,1665231000.0,1665231000.0,0.0,StatusType.SUCCESS,[],[select_rates_regression],,Random Search
688,4,0.12,ard_regression,34.542165,4.997399,687,0.017737,0,1665234000.0,1665234000.0,0.0,StatusType.SUCCESS,[],[polynomial],,Random Search


In [None]:
from tpot import TPOTRegressor

# Second AutoML approach for comparison: genetic-programming pipeline search with TPOT,
# fitted on the same training split as the auto-sklearn model.
# random_state is fixed so the stochastic search gives the same pipeline on a re-run.
# (train_test_split is already imported in the first cell and is not needed here.)
TPOT_SEED = 42
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=TPOT_SEED)
tpot.fit(X_train, y_train)

In [None]:
# Score of the best TPOT pipeline on the held-out split.
# NOTE(review): score() uses TPOT's configured scoring function, which is not
# necessarily R² -- confirm the default before comparing it with the R² below.
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)

# R² on the same held-out split, comparable with the auto-sklearn result.
preds = tpot.predict(X_test)
print(f'R²: {r2_score(y_test, preds)}')

# Write the best pipeline found as a standalone Python script.
tpot.export('../plots/tpot_pipeline.py')