In [None]:
%reload_ext autoreload
%autoreload 2

# import warnings
# warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
pd.options.plotting.backend = 'holoviews'
from tqdm import tqdm

import sklearn
# Report the scikit-learn version in use so that results can be tied to a specific release.
print(f'sklearn version: {sklearn.__version__}')
from sklearn.utils import parallel_backend
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import GridSearchCV, LeavePOut, LeaveOneOut, cross_validate, KFold, PredefinedSplit

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor

from sklearn.metrics import r2_score

from sklearn import set_config
# set_config(transform_output='pandas')  # only works for sklearn >= 1.2

import prepare_data
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target
from plots import scatter_chart
from cv import generate_feature_sets, best_median_score

## Data preparation

In [None]:
%%capture
# Cell magic to suppress output. Comment it out to see the output of this cell.

# The DB extract and the blank procedure have already been run upstream;
# here the resulting MP data is imported from csv.
mp_pdd = prepare_data.get_pdd()

# Sediment data (sediment frequencies per size bin from the master sizer export).
# Only the first two entries returned by get_grainsizes() are used, and for each of them
# element [0] of the PCOA(..., 2) result is kept.
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow, scor_cau = (PCOA(grainsize, 2)[0] for grainsize in (grainsize_iow, grainsize_cau))

# Data wrangling: derive the sample domain data from the MP particle domain data,
# merge in additional sample data and the PCOA result, then map the sample names
# through `shortnames` and order the rows by sample.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# CAU samples: sampling-log metadata outer-joined with the GRADISTAT export,
# then combined with the CAU PCOA result in the same way as above.
cau_metadata = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
cau_gradistat = pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0)
sdd_cau = (
    cau_metadata
    .join(cau_gradistat, how='outer')
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)

In [None]:
## Split data into samples used for building the model (Concentration is available)
## and samples used for predicting (Concentration is missing, plus all CAU samples).

has_concentration = sdd_iow['Concentration'].notna()

model_data = sdd_iow.loc[has_concentration].set_index('Sample')
pred_data = pd.concat(
    [sdd_iow.loc[~has_concentration], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [None]:
## List of features (predictors) to be used in the model.
# Beware: depending on the preprocessing steps, not all of these features might end up being used.
# Entries that are commented out are not part of `featurelist`.
# NOTE(review): the WWTP_influence_* names appear to follow the pattern
#   WWTP_influence_as_<metric>__<sed|nosed>_<18µm|allsizes>_<allseasons|spring|autumn>_<222|444>
# The meaning of the individual name parts is not defined in this notebook — confirm against the data source.
# NOTE(review): a few entries carry a trailing `# *` marker whose meaning is not documented here.

featurelist = [
    'Depth',
    # 'LON', 'LAT',
    'Dist_Land',
    # 'Dist_Marina',
    'Dist_WWTP',
    # 'WWTP_influence_as_tracer_mean_dist',
    # 'WWTP_influence_as_cumulated_residence',
    # 'WWTP_influence_as_mean_time_travelled',
    'WWTP_influence_as_tracer_mean_dist__sed_18µm_allseasons_444',
    'WWTP_influence_as_endpoints_mean_dist__sed_18µm_allseasons_444',
    'WWTP_influence_as_cumulated_residence__sed_18µm_allseasons_444',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_18µm_spring_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_spring_444',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_spring_444',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_spring_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_spring_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_spring_444',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_spring_444',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_spring_444',
    'WWTP_influence_as_tracer_mean_dist__sed_18µm_autumn_222',
    'WWTP_influence_as_endpoints_mean_dist__sed_18µm_autumn_222',
    'WWTP_influence_as_cumulated_residence__sed_18µm_autumn_222',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_autumn_222',
    'WWTP_influence_as_tracer_mean_dist__sed_18µm_autumn_444',
    'WWTP_influence_as_endpoints_mean_dist__sed_18µm_autumn_444',
    'WWTP_influence_as_cumulated_residence__sed_18µm_autumn_444',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_autumn_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_18µm_autumn_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_autumn_222',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_autumn_222',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_autumn_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_autumn_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_autumn_222',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_autumn_222',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_autumn_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_allseasons_444',            # *
    'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_allseasons_444',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_allseasons_444',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_allseasons_444',         # *
    'WWTP_influence_as_tracer_mean_dist__sed_allsizes_allseasons_444',              # *
    'WWTP_influence_as_endpoints_mean_dist__sed_allsizes_allseasons_444',
    'WWTP_influence_as_cumulated_residence__sed_allsizes_allseasons_444',
    'WWTP_influence_as_mean_time_travelled__sed_allsizes_allseasons_444',           # *
    'WWTP_influence_as_tracer_mean_dist__sed_18µm_allseasons_222',
    'WWTP_influence_as_endpoints_mean_dist__sed_18µm_allseasons_222',
    'WWTP_influence_as_cumulated_residence__sed_18µm_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_18µm_allseasons_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_allseasons_222',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_allseasons_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_18µm_spring_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_spring_222',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_spring_222',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_spring_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_allseasons_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_allseasons_222',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_allseasons_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_autumn_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_autumn_444',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_autumn_444',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_autumn_444',
    'WWTP_influence_as_tracer_mean_dist__sed_18µm_spring_444',
    'WWTP_influence_as_endpoints_mean_dist__sed_18µm_spring_444',
    'WWTP_influence_as_cumulated_residence__sed_18µm_spring_444',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_spring_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_18µm_autumn_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_autumn_444',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_autumn_444',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_autumn_444',
    'WWTP_influence_as_tracer_mean_dist__sed_18µm_spring_222',
    'WWTP_influence_as_endpoints_mean_dist__sed_18µm_spring_222',
    'WWTP_influence_as_cumulated_residence__sed_18µm_spring_222',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_spring_222',
    'WWTP_influence_as_tracer_mean_dist__sed_allsizes_allseasons_222',
    'WWTP_influence_as_endpoints_mean_dist__sed_allsizes_allseasons_222',
    'WWTP_influence_as_cumulated_residence__sed_allsizes_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__sed_allsizes_allseasons_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_spring_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_spring_222',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_spring_222',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_spring_222',
    # 'Dist_WWTP2',
    # 'Dist_WWTP_revsq',
    'MODE 1 (µm)',
    # 'D10 (µm)',
    'D50 (µm)',
    # 'D90 (µm)',
    # 'perc GRAVEL',
    # 'perc SAND',
    'perc MUD',
    # 'perc CLAY',
    # 'OM_D50',
    'TOC',
    # 'Hg',
    # 'TIC',
    # 'regio_sep',
    'PC1',
    'PC2'
    ]
# Predictor columns and the `target` column(s) of the modelling samples,
# and the same predictor columns of the samples to predict for.
model_X, model_y, pred_X = (
    model_data[featurelist],
    model_data[target],
    pred_data[featurelist],
)

In [None]:
# Workaround for creating long candidate lists faster:
# the candidate feature sets are generated from a short feature list in which one
# placeholder stands in for all the WWTP features. Afterwards every set that contains
# the placeholder is expanded into one set per WWTP feature of `featurelist`.
# The final list of candidate sets is pickled into ../data.

import pickle

wwtp_placeholder = 'WWTP'

featurelist_simplified = [
    'Depth',
    'Dist_Land',
    wwtp_placeholder,  # just put one placeholder "WWTP" instead of all the WWTP features
    'MODE 1 (µm)',
    'D50 (µm)',
    'perc MUD',
    'TOC',
    'PC1',
    'PC2'
]

# NOTE(review): presumably the minimum / maximum number of features per candidate set —
# confirm against cv.generate_feature_sets. Both values also end up in the pickle file name.
min_num, max_num = 2, 5
feature_candidates_list_simpl = generate_feature_sets(
    featurelist_simplified,
    Config.mutual_exclusive,
    Config.exclusive_keywords,
    num_feat=(min_num, max_num),
    n_jobs=1,
    save=False,
)

# All entries of `featurelist` whose name contains "WWTP" (this also matches 'Dist_WWTP').
# Computed once, in `featurelist` order, instead of re-filtering for every candidate set.
wwtp_features = [feat for feat in featurelist if wwtp_placeholder in feat]

# Candidate sets without the placeholder are taken over unchanged and come first.
new_feature_candidates_list = [
    candidate
    for candidate in feature_candidates_list_simpl
    if wwtp_placeholder not in candidate
]

for candidate in feature_candidates_list_simpl:
    if wwtp_placeholder not in candidate:
        continue
    # Locate the placeholder once per set and by exact match, so the lookup does not
    # depend on the previously substituted feature name still containing "WWTP".
    placeholder_pos = candidate.index(wwtp_placeholder)
    for wwtp_feature in wwtp_features:
        expanded = list(candidate)  # fresh list: the simplified candidate stays untouched
        expanded[placeholder_pos] = wwtp_feature
        new_feature_candidates_list.append(expanded)

with open(f'../data/feature_candidates_list_min{min_num}_max{max_num}.pkl', 'wb') as f:
    pickle.dump(new_feature_candidates_list, f)