In [1]:
%reload_ext autoreload
%autoreload 2

# import warnings
# warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.options.plotting.backend = 'holoviews'

from sklearn.utils import parallel_backend
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import GridSearchCV, LeavePOut, LeaveOneOut, cross_validate, KFold

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor

from sklearn.metrics import r2_score

import prepare_data
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target
from plots import scatter_chart
from geo import get_wwtp_influence

## Data preparation

In [2]:
from geo import get_BAW_traces
from geo import tracer_sedimentation_points
from geo import get_schlei
from shapely.geometry import Point, LineString

In [None]:
%%prun
# Load the BAW tracer simulation tracks from the zipped archive. The %%prun cell magic above
# profiles this cell, so it doubles as a measurement of how expensive the loading step is.
tracks = get_BAW_traces('../data/BAW_tracer_simulations.zip')

In [None]:
# Keep an independent copy of the freshly loaded tracks (`trac` is not referenced again in the visible cells).
trac = tracks.copy()

In [None]:
## Arrest every simulated particle at the position of its n-th sediment contact
## (n = Config.arrest_on_nth_sedimentation): all rows from that contact onwards get the same point.
# One group = one particle track (particle id x season x tracer size).
for group_name, group in tracks.groupby(['simPartID', 'season', 'tracer_ESD']):
    group = group.copy()  # work on an explicit copy so the assignment below never targets a view of `tracks`

    # Use the group's own contact_count column; building the mask from the full `tracks` frame
    # costs a pass over all rows for every single group.
    arrest_rows = group.loc[group.contact_count == Config.arrest_on_nth_sedimentation, 'geometry']
    if arrest_rows.empty:
        # This particle never reaches the n-th contact, so there is nothing to arrest.
        # (Previously `.values[0]` raised an IndexError here.)
        continue

    arrest_point = arrest_rows.values[0]
    group.loc[
        group.contact_count >= Config.arrest_on_nth_sedimentation, 'geometry'
    ] = Point(arrest_point.x, arrest_point.y)

    # FIXME(review): the write-back is still disabled, so `tracks` is NOT modified by this cell and
    # everything downstream works on the un-arrested tracks. Re-enable once the intended behaviour is confirmed.
    # tracks.loc[group.index] = group

In [None]:
# Display the tracks ordered by index. The sorted result is not assigned, so `tracks` keeps its current order.
tracks.sort_index()

In [None]:
from geo import load_zipped_grid
# Load the gridded data set from the zipped archive.
# NOTE(review): presumably a digital elevation model of the Schlei, judging by the name `dem` and the file name — confirm.
dem = load_zipped_grid('../data/DGM_Schlei_1982_bis_2002_UTM32.zip')

In [None]:
# NOTE(review): presumably the outline geometry of the Schlei — verify against geo.get_schlei.
polygon = get_schlei()

In [None]:
def _points_to_linestring(points):
    """Build a LineString from one track's geometries.

    A track consisting of a single point is doubled, because a LineString
    cannot be built from one coordinate alone.
    """
    coords = points.tolist()
    if len(coords) == 1:
        coords = coords * 2
    return LineString(coords)


# One line per particle track (season x tracer size x particle id).
track_lines = (
    tracks
    .groupby(['season', 'tracer_ESD', 'simPartID'])['geometry']
    .apply(_points_to_linestring)
)

In [None]:
# Quick visual check: plot only the first track line.
track_lines.head(1).plot()

In [None]:
# Debugging aid: geometries of one specific track (summer, tracer_ESD 300, simPartID 898), with the list
# doubled in the same way the LineString construction doubles single-point tracks.
tracks.loc[(tracks.season=='summer') & (tracks.tracer_ESD==300) & (tracks.simPartID==898)]['geometry'].tolist()*2

In [None]:
# Explore a random sample of 20000 track rows. No random_state is passed, so the sample differs between runs.
tracks.sample(20000).explore()

In [None]:
# %%capture
# Cell magic that suppresses this cell's output; keep it commented out to see the output.

# Upstream steps (DB extract and blank procedure) are done already: load the resulting MP particle data from csv.
mp_pdd = prepare_data.get_pdd()

# Sediment data: grain size frequencies per size bin (master sizer export) for both sample sets.
# Only the first return value of PCOA(..., 2) is kept
# (presumably the ordination scores — confirm in components.PCOA).
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow, scor_cau = (
    PCOA(grainsizes, 2)[0] for grainsizes in (grainsize_iow, grainsize_cau)
)

# IOW samples: aggregate particle domain data to sample domain data, merge the additional
# sample information and the PCoA scores, then switch to the short sample names.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# CAU samples: sampling log joined with the GRADISTAT export (both indexed by their first column),
# then combined with the PCoA scores.
sdd_cau = (
    pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
    .join(
        pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0),
        how='outer',
    )
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)

In [None]:
## Additional variable generation (e.g. predictor derivatives)
 
# sdd_iow['Dist_WWTP_revsq'] = ((1/sdd_iow['Dist_WWTP'])**3)*10000000000  # scaled cube of the inverse distance (1/d)
# sdd_iow['Dist_WWTP_revsq'] = (((sdd_iow['Dist_WWTP'].max()-sdd_iow['Dist_WWTP'])+1)**3)/100000000000  # scaled cube of the distance mirrored at its maximum (d_max - d + 1)
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**3)/100  # scaled cube of the ratio d_max/d
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**2)  # square of the ratio d_max/d

# sdd_iow

In [None]:
## Split data into samples used for building the model and samples used for predicting.

# Samples with a measured concentration are used to build the model ...
model_data = sdd_iow.loc[sdd_iow.Concentration.notna()].set_index('Sample')

# ... samples without one, plus all CAU samples (minus their 'Date' column), are prediction targets.
pred_data = pd.concat([
    sdd_iow.loc[sdd_iow.Concentration.isna()],
    sdd_cau.drop(columns='Date'),
]).set_index('Sample')

In [None]:
## Potential outlier exclusion
#model_data = model_data.drop(['S08','S10d','S05','S32'])

In [None]:
## Features (predictors) offered to the model.
## Beware: depending on the preprocessing steps, not every feature listed here ends up in a model candidate.
## Uncomment an entry to activate it; the order of the active entries is the column order of model_X.

featurelist = [
    # 'Depth',
    # 'LON', 'LAT',
    # 'Dist_Land',
    # 'Dist_Marina',
    # 'Dist_WWTP',
    # 'WWTP_influence_as_tracer_mean_dist',
    'WWTP_influence_as_cumulated_residence',
    # 'WWTP_influence_as_mean_time_travelled',
    # 'Dist_WWTP2',
    # 'Dist_WWTP_revsq',
    # 'MODE 1 (µm)',
    # 'D10 (µm)',
    'D50 (µm)',
    # 'D90 (µm)',
    # 'perc GRAVEL',
    # 'perc SAND',
    # 'perc MUD',
    # 'perc CLAY',
    # 'OM_D50',
    # 'TOC',
    # 'Hg',
    # 'TIC',
    # 'regio_sep',
    # 'PC1',
    'PC2',
]

# Predictors and target of the model-building samples, and predictors of the samples to predict.
model_X = model_data.loc[:, featurelist]
model_y = model_data.loc[:, target]
pred_X = pred_data.loc[:, featurelist]

In [None]:
## Mutual exclusive list (list of lists detailing predictors that are not allowed together in one model candidate)
# NOTE(review): the candidate generator further down skips a feature set only when it contains
# *every* member of a group, so the three-member group below still permits any two of its members
# in the same candidate. Confirm this is intended; otherwise list the forbidden pairs individually.

mutual_exclusive = [
    ['D50 (µm)','PC1','perc MUD'],
    ['Dist_WWTP','Dist_WWTP2'],
    ['TOC', 'perc MUD'],
]

In [None]:
## Check some basic statistics of the target variable

# model_y.describe()
# model_y.hist()
# model_X.info()


## Model building

### Custom preprocessing functions to be used in the model pipeline

In [None]:
## Create exhaustive feature selector: leave-p-out on the column labels enumerates every feature
## subset with at least `min_features` members; subsets violating `mutual_exclusive` are dropped.

min_features = 3  # minimum number of features to be used in the model


def _violates_mutual_exclusion(features, exclusive_groups=mutual_exclusive):
    """Return True if `features` contains every member of at least one exclusive group."""
    present = set(features)
    return any(present.issuperset(group) for group in exclusive_groups)


feature_candidates_list = []
# Leaving out p = 1 .. (n - min_features) columns yields subsets of size n-1 down to min_features.
for i in range(1, len(model_X.columns) + 1 - min_features):
    lpo = LeavePOut(p=i)
    for candidate_indices, _ in lpo.split(model_X.columns):
        feature_candidates = model_X.columns[candidate_indices]
        if _violates_mutual_exclusion(feature_candidates):
            continue  # all entries of some mutual_exclusive row are present: not a valid candidate
        feature_candidates_list.append(feature_candidates)

# The full feature set is a candidate as well and has to obey the same exclusion rule
# (it used to be appended unconditionally).
if not _violates_mutual_exclusion(model_X.columns):
    feature_candidates_list.append(model_X.columns)

if not feature_candidates_list:
    raise ValueError(
        'No feature candidate set is left: every combination of the selected features '
        'violates `mutual_exclusive`. Adjust `featurelist`, `min_features` or `mutual_exclusive`.'
    )


def SelectFeatures(model_X, feature_set, feature_sets=feature_candidates_list):
    """Return only the columns of `model_X` that belong to one candidate feature set.

    Parameters
    ----------
    model_X : pandas.DataFrame
        Input data; must contain all columns of the chosen feature set.
    feature_set : int
        Position of the candidate set within `feature_sets`.
    feature_sets : list of pandas.Index
        Candidate feature sets; bound to `feature_candidates_list` at definition time.

    Returns
    -------
    pandas.DataFrame
        `model_X` restricted to the columns of the chosen feature set.
    """
    return model_X.loc[:, feature_sets[feature_set]]


def _selected_feature_names(transformer, input_features, feature_sets=feature_candidates_list):
    """Report the columns the selector really emits (signature required by FunctionTransformer)."""
    return np.asarray(feature_sets[transformer.kw_args['feature_set']], dtype=object)


# 'one-to-one' would claim that every input column is passed through, although only a subset is
# returned; the callable keeps get_feature_names_out() in line with the transformed data.
CustomFeatureSelector = FunctionTransformer(SelectFeatures, feature_names_out=_selected_feature_names)

In [None]:
# scale the data
# scaler = StandardScaler()
# model_X = pd.DataFrame(scaler.fit_transform(model_X), columns=model_X.columns, index=model_X.index)

### Creating the model pipeline

In [None]:
## Pipeline and parameter grid for model selection, see here for inspiration: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a

# Preprocessing with two active branches: the custom feature selector receives all model_X columns,
# the scaler receives every numeric column.
# NOTE(review): a ColumnTransformer places the outputs of its branches side by side. The scaler branch
# therefore appears to pass on all numeric columns no matter which feature set the selector picks,
# and the selected columns arrive unscaled as additional columns. If selection followed by scaling is
# what is wanted, the two steps would have to be chained (e.g. in a nested Pipeline) — confirm.
PreProcessor = ColumnTransformer([
      ('selector', CustomFeatureSelector, model_X.columns),
      # ('imputer', SimpleImputer(), make_column_selector(dtype_include=np.number)),
      ('scaler', StandardScaler(), make_column_selector(dtype_include=np.number)),
      # ('encoder', OneHotEncoder(), make_column_selector(dtype_include=object)),
      ])

# DummyRegressor is only a placeholder: the 'regressor' entry of the parameter grid below replaces it.
pipe = Pipeline(steps=[
    ('preprocessor', PreProcessor),
    ('regressor', DummyRegressor())
    ])

# Grid over the preprocessing: one entry per candidate feature set, addressed by its position
# in feature_candidates_list and handed to SelectFeatures via the selector's kw_args.
preprocessor_params = {
    'preprocessor__selector': [CustomFeatureSelector],
      'preprocessor__selector__kw_args': [{'feature_set':i} for i in range(len(feature_candidates_list))],
    
    # 'preprocessor__scaler': [StandardScaler()],#MaxAbsScaler(), MinMaxScaler(), RobustScaler(), QuantileTransformer(), Normalizer()],
    #    'preprocessor__scaler__with_mean': [True, False],
    #    'preprocessor__scaler__with_std': [True, False],
    }

# Grid over the regressor and its hyperparameters. Exactly one 'regressor' block may be active,
# because all blocks share the same dictionary keys. Currently active: TweedieRegressor
# with power 2, alpha 1 and a log link.
regressor_params = {
    # 'regressor': [DummyRegressor()],
    #     'regressor__strategy': ['mean', 'median'],
    
    # 'regressor': [SVR()],
    #    'regressor__C': [0.1, 1.5],
    #    'regressor__kernel': ['linear', 'rbf', 'poly'],
    #    'regressor__degree': [2, 3, 4, 5],

    'regressor': [TweedieRegressor(max_iter=1000)],
      'regressor__power': [2],
      # 'regressor__power': [0, 1, 1.25, 1.5, 1.6, 1.7, 1.8, 1.9, 1.95, 1.99, 2, 3],
      'regressor__alpha': [1], 
      'regressor__link': ['log'],#, 'identity', 'auto'],
       #'regressor__fit_params__sample_weights': [None, model_data.loc[model_X.index, 'Mass'].to_numpy()]  # FIXME: fit_params seem not to be accepted from gridsearch params, only as argument in fit method directly...
  

    # 'regressor': [RadiusNeighborsRegressor()],
    #     'regressor__radius': [100000, 200000],
    #     'regressor__weights': ['uniform', 'distance'],
    #     'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #     'regressor__leaf_size': [10, 20, 30, 40, 50],
    # ,
    
    # 'regressor': [RandomForestRegressor()],
    #      'regressor__n_estimators': [10, 50], #[20, 50, 100, 150],
        #  'regressor__max_features': [None, 'sqrt', 'log2'],
        #  'regressor__min_samples_split': [2, 10],
        #  'regressor__min_samples_leaf': [1, 5],
        #  'regressor__bootstrap': [True],
        #  'regressor__max_depth': [None, 5, 50],
    #     #  'regressor__warm_start': [True, False]
    # 
}

# Complete search space: every combination of preprocessing and regressor settings.
params = {**preprocessor_params, **regressor_params}

### Training the model

In [None]:
## The pipeline is run by searching the provided parameter space, using the scores of a
## cross-validation technique to find out how each model candidate performs.

# The first entry of `scoring` decides which candidate is refitted as the best one.
# possibilities: 'neg_root_mean_squared_error', 'neg_mean_squared_error', 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_log_error'
scoring = ['neg_mean_absolute_error']

# Nested cross-validation:
#   inner loop - grid search over `params`, each candidate scored by leave-one-out CV
#   outer loop - 8-fold CV around the whole grid search, to score the selection procedure itself
# (The `with` line used to end in a stray `'):`, a syntax error that stopped the cell from running.)
with parallel_backend('loky', n_jobs=-1):
    innerCV = GridSearchCV(
        pipe,
        params,
        scoring=scoring,
        refit=scoring[0],
        cv=LeaveOneOut(),
        verbose=1,
        # n_jobs=-1  # parallelism is controlled by the surrounding parallel_backend
    )
    outerCV = cross_validate(
        innerCV,
        model_X,
        model_y,
        scoring=scoring,
        cv=8,
        return_train_score=True,
        return_estimator=True,  # keep the fitted grid search of every outer fold for the evaluation cells
        verbose=1,
        # n_jobs=-1
    )

## Evaluating the model

In [None]:
# Tabulate the outer-CV results: best test score first, row label = outer fold number.
outerCV_df = (
    pd.DataFrame(outerCV)
    .sort_values(by=f'test_{scoring[0]}', ascending=False)
    .rename_axis(index='outerCV_fold')
)

## Collect the best inner-CV parameters of every outer fold, one single-row frame per fold.
# pd.DataFrame(best_params, index=[i]) is avoided on purpose: with RandomForestRegressor some internals
# call len() on the dict values, which raises
# AttributeError: 'RandomForestRegressor' object has no attribute 'estimators_'.
# Wrapping each value in a one-element list and assigning column by column works around that.
per_fold_best_params = [pd.DataFrame()]
for i, model in enumerate(outerCV['estimator']):
    best_params = model.best_params_
    current_best_params_df = pd.DataFrame()
    for key, value in best_params.items():
        current_best_params_df[key] = [value]
        current_best_params_df.index = [i]  # label the row with its outer fold number
    per_fold_best_params.append(current_best_params_df)
best_params_df = pd.concat(per_fold_best_params)

# Show scores and chosen parameters side by side (matched on the outer fold number).
outerCV_df.join(best_params_df)

In [None]:
# Predict all model samples with the grid search fitted in one outer fold.
# NOTE(review): `[0]` on the estimator column is a label lookup, i.e. outer fold 0, not the first
# (best-scoring) row of the sorted table. Use `.iloc[0]` if the best fold is meant — confirm.
outerCV_df.estimator[0].predict(model_X)

In [None]:
# Get the inner-CV results of the outer fold which achieved the best score.
# The column names are derived from scoring[0] (as in the outer-CV table) instead of being
# hard-coded, so this cell keeps working when another scoring metric is configured.
innerCV_df = pd.DataFrame(
    outerCV_df.loc[outerCV_df[f'test_{scoring[0]}'].idxmax(), 'estimator'].cv_results_
)
# Best-ranked inner candidate first.
innerCV_df.sort_values(by=f'rank_test_{scoring[0]}', ascending=True)

In [None]:
## Print the score (on all model samples) and the parameters of the model selected in outer fold 0.
# NOTE(review): index 0 is the first outer fold, which need not be the best-scoring one — confirm the intent.

print(
    f'{scoring}: {outerCV["estimator"][0].score(model_X, model_y)}',
    outerCV['estimator'][0].best_params_,
    sep='\n',
)

In [None]:
# Output feature names reported by the preprocessor of the best pipeline found in outer fold 0.
outerCV['estimator'][0].best_estimator_.named_steps['preprocessor'].get_feature_names_out()

In [None]:
## Extracting feature names
# [grid.best_estimator_.named_steps['preprocessor'].named_transformers_['selector'].get_feature_names_out(input_features=model_X.columns.tolist())]

In [None]:
## Show how the model performs on the training data

# `grid` was referenced here without being defined anywhere in the notebook (NameError on a fresh
# kernel). Nested CV only estimates how well the selection procedure generalises; the final model
# is obtained by running the same grid search once on all model samples.
with parallel_backend('loky', n_jobs=-1):
    grid = innerCV.fit(model_X, model_y)

# Use the best model to predict the same samples that were used to train it.
train_pred_y = grid.predict(model_X)
# adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p) with n = number of samples, p = number of features
print(f'R² = {r2_score(model_y, train_pred_y)}')

# Region label, observed target and prediction per sample, as input for the scatter chart.
df = pd.concat(
    [
        model_data.loc[model_y.index].regio_sep,
        model_y,
        pd.Series(train_pred_y, name='Prediction', index=model_y.index),
    ],
    axis=1,
).reset_index()

scatter_chart(df, target, 'Prediction', color='regio_sep', labels='Sample', identity=True, equal_axes=False)[0]

In [None]:
## Take a look at all model candidates and their performance
# NOTE(review): relies on `grid` being a fitted grid search defined by an earlier cell —
# make sure that cell has been run on a fresh kernel.

scores = pd.DataFrame(grid.cv_results_)
# scores