In [None]:
%reload_ext autoreload
%autoreload 2

from itertools import combinations

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression
from sklearn.model_selection import GridSearchCV, train_test_split, LeavePOut, LeaveOneOut

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer


from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score

import prepare_data
from components import PCOA
from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target
from plots import scatter_chart

## Data preparation

In [None]:
%%capture
# cell magic to suppress output. Comment it out to see output of this cell.

# Upstream steps (DB extract and blank procedure) are already done; load the resulting MP data from csv.
mp_pdd = prepare_data.get_pdd()

# Sediment data (sediment frequencies per size bin from master sizer export).
# For each data set, keep the first element of the PCOA result as its scores.
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow, scor_cau = (PCOA(grainsizes, 2)[0] for grainsizes in (grainsize_iow, grainsize_cau))

# IOW: prepare the sample domain data from the MP particle domain data, combine it with
# further sample-level data and the PCOA scores, then map sample names via `shortnames`.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# CAU: sampling log joined with the GRADISTAT export, then merged with the PCOA scores.
cau_sampling_log = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
cau_gradistat = pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0)
sdd_cau = (
    cau_sampling_log.join(cau_gradistat, how='outer')
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)

In [None]:
# Additional variable generation (e.g. predictor derivatives).

# Reversed WWTP distance (largest distance in the data minus the sample's distance), squared.
dist_wwtp_reversed = sdd_iow['Dist_WWTP'].max() - sdd_iow['Dist_WWTP']
sdd_iow['Dist_WWTP_revsq'] = dist_wwtp_reversed ** 2

In [None]:
# Rows with a Concentration value form the model data; IOW rows without one,
# together with all CAU rows (minus their 'Date' column), form the prediction data.
has_concentration = sdd_iow.Concentration.notna()
model_data = sdd_iow.loc[has_concentration].set_index('Sample')
pred_data = pd.concat(
    [sdd_iow.loc[~has_concentration], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [None]:
# Potential outlier exclusion: remove these samples (index labels) from the model data.
outlier_samples = ['S08', 'S10d', 'S05', 'S32']
model_data = model_data.drop(index=outlier_samples)

In [None]:
# Predictors handed to the model search. Commented entries are further
# candidate columns that are currently switched off.
featurelist = [
    # 'Depth',
    # 'LON', 'LAT',
    # 'Dist_Land',
    # 'Dist_Marina',
    'Dist_WWTP',
    # 'Dist_WWTP2',
    # 'Dist_WWTP_revsq',
    # 'MODE 1 (µm)',
    # 'D10 (µm)',
    # 'D50 (µm)',
    # 'D90 (µm)',
    # 'perc GRAVEL',
    # 'perc SAND',
    'perc MUD',
    # 'perc CLAY',
    # 'OM_D50',
    'TOC',
    # 'Hg',
    # 'TIC',
    # 'regio_sep',
    # 'PC1', 'PC2'
]

# Predictors and response of the model data, and the predictors of the prediction data.
model_X = model_data.loc[:, featurelist]
model_y = model_data.loc[:, target]
pred_X = pred_data.loc[:, featurelist]

In [None]:
# Mutually exclusive predictors: each inner list names predictors that are not
# allowed together in one model candidate. The trailing empty lists are unused slots.
mutual_exclusive = [
    ['D50 (µm)', 'PC1', 'perc MUD'],
    ['Dist_WWTP', 'Dist_WWTP2'],
] + [[] for _ in range(2)]

In [None]:
# Display summary statistics of the response (model_y) before model building.
model_y.describe()
# model_y.hist()


## Model building

### Preprocessing

In [None]:
# cat_cols = model_X.select_dtypes(exclude=['number', 'bool']).columns.to_list()  # get categorical columns

# transformer = OneHotEncoder()
# transformed = transformer.fit_transform(model_X[cat_cols])
# columns = transformer.get_feature_names_out(cat_cols)
# transformed_df = pd.DataFrame.sparse.from_spmatrix(transformed, columns=columns)

# model_X = pd.concat([model_X.reset_index().drop(cat_cols, axis=1), transformed_df], join='inner', axis=1).set_index('Sample')

In [None]:
# Create the candidates for an exhaustive feature selection: every non-empty
# combination of the columns of model_X, from single predictors up to and
# including the full feature set. Candidates that contain two or more predictors
# of the same `mutual_exclusive` group are skipped.
#
# NOTE: candidates are ordered by increasing subset size, so the position of a
# given subset in `feature_candidates_list` differs from the former
# LeavePOut-based ordering (which also never produced the full feature set).


def _violates_mutual_exclusion(candidate, exclusive_groups):
    """Return True if `candidate` contains more than one label of any group in `exclusive_groups`."""
    return any(len(set(group).intersection(candidate)) > 1 for group in exclusive_groups)


feature_candidates_list = []

n_columns = len(model_X.columns)
for subset_size in range(1, n_columns + 1):
    for candidate_positions in combinations(range(n_columns), subset_size):
        # Index by position so each candidate stays a pandas Index of column labels.
        feature_candidates = model_X.columns[list(candidate_positions)]
        if _violates_mutual_exclusion(feature_candidates, mutual_exclusive):
            continue
        feature_candidates_list.append(feature_candidates)

def SelectFeatures(model_X, feature_set=0, feature_sets=feature_candidates_list):
    """Return the columns of `model_X` that belong to one feature candidate.

    `feature_set` is the position within `feature_sets` of the column labels to keep.
    """
    selected_columns = feature_sets[feature_set]
    return model_X.loc[:, selected_columns]

# Wrap the function as a transformer so it can be used as a preprocessing step.
CustomFeatureSelector = FunctionTransformer(SelectFeatures)

In [None]:
# Preprocessing steps. Only the feature selector is active and receives all
# columns of model_X; the commented entries are further optional steps.
preprocessing_steps = [
    ('selector', CustomFeatureSelector, model_X.columns),
    # ('imputer', SimpleImputer(), make_column_selector(dtype_include=np.number)),
    # ('scaler', StandardScaler(), make_column_selector(dtype_include=np.number)),
    # ('encoder', OneHotEncoder(), make_column_selector(dtype_include=object)),
]
PreProcessor = ColumnTransformer(preprocessing_steps)

In [None]:
# Pipeline for model selection (preprocessing followed by a regressor), see here for
# inspiration: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a
# The regressor step is created with a DummyRegressor.
pipeline_steps = [
    ('preprocessor', PreProcessor),
    ('regressor', DummyRegressor()),
]
pipe = Pipeline(pipeline_steps)

# One `kw_args` option per feature candidate: the selector step is tried with every candidate.
feature_set_options = [
    {'feature_set': candidate_no} for candidate_no in range(len(feature_candidates_list))
]

# Grid for the support vector regressor (wider value ranges are kept in the trailing comments).
svr_grid = {
    'preprocessor__selector__kw_args': feature_set_options,
    'regressor': [SVR()],
    'regressor__C': [1],  # [0.1, 0.5, 1.0],
    'regressor__kernel': ['linear'],  # ['linear', 'rbf', 'poly'],
    'regressor__degree': [1],  # [1, 2, 3, 4, 5],
}

# Further grid options, currently switched off:
#
# {'preprocessor__scaler': [StandardScaler()],
#     'preprocessor__scaler__with_mean': [True, False],
#     'preprocessor__scaler__with_std': [True, False],
# },
# {'preprocessor__scaler': [MaxAbsScaler()]
# },
# {'preprocessor__scaler': [MinMaxScaler()],
# },
# {'regressor': [DummyRegressor()],
#     'regressor__strategy': ['mean', 'median']
# },
# {'regressor': [RandomForestRegressor()],
#     'regressor__n_estimators': [10, 20, 50, 100, 150],
#     'regressor__max_features': [None, 'sqrt', 'log2'],
#     'regressor__min_samples_split': [2, 5, 10],
#     'regressor__min_samples_leaf': [1, 2, 5],
#     'regressor__bootstrap': [True, False],
#     'regressor__max_depth': [None, 5, 10, 20, 50],
#     'regressor__warm_start': [True, False]
# },

params = [svr_grid]

### Pipeline

In [None]:

# Pipeline and parameter grid for model selection using PipelineHelper

# pipe = Pipeline([
#     ('scaler', PipelineHelper([
#         ('std', StandardScaler()),
#         ('max', MaxAbsScaler()),
#     #     # ('minmax', MinMaxScaler()),
#     ],optional=False)),  # set to True to also try with no scaler activated (i.e. no scaling). Equivalent to using StandardScaler's 'with_mean' and 'with_std' set to False.

#     ('selector', PipelineHelper([
#         ('selectkbest', SelectKBest()),
#         # ('efs', EFS()),
#     ])),

#     ('regressor', PipelineHelper([
#         ('dummy', DummyRegressor()),
#         # ('glm', SMWrapper(family=Config.glm_family, formula=Config.glm_formula)),
#         ('svm', SVR()),
#         # ('rf', RandomForestRegressor()),
#         # ('ada', AdaBoostRegressor()),
#         # ('gb', GradientBoostingRegressor()),
#         # ('knn', KNeighborsRegressor()),
#         # ('rnn', RadiusNeighborsRegressor()),
#         # ('nb_pipe', Pipeline([
#             # ('scaler', MinMaxScaler()),  # Naive Bayes needs positive numbers
#             # ('nb', GaussianNB()),
#         # ])),
#     ])),
# ])

# params = {
#     'scaler__selected_model': pipe.named_steps['scaler'].generate(
#         {
#             'std__with_mean': [True, False],
#             'std__with_std': [True, False],
#             # no params for 'max' and 'minmax' leads to using standard params
#         }
#     ),
#     'selector__selected_model': pipe.named_steps['selector'].generate(
#         {
#             'selectkbest__k': [1, 2, 3, 4, 5],
#             'selectkbest__score_func': [mutual_info_regression, f_regression],
#             # 'efs__max_features': [model_X.shape[1]],
#         }
#     ),
#     'regressor__selected_model': pipe.named_steps['regressor'].generate(
#         {
#             'dummy__strategy': ['mean', 'median'],
#             # 'glm__alpha': [0.0, 0.1, 0.2, 0.5, 1.0],
#             # 'glm__L1_wt': [0.1, 0.5, 1],
#             'svm__C': [1], #[0.1, 0.5, 1.0],
#             'svm__kernel': ['linear'], #['linear', 'rbf', 'poly'],
#             'svm__degree': [1], #[1, 2, 3, 4, 5],
#             # 'rf__n_estimators': [10, 20, 50, 100, 150],
#             # 'rf__max_features': ['sqrt', 'log2', None],
#             # 'rf__min_samples_split': [2, 5, 10],
#             # 'rf__min_samples_leaf': [1, 2, 4],
#             # 'rf__bootstrap': [True, False],
#             # 'rf__max_depth': [None, 2, 5, 10],
#             # 'rf__warm_start': [True, False],
#             # 'ada__n_estimators': [10, 20, 40, 100],
#             # 'ada__learning_rate': [0.1, 0.5, 1.0, 2.0],
#             # 'ada__loss': ['linear', 'square', 'exponential'],
#             # 'gb__n_estimators': [10, 20, 50, 100],
#             # 'gb__criterion': ['friedman_mse', 'squared_error'],
#             # 'gb__max_features': ['sqrt', None],
#             # 'knn__n_neighbors': [2, 3, 5, 7, 10],
#             # 'knn__leaf_size': [1, 2, 3, 5],
#             # 'knn__weights': ['uniform', 'distance'],
#             # 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#             # 'rnn__radius': [1000, 5, 2, 1, 0.5],
#             # 'rnn__weights': [
#             #     # 'uniform',
#             #     'distance'
#             #     ],
#             # 'rnn__algorithm': [
#             #     'auto',
#             #     # 'ball_tree',
#             #     # 'kd_tree',
#             #     # 'brute'
#             #     ],
#             # 'rnn__leaf_size': [5, 2, 1],
#             # 'nb_pipe__nb__prior': None,  # if using NB choose priors first!
#         }
#     ),
# }


In [None]:
# Scoring metrics for the grid search; the first entry is also the one used for refitting.
# Possibilities: 'neg_root_mean_squared_error', 'neg_mean_squared_error', 'r2',
# 'neg_mean_absolute_error', 'neg_mean_squared_log_error'
scoring = ['neg_mean_absolute_error']

# Grid search over all parameter combinations with leave-one-out cross-validation.
# TODO: Possible to set random state for all estimators?
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=LeaveOneOut(),
    scoring=scoring,
    refit=scoring[0],
    error_score=np.nan,
    n_jobs=-1,
    verbose=1,
)

grid.fit(model_X, model_y)


In [None]:
# Report the best parameter combination and its score, one per line.
print(grid.best_params_, grid.best_score_, sep='\n')

In [None]:
# grid.best_estimator_.get_feature_names_out()

In [None]:
# scores = pd.DataFrame(grid.cv_results_)
# scores

In [None]:
# In-sample prediction: the best model predicts the same samples that were used to train it.
self_pred_y = grid.predict(model_X)  # predictions for the training samples themselves, not for unseen data

In [None]:
# R² between the observed response and the in-sample predictions (self_pred_y comes from
# predicting model_X itself, so this is not a cross-validated score).
r2_score(model_y, self_pred_y)

In [None]:
# Table for plotting: per sample the regio_sep column, the observed response and the
# in-sample prediction; the index is turned back into a regular column.
regio_sep_column = model_data.loc[model_y.index].regio_sep
prediction = pd.Series(self_pred_y, index=model_y.index, name='Prediction')
df = pd.concat([regio_sep_column, model_y, prediction], axis=1).reset_index()

In [None]:

# Chart of the observed response against the prediction; the first element of
# scatter_chart's return value is displayed as the cell output.
scatter_chart(
    df,
    target,
    'Prediction',
    color='regio_sep',
    labels='Sample',
    identity=True,
    equal_axes=False,
)[0]