In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, LeavePOut

from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler

from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR

import prepare_data
from components import PCOA
from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target

## Data preparation

In [2]:
# Upstream steps (DB extract and blank procedure) are already done; load the
# resulting microplastic (MP) particle data from csv.
mp_pdd = prepare_data.get_pdd()

# Sediment data: frequencies per size bin from the master sizer export.
# Only the first two items returned by get_grainsizes() are used here.
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[:2]

# Keep item 0 of each PCOA result (called with 2 as second argument);
# presumably the sample scores on PC1/PC2 — confirm in components.PCOA.
scor_iow = PCOA(grainsize_iow, 2)[0]
scor_cau = PCOA(grainsize_cau, 2)[0]

# IOW sample domain data: aggregate the MP particle data per sample, merge the
# additional sample-level data, then attach the PCoA result on 'Sample'.
sdd_iow = prepare_data.additional_sdd_merging(
    prepare_data.aggregate_SDD(mp_pdd),
    how='outer',
).merge(scor_iow, right_index=True, left_on='Sample', how='outer')

# CAU sample domain data: sampling log joined with the GRADISTAT export
# (both indexed by their first csv column), then the PCoA result on 'Sample'.
cau_sampling_log = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
cau_gradistat = pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0)
sdd_cau = (
    cau_sampling_log
    .join(cau_gradistat, how='outer')
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)

Removed 0 samples with less than 0 particles.
Series([], dtype: float64)
PCoA: Proportion explained: 
 PC1    0.796518
PC2    0.105473
dtype: float64    PCoA Total: 0.901991651784579
PCoA: Proportion explained: 
 PC1    0.676644
PC2    0.153534
dtype: float64    PCoA Total: 0.8301779851270285


  mp_pdd.columns = mp_pdd.columns.str.replace("[\[( )\]]", "")  # remove brackets from column names
  warn(
  warn(


In [3]:
# Replace values in the 'Sample' column via the `shortnames` mapping and
# order the rows by sample name.
sdd_iow = sdd_iow.replace({'Sample': shortnames}).sort_values(by='Sample')

# Rows that have a Concentration value are used for model building ...
has_concentration = sdd_iow['Concentration'].notna()
model_data = sdd_iow.loc[has_concentration].set_index('Sample')

# ... rows without one, plus all CAU rows (minus their 'Date' column),
# form the data to predict on.
pred_data = pd.concat(
    [sdd_iow.loc[~has_concentration], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [4]:
# Columns used as predictors; commented-out entries are currently excluded.
featurelist = [
    'Depth',
    'LON',
    'LAT',
    # 'Dist_Marina', 'Dist_WWTP', 'Dist_WWTP2',
    'MODE 1 (µm)',
    'D10 (µm)',
    'D50 (µm)',
    'D90 (µm)',
    'perc GRAVEL',
    'perc SAND',
    'perc MUD',
    'perc CLAY',
    #'OM_D50', 'TOC', 'Hg', 'TIC', 'regio_sep',
    'PC1',
    'PC2',
]

# Predictors and response for model building, predictors for prediction.
model_X = model_data.loc[:, featurelist]
model_y = model_data.loc[:, target]
pred_X = pred_data.loc[:, featurelist]

## Model building

In [5]:

# Seed handed to every estimator that draws random numbers, so that repeated
# runs of the search produce the same fitted models and scores.
RANDOM_STATE = 42

# Two-step pipeline; each step is a PipelineHelper holding several
# alternatives, one of which is selected per grid-search candidate through
# the '<step>__selected_model' parameter defined in `params` below.
pipe = Pipeline([
    # Step 1: feature scaling. `optional=True` presumably adds a
    # "no scaling" choice to the grid — confirm in helpers.PipelineHelper.
    ('scaler', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler()),
        ('minmax', MinMaxScaler()),
    ], optional=True)),

    # Step 2: the model. The step keeps its historical name 'classifier'
    # although it mostly holds regressors; renaming it would change the
    # parameter keys used in `params`.
    ('classifier', PipelineHelper([
        # ('glm', SMWrapper(family=Config.glm_family, formula=Config.glm_formula)),
        ('svm', SVR()),
        # Ensemble models are seeded for reproducibility. This assumes
        # PipelineHelper only sets the grid parameters on these instances and
        # keeps the constructor arguments — TODO confirm.
        ('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
        ('ada', AdaBoostRegressor(random_state=RANDOM_STATE)),
        ('gb', GradientBoostingRegressor(random_state=RANDOM_STATE)),
        ('knn', KNeighborsRegressor()),
        ('rnn', RadiusNeighborsRegressor()),
        ('nb_pipe', Pipeline([
            # Original rationale: Naive Bayes needs positive numbers, hence
            # the fixed MinMaxScaler in front of it.
            # NOTE(review): GaussianNB is a classifier; confirm it is meant
            # to take part in a regression search scored with r2.
            ('scaler', MinMaxScaler()),
            ('nb', GaussianNB()),
        ])),
    ])),
])

# Search space. Keys are '<model name>__<parameter>'; generate() turns them
# into the candidate values for the respective 'selected_model' parameter.
params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate(
        {
            'std__with_mean': [True, False],
            'std__with_std': [True, False],
            # no params for 'max' and 'minmax' leads to using standard params
        }
    ),
    'classifier__selected_model': pipe.named_steps['classifier'].generate(
        {
            # 'glm__alpha': [0.0, 0.1, 0.2, 0.5, 1.0],
            # 'glm__L1_wt': [0.1, 0.5, 1],
            'svm__C': [0.1, 0.5, 1.0],
            'svm__kernel': ['linear', 'rbf', 'poly'],
            # NOTE(review): SVR uses `degree` only with the 'poly' kernel, so
            # the 'linear'/'rbf' candidates are repeated once per degree.
            'svm__degree': [1, 2, 3, 4, 5],
            'rf__n_estimators': [10, 20, 50, 100, 150],
            'rf__max_features': ['sqrt', 'log2', None],
            'rf__min_samples_split': [2, 5, 10],
            'rf__min_samples_leaf': [1, 2, 4],
            'rf__bootstrap': [True, False],
            'rf__max_depth': [None, 2, 5, 10],
            # NOTE(review): warm_start presumably has no effect when every
            # candidate is fitted from a fresh clone — verify, it doubles
            # the number of random-forest candidates.
            'rf__warm_start': [True, False],
            'ada__n_estimators': [10, 20, 40, 100],
            'ada__learning_rate': [0.1, 0.5, 1.0, 2.0],
            'ada__loss': ['linear', 'square', 'exponential'],
            'gb__n_estimators': [10, 20, 50, 100],
            'gb__criterion': ['friedman_mse', 'squared_error'],
            'gb__max_features': ['sqrt', None],
            'knn__n_neighbors': [2, 3, 5, 7, 10],
            'knn__leaf_size': [1, 2, 3, 5],
            'knn__weights': ['uniform', 'distance'],
            'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'rnn__radius': [0.1, 0.5, 1, 2, 5, 10],
            'rnn__weights': ['uniform', 'distance'],
            'rnn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'rnn__leaf_size': [1, 2, 3, 5],
            # 'nb_pipe__nb__prior': None,
        }
    ),
}


In [7]:
# Exhaustive search over all scaler/model combinations defined in `params`.
# TODO: Possible to set random state for all estimators?
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    # possibilities: 'neg_root_mean_squared_error', 'neg_mean_squared_error',
    # 'r2', 'neg_mean_absolute_error', 'neg_mean_squared_log_error'
    scoring='r2',
    # Leave-2-out: every pair of samples is held out as test set once.
    cv=LeavePOut(2),
    # -1 = use all available processors.
    n_jobs=-1,
    verbose=1,
)

grid.fit(model_X, model_y)


Fitting 465 folds for each of 18354 candidates, totalling 8534610 fits


In [None]:
# Show the best parameter combination, then its mean cross-validated score.
print(grid.best_params_, grid.best_score_, sep='\n')