In [1]:
%reload_ext autoreload
%autoreload 2

# import warnings
# warnings.filterwarnings('ignore')  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for single cpu)
import os
os.environ["PYTHONWARNINGS"] = "ignore"  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for parallel)

from datetime import datetime
import numpy as np
import pandas as pd
import altair as alt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeavePOut, cross_val_score
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import r2_score

import prepare_data
from components import PCOA
from settings import Config, shortnames, target
from plots import scatter_chart

## Data preparation

In [2]:
%%capture
# cell magic to supress output. Comment it out to see output of this cell.

# What happened so far: DB extract and blank procedure. Now import resulting MP data from csv
mp_pdd = prepare_data.get_pdd()

# Also import sediment data (sediment frequencies per size bin from master sizer export)
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow = PCOA(grainsize_iow, 2)[0]
scor_cau = PCOA(grainsize_cau, 2)[0]

# ...some data wrangling to prepare particle domain data and sample domain data for MP and combine with certain sediment aggregates.
sdd_iow = prepare_data.aggregate_SDD(mp_pdd)
sdd_iow = prepare_data.additional_sdd_merging(sdd_iow, how='outer')
sdd_iow = sdd_iow.merge(scor_iow, right_index=True, left_on='Sample', how='outer')
sdd_iow = sdd_iow.replace({'Sample': shortnames}).sort_values(by='Sample')

sdd_cau = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0).join(prepare_data.fix_gradistat_names(pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0)), how='outer')
sdd_cau = sdd_cau.merge(scor_cau, right_index=True, left_on='Sample', how='outer').reset_index()

## Split data into samples used for building the model and samples used for predicting.
model_data = sdd_iow.loc[~sdd_iow[target].isna()].set_index('Sample')
pred_data = sdd_iow.loc[sdd_iow[target].isna()]
pred_data = pd.concat([pred_data, sdd_cau.drop('Date',axis=1)]).set_index('Sample')

## Potential outlier exclusion
droplist = []#['S32','S05']
model_data = model_data.drop(droplist)

In [3]:
## List of features (predictors) to be used in the model. Beware: depending on the preprocessing steps not all features might be used.

featurelist = [
    'Depth',
    # 'Dist_Land',
    # 'Dist_WWTP',
    # 'SED_D50',
    # 'perc_MUD',
    # 'TOC',
    'PC1',
    #'PC2',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_444'
    ]
model_X = model_data[featurelist]
model_y = model_data[target]
pred_X = pred_data[featurelist]

## Check some basic statistics of the target variable
model_y.describe()
# model_y.hist()

count       28.000000
mean      3965.892857
std       5682.367291
min         48.000000
25%        554.000000
50%       1771.000000
75%       5246.000000
max      27590.000000
Name: Concentration, dtype: float64

In [4]:
# Scale data using StandardScaler

# scaler = StandardScaler()
# model_X_scaled = scaler.fit_transform(model_X)
# pred_X_scaled = scaler.transform(pred_X)

In [5]:
model_X = model_X[['PC1', 'Depth', 'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_444']]
model = TweedieRegressor(alpha=0.1, link='log', max_iter=100000, power=1.5, tol=0.0001)

In [6]:
# for loop to run cross validation on the final model with leave-p-out iterating p from 2 to 10
CV_results = pd.DataFrame()
for P in range(2, 7):
    starttime = datetime.now()
    R2s = cross_val_score(model, model_X, model_y, cv=LeavePOut(P), scoring='r2', n_jobs=-1, verbose=0)
    duration = datetime.now() - starttime
    print(f'Cross-validation of final model with leave-{P}-out took {duration.seconds//3600} hours, {(duration.seconds//60)%60} minutes and {duration.seconds%60} seconds.')
    print(f'Cross-validated MEAN R2 of final model: {R2s.mean():.3f} (Standard deviation: {R2s.std():.3f})')
    print(f'Cross-validated MEDIAN R2 of final model: {np.median(R2s):.3f} (IQR {np.subtract(*np.percentile(R2s, [.75, .25])):.3f})')
    print()
    mean_result = pd.DataFrame({'P': P, 'R2': R2s.mean(), 'aggType': 'mean', 'Dispersion': R2s.std(), 'dispersionType': 'std', 'time-to-compute': duration.seconds}, index=[0])
    median_result = pd.DataFrame({'P': P, 'R2': np.median(R2s), 'aggType': 'median', 'Dispersion': np.subtract(*np.percentile(R2s, [.75, .25])), 'dispersionType': 'IQR', 'time-to-compute': duration.seconds}, index=[0])
    CV_results = pd.concat([CV_results, mean_result, median_result], ignore_index=True)

CV_results['relative_P'] = CV_results['P']/len(model_X)

Cross-validation of final model with leave-2-out took 0 hours, 0 minutes and 5 seconds.
Cross-validated MEAN R2 of final model: -3272.127 (Standard deviation: 35998.622)
Cross-validated MEDIAN R2 of final model: -0.032 (IQR 88240.953)

Cross-validation of final model with leave-3-out took 0 hours, 0 minutes and 22 seconds.
Cross-validated MEAN R2 of final model: -54.287 (Standard deviation: 2118.546)
Cross-validated MEDIAN R2 of final model: 0.299 (IQR 502.806)

Cross-validation of final model with leave-4-out took 0 hours, 2 minutes and 3 seconds.
Cross-validated MEAN R2 of final model: -2.100 (Standard deviation: 30.025)
Cross-validated MEDIAN R2 of final model: 0.385 (IQR 38.337)

Cross-validation of final model with leave-5-out took 0 hours, 9 minutes and 6 seconds.
Cross-validated MEAN R2 of final model: -0.518 (Standard deviation: 7.031)
Cross-validated MEDIAN R2 of final model: 0.415 (IQR 16.399)

Cross-validation of final model with leave-6-out took 0 hours, 38 minutes and 44 s

In [7]:
CV_results

Unnamed: 0,P,R2,aggType,Dispersion,dispersionType,time-to-compute,relative_P
0,2,-3272.127001,mean,35998.622446,std,5,0.071429
1,2,-0.032123,median,88240.952522,IQR,5,0.071429
2,3,-54.287475,mean,2118.546063,std,22,0.107143
3,3,0.299359,median,502.805673,IQR,22,0.107143
4,4,-2.099896,mean,30.024508,std,123,0.142857
5,4,0.385495,median,38.337219,IQR,123,0.142857
6,5,-0.518015,mean,7.030621,std,546,0.178571
7,5,0.415304,median,16.399213,IQR,546,0.178571
8,6,-0.064919,mean,3.218008,std,2324,0.214286
9,6,0.42941,median,14.07115,IQR,2324,0.214286


In [9]:
# Plot line chart of cross-validated R2s (one line as 'median', another as 'mean') with dispersion as error bands (standard deviation or IQR, respectively). For the x-axis, use the relative P in percent.
CV_results['upper'] = CV_results['R2'] + 0.5 * CV_results['Dispersion']
CV_results['lower'] = CV_results['R2'] - 0.5 * CV_results['Dispersion']

lines = alt.Chart(CV_results).mark_line().encode(
    x=alt.X('relative_P:Q', title='P of leave-P-out (% of samples)', axis=alt.Axis(format='%'), scale=alt.Scale(domain=[0, 1])),
    y=alt.Y('R2:Q', title='R2', scale=alt.Scale(domain=[-1, 1])),
    color=alt.Color('aggType:N', title='Aggregation type'),
    tooltip=['P', 'R2', 'time-to-compute']
).properties(
    width=600,
    height=400
).interactive()


# use the column 'Dispersion' as the y-error value for the error band
bands = alt.Chart(CV_results).mark_area(
    opacity=0.3
).encode(
    x=alt.X('relative_P:Q', title=None),
    y=alt.Y('upper:Q', title=None),
    y2=alt.Y2('lower:Q', title=None),
    color=alt.Color('dispersionType:N', title='Dispersion type')
)

lines + bands
    