In [None]:
# Reload edited project modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
# Route DataFrame.plot(...) calls through the holoviews plotting backend
# (presumably provided by hvplot -- confirm it is installed in the kernel environment).
pd.options.plotting.backend = 'holoviews'

# Modelling building blocks: feature scaling, the Tweedie GLM and the R² metric.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import r2_score

# Project-local helpers (data import/wrangling, ordination, settings, plotting).
import prepare_data
from components import PCOA
from settings import Config, shortnames, target
from plots import scatter_chart

## Data preparation

In [None]:
%%capture
# cell magic to supress output. Comment it out to see output of this cell.

# What happened so far: DB extract and blank procedure. Now import resulting MP data from csv
mp_pdd = prepare_data.get_pdd()

# Also import sediment data (sediment frequencies per size bin from master sizer export)
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow = PCOA(grainsize_iow, 2)[0]
scor_cau = PCOA(grainsize_cau, 2)[0]

# ...some data wrangling to prepare particle domain data and sample domain data for MP and combine with certain sediment aggregates.
sdd_iow = prepare_data.aggregate_SDD(mp_pdd)
sdd_iow = prepare_data.additional_sdd_merging(sdd_iow, how='outer')
sdd_iow = sdd_iow.merge(scor_iow, right_index=True, left_on='Sample', how='outer')
sdd_iow = sdd_iow.replace({'Sample': shortnames}).sort_values(by='Sample')

sdd_cau = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0).join(pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0), how='outer')
sdd_cau = sdd_cau.merge(scor_cau, right_index=True, left_on='Sample', how='outer').reset_index()

In [None]:
## Additional variable generation (e.g. predictor derivatives)
## All variants below are disabled experiments; 'Dist_WWTP_revsq' is therefore not created by this cell.

# sdd_iow['Dist_WWTP_revsq'] = ((1/sdd_iow['Dist_WWTP'])**3)*10000000000  # cube (not square) of the reciprocal distance, rescaled by a constant factor
# sdd_iow['Dist_WWTP_revsq'] = (((sdd_iow['Dist_WWTP'].max()-sdd_iow['Dist_WWTP'])+1)**3)/100000000000  # cube (not square) of the distance mirrored at its maximum (max - d + 1), rescaled by a constant divisor
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**3)/100  # cube (not square) of max/d, divided by 100
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**2)  # square of max/d

# sdd_iow

In [None]:
## Split the samples: rows with a measured Concentration are used to build the
## model, rows without one (plus all CAU samples) are kept for prediction.

has_concentration = sdd_iow['Concentration'].notna()

model_data = sdd_iow[has_concentration].set_index('Sample')
pred_data = pd.concat(
    [sdd_iow[~has_concentration], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [None]:
## Potential outlier exclusion
# Samples (index labels of model_data) removed before fitting.
# NOTE: re-running this cell without re-running the split cell raises a KeyError,
# because the labels are already gone from model_data.
outlier_samples = ['S10d', 'S32']
model_data = model_data.drop(index=outlier_samples)

In [None]:
## Features (predictors) to be used in the model.
## Beware: depending on the preprocessing steps not all features might be available.
## Un-comment a candidate to include it; active entries are the un-commented ones.

featurelist = [
    # 'Depth',
    # 'LON', 'LAT',
    # 'Dist_Land',
    # 'Dist_Marina',
    'Dist_WWTP',
    # 'WWTP_influence_as_tracer_mean_dist',
    # 'WWTP_influence_as_cumulated_residence',
    # 'WWTP_influence_as_mean_time_travelled',
    # 'Dist_WWTP2',
    # 'Dist_WWTP_revsq',
    # 'MODE 1 (µm)',
    # 'D10 (µm)',
    # 'D50 (µm)',
    # 'D90 (µm)',
    # 'perc GRAVEL',
    # 'perc SAND',
    # 'perc MUD',
    # 'perc CLAY',
    # 'OM_D50',
    # 'TOC',
    # 'Hg',
    # 'TIC',
    # 'regio_sep',
    'PC1',
    # 'PC2',
]

# Predictor matrices for the model samples and the prediction samples,
# and the target column (name taken from settings.target) for the model samples.
model_X = model_data.loc[:, featurelist]
model_y = model_data.loc[:, target]
pred_X = pred_data.loc[:, featurelist]

## Basic statistics of the target variable (displayed as the cell output)
model_y.describe()
# model_y.hist()

In [None]:
# Standardise the predictors with StandardScaler.
# The scaler is fitted on the model samples only and then applied, unchanged,
# to both the model samples and the prediction samples.

scaler = StandardScaler().fit(model_X)
model_X_scaled = scaler.transform(model_X)
pred_X_scaled = scaler.transform(pred_X)

In [None]:
# Build a GLM with a Tweedie error distribution and a log link.
# power=0 selects the Normal distribution (see the TweedieRegressor docs);
# alpha is left at its default, so the coefficients are L2-penalised.

model = TweedieRegressor(
    power=0,
    # alpha=5,
    link='log',
    # fit_intercept=False,
    # warm_start=False,
    max_iter=1000,
    tol=1e-8,
    verbose=1,
    )

# Fit on the *standardised* predictors. The model is evaluated on
# model_X_scaled further down and the power sweep also trains on scaled data;
# fitting on the raw model_X would make those predictions meaningless.
# The scaled values are wrapped in a DataFrame carrying the original column
# names so that model.feature_names_in_ is still populated after fitting.
# (Predicting later with the bare model_X_scaled array works, but scikit-learn
# may emit a feature-name warning.)
# Optional weighting idea from the original cell: pass
# sample_weight=model_data.loc[model_X.index, 'Mass'].to_numpy() to fit().
model_X_fit = pd.DataFrame(model_X_scaled, index=model_X.index, columns=model_X.columns)
model.fit(model_X_fit, model_y)

In [None]:
# Show the feature names the fitted model recorded during fit.
# Per the scikit-learn docs this attribute only exists when fit() received
# input with string column names (e.g. a DataFrame).
model.feature_names_in_

In [None]:
# In-sample (training) goodness of fit of the GLM.
# NOTE(review): the predictions are computed from the scaled predictors, so the
# model must have been fitted on the same scaled representation.
model_y_pred = model.predict(model_X_scaled)
R2 = r2_score(model_y, model_y_pred)
print(f'Training-R2: {R2:.3f}')

# adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1)
# with n = number of samples and p = number of features. The extra "- 1"
# accounts for the fitted intercept (fit_intercept defaults to True); using
# (n - p) would overstate the residual degrees of freedom by one.
n, p = model_X.shape
adjR2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
print(f'adjusted Training-R2: {adjR2:.3f}')

In [None]:
# Test different powers of the Tweedie distribution: fit one GLM per power
# value on the scaled predictors and record training R² and adjusted R².
# NOTE: this loop rebinds `model`, `model_y_pred`, `R2` and `adjR2`; after the
# cell has run they refer to the last power of the sweep, not to the GLM
# fitted in the earlier cell.

# No Tweedie distribution exists for 0 < power < 1 (see the TweedieRegressor
# docs; some scikit-learn versions raise a ValueError for such values), so
# those grid points are skipped.
power_grid = np.linspace(-300, 300, 10000)
power_grid = power_grid[(power_grid <= 0) | (power_grid >= 1)]

# Loop-invariant: number of samples (n) and number of features (p).
n, p = model_X.shape

power_list = []
R2_list = []
R2adj_list = []
for power in power_grid:
    model = TweedieRegressor(power=power, alpha=0.5, link='log', max_iter=10000)
    model.fit(model_X_scaled, model_y)
    model_y_pred = model.predict(model_X_scaled)
    # model_y_pred = np.where(model_y_pred < 0, 0, model_y_pred)
    R2 = r2_score(model_y, model_y_pred)
    # Residual degrees of freedom are n - p - 1: p slopes plus the intercept.
    adjR2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

    power_list.append(power)
    R2_list.append(R2)
    R2adj_list.append(adjR2)

In [None]:
# Power value at which the adjusted R² of the sweep is highest
# (first occurrence if the maximum is reached more than once).
# The same lookup works for R2_list.
best_idx = max(range(len(R2adj_list)), key=R2adj_list.__getitem__)
power_list[best_idx]

In [None]:
# Line plot of training R² against the Tweedie power, one row per sweep step.
sweep_results = pd.DataFrame({'power': power_list, 'R2': R2_list})
sweep_results.plot(x='power', y='R2', kind='line')

In [None]:
# NOTE: everything in this cell is a disabled draft (grid search, model
# persistence and extra error metrics). Re-enabling it requires names that the
# import cell of this notebook does not provide: GridSearchCV, joblib,
# mean_squared_error and mean_absolute_error.

# ## Model fitting

# # Define model
# model = TweedieRegressor(power=1.5, alpha=0.5, link='log', max_iter=10000, tol=1e-6)

# # Define grid search parameters

# param_grid = {
#     'alpha': [0.1, 0.5, 0.9],
#     'power': [1.1, 1.5, 1.9],
#     'link': ['log', 'identity'],
#     'max_iter': [10000],
#     'tol': [1e-6]
# }

# # Define grid search
# grid = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)

# # Fit model
# grid.fit(model_X_scaled, model_y)

# # Print best parameters
# print(grid.best_params_)
# print(grid.best_score_)
# print(grid.best_estimator_)
# print(grid.best_index_)
# print(grid.cv_results_)
# print(grid.scorer_)
# print(grid.refit)

# # Save model
# joblib.dump(grid.best_estimator_, '../models/MP_model.pkl')

# # Load model
# model = joblib.load('../models/MP_model.pkl')

# ## Model evaluation

# # Predictions
# model_y_pred = model.predict(model_X_scaled)
# pred_y_pred = model.predict(pred_X_scaled)

# # Calculate R2
# r2 = r2_score(model_y, model_y_pred)
# print(r2)

# # Calculate RMSE
# rmse = np.sqrt(mean_squared_error(model_y, model_y_pred))
# print(rmse)

# # Calculate MAE
# mae = mean_absolute_error(model_y, model_y_pred)
# print(mae)

# # Calculate MAPE
# mape = np.mean(np.abs((model_y - model_y_pred) / model_y)) * 100
# print(mape)

# # Calculate NSE
# nse = 1 - np.sum((model_y - model_y_pred)**2) / np.sum((model_y - np.mean(model_y))**2)
# print(nse)