In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import matplotlib.pyplot as plt 
import seaborn as sns

import altair as alt
alt.data_transformers.disable_max_rows()
import altair_transform

from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

In [None]:
# Load the quality-checked, blank-subtracted microplastic particle list
# (produced by Micropoll_SchleiSediment_blank_subtract.ipynb) and give the
# Size_1 column a bracket-free name.
env_MP = pd.read_csv(
    'env_MP_clean_list_SchleiSediments.csv', index_col=0
).rename(columns={'Size_1_[µm]': 'Size_1_µm'})
#env_MP_a500 = env_MP.loc[env_MP.size_geom_mean >= 500]
#env_MP_b500 = env_MP.loc[env_MP.size_geom_mean < 500]

In [None]:
# Per-sample summary of the particle list: particle counts (total and split at
# size_geom_mean = 500), sampling metadata and the median particle size.
groupy_env_MP = env_MP.groupby(['Sample'])

# String aliases ('mean', 'median') are used instead of np.mean / np.median:
# passing NumPy callables to agg is deprecated in recent pandas and raises a
# FutureWarning, while the aliases give the same result.
mp_station = groupy_env_MP.agg(
        Frequency=('Site_name', 'count'),  # non-null 'Site_name' entries per sample, used as the particle count
        FrequencyA500=('size_geom_mean', lambda x: (x>=500).sum()),  # particles with size_geom_mean >= 500
        FrequencyB500=('size_geom_mean', lambda x: (x<500).sum()),  # particles with size_geom_mean < 500
        # NOTE(review): the original author expects the next four columns to be
        # constant within a sample, so the mean just recovers that value
        # ('first' would be an alternative) - confirm against the data.
        Mass=('Sampling_weight_[kg]', 'mean'),
        GPS_LONs=('GPS_LON', 'mean'),
        GPS_LATs=('GPS_LAT', 'mean'),
        Split=('Fraction_analysed', 'mean'),
        MP_D50=('size_geom_mean', 'median'),
        # TODO: medians for the >= 500 and < 500 size classes are not computed yet.
 ).reset_index()

# Particles per unit of analysed mass (sampled mass times analysed fraction),
# rounded to whole numbers.
analysed_mass = mp_station['Mass'] * mp_station['Split']
mp_station['Concentration'] = (mp_station['Frequency'] / analysed_mass).round()
mp_station['ConcentrationA500'] = (mp_station['FrequencyA500'] / analysed_mass).round()
mp_station['ConcentrationB500'] = (mp_station['FrequencyB500'] / analysed_mass).round()
mp_station.head(1)

In [None]:
# Sediment grain-size (D50) table.
sed_d50 = pd.read_csv('Schlei_Sed_D50_new.csv', index_col=0)
# NOTE(review): sed_63 is read from the same file as sed_d50 and is not used in
# this section - possibly a copy-paste slip; confirm which file was intended.
sed_63 = pd.read_csv('Schlei_Sed_D50_new.csv', index_col=0)

# Organic matter size, TOC and Hg data.
sed_OM = pd.read_csv('Schlei_OM.csv', index_col=0)

# Sampling log.
slogs = pd.read_csv('Schlei_sed_sampling_log.csv', index_col=0)

# Per-sample WWTP distance table.
Dist_WWTP = pd.read_csv('Schlei_Sed_Dist_WWTP.csv', index_col=0)

# Left-join every per-sample table onto the microplastic station summary.
# Each table is indexed by its first CSV column; reset_index() turns that index
# back into a column so the join can run on 'Sample'
# (assumes that first column is named 'Sample' in every file - TODO confirm).
mp_sedStats = mp_station
for station_table in (slogs, sed_d50, sed_OM, Dist_WWTP):
    mp_sedStats = pd.merge(
        mp_sedStats, station_table.reset_index(), on=['Sample'], how='left'
    )

# Persist the combined table and preview the first rows.
mp_sedStats.to_csv('MP_Stats_SchleiSediments.csv')
mp_sedStats.head(2)

In [None]:
# Manual assignment of each sample to a region label.
Regio_Sep = {
    'Schlei_S1_15cm': 'inner',
    'Schlei_S2': 'inner',
    'Schlei_S3': 'inner',
    'Schlei_S5': 'river',
    'Schlei_S8': 'inner',
    'Schlei_S10': 'inner',
    'Schlei_S10_15cm': 'inner',
    'Schlei_S11': 'inner',
    'Schlei_S13': 'inner',
    'Schlei_S14': 'outlier',
    'Schlei_S15': 'inner',
    'Schlei_S17': 'inner',
    'Schlei_S19': 'outlier',
    'Schlei_S22': 'outer',
    'Schlei_S23': 'outer',
    'Schlei_S24': 'outer',
    'Schlei_S25': 'outer',
    'Schlei_S26': 'outer',
    'Schlei_S27': 'outer',
    'Schlei_S30': 'outer',
    'Schlei_S31': 'outer',
}

# One-column lookup table indexed by sample name.
region_lookup = pd.Series(Regio_Sep, name='Regio_Sep').to_frame()

# Attach the region label to the station table. This is merge's default inner
# join, so samples missing from the dictionary are dropped from mp_sedStats.
mp_sedStats = mp_sedStats.merge(region_lookup, left_on='Sample', right_index=True)
#mp_sedStats.columns

In [None]:
# Particle-level columns that are removed from env_MP after the merge below.
unused_particle_columns = [
    'Site_name', 'GPS_LON', 'GPS_LAT', 'Compartment',
    'Contributor', 'Project', 'Size_1_µm', 'Size_2_[µm]', 'Shape', 'Colour',
    'polymer_type', 'library_entry', 'lab_blank_ID', 'sample_ID',
]

# Add the per-sample TOC value and region label to every particle, shorten two
# column names, and drop the columns listed above.
env_MP = (
    env_MP
    .merge(mp_sedStats[['Sample', 'TOC', 'Regio_Sep']], on='Sample')
    .rename(columns={'TOC': 'TOCs', 'Sampling_weight_[kg]': 'Sampling_weight'})
    .drop(unused_particle_columns, axis=1)
)

In [None]:
def optimise_bandwidth(data, kernel, bandwidths=10**np.linspace(0,2,100)):
    """Pick a KDE bandwidth by leave-one-out cross-validated grid search.

    Parameters
    ----------
    data : 1-D array-like
        Observations the kernel density model is fitted to.
    kernel : str
        Kernel name handed to ``KernelDensity``.
    bandwidths : array-like, optional
        Candidate bandwidths; by default 100 values spaced logarithmically
        between 10**0 and 10**2.

    Returns
    -------
    The candidate bandwidth that scored best in the grid search.
    """
    # KernelDensity expects a 2-D (n_samples, n_features) array.
    samples = np.asarray(data).reshape(-1, 1)

    # Use the requested kernel; previously 'gaussian' was hard-coded here and
    # the `kernel` argument was silently ignored.
    grid = GridSearchCV(KernelDensity(kernel=kernel),
                        {'bandwidth': bandwidths},
                        cv=LeaveOneOut())
    grid.fit(samples)
    return grid.best_params_['bandwidth']

In [None]:
def calculate_kde(data, x_d=np.linspace(0,999,1000), optimise_bw=True , kernel='gaussian', fixed_bw=50):
    """Fit a 1-D kernel density estimate and evaluate it on a grid.

    Parameters
    ----------
    data : 1-D array-like
        Observations to fit the density to.
    x_d : 1-D array-like, optional
        Points at which the density is evaluated; by default the 1000 values
        0, 1, ..., 999.
    optimise_bw : bool, optional
        If True, the bandwidth is chosen by ``optimise_bandwidth``; otherwise
        ``fixed_bw`` is used.
    kernel : str, optional
        Kernel name handed to ``KernelDensity``.
    fixed_bw : float, optional
        Bandwidth used when ``optimise_bw`` is False. Defaults to 50, the
        value that used to be hard-coded.

    Returns
    -------
    kde_result : numpy.ndarray
        Estimated density at each point of ``x_d``.
    bw : float
        The bandwidth that was actually used.
    """
    # Accept lists / pandas Series as well as arrays, and force 1-D input.
    data = np.asarray(data).ravel()
    x_d = np.asarray(x_d).ravel()

    bw = optimise_bandwidth(data, kernel) if optimise_bw else fixed_bw

    # Instantiate and fit the KDE model (needs a 2-D column of samples).
    kde = KernelDensity(bandwidth=bw, kernel=kernel)
    kde.fit(data[:, None])
    # score_samples returns the log of the probability density.
    logprob = kde.score_samples(x_d[:, None])
    kde_result = np.exp(logprob)

    return kde_result, bw

#plt.fill_between(x_d, kde_result, alpha=0.5)
#plt.plot(x, np.full_like(x, -0.001), '|k', markeredgewidth=1)
#plt.xlim(0, 1000)

In [None]:
#x = env_MP.loc[env_MP.Sample == 'Schlei_S10', 'size_geom_mean'].values

# One shared evaluation grid, so the 'x_d' column and every density curve are
# guaranteed to refer to the same points.
x_grid = np.linspace(0, 999, 1000)
kde_results = pd.DataFrame({'x_d': x_grid})

# Group by the scalar key 'Sample' rather than the list ['Sample']: iterating
# over a list-keyed groupby yields 1-tuples such as ('Schlei_S10',) in
# pandas >= 2.0, which would end up as tuple column names and legend labels.
SampleGroups = env_MP.groupby('Sample')

# Fixed-bandwidth KDE of the particle sizes, one result column per sample.
for SampleName, SampleGroup in SampleGroups:
    x = SampleGroup.size_geom_mean.values
    kde_result, bw = calculate_kde(x, x_d=x_grid, optimise_bw=False)

    kde_results[SampleName] = kde_result

    print(f'{SampleName}:    bandwidth is {round(bw,2)}')

In [None]:
# Reshape to long form: one row per (x_d, sample) pair, with the sample name in
# 'variable' and the density in 'value'.
kde_long = kde_results.melt(id_vars=['x_d'])

# One line per sample, coloured by sample, with the sample name as tooltip.
kde_chart = alt.Chart(kde_long).mark_line().encode(
    x='x_d',
    y='value',
    color='variable',
    tooltip='variable',
)
kde_chart.interactive()