In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import matplotlib.pyplot as plt 
import seaborn as sns

import altair as alt
alt.data_transformers.disable_max_rows()
import altair_transform

from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

In [2]:
# Load the quality-checked, blank-subtracted list of MP particles written by
# Micropoll_SchleiSediment_blank_subtract.ipynb, and rename the first size
# column to a bracket-free name ('Size_1_[µm]' -> 'Size_1_µm').
env_MP = (
    pd.read_csv('../csv/env_MP_clean_list_SchleiSediments.csv', index_col=0)
    .rename(columns={'Size_1_[µm]': 'Size_1_µm'})
)
# Optional split of the particle list at size_geom_mean = 500 (currently unused):
#env_MP_a500 = env_MP.loc[env_MP.size_geom_mean >= 500]
#env_MP_b500 = env_MP.loc[env_MP.size_geom_mean < 500]

In [3]:
# Per-station summary of the particle list: one row per Sample with particle
# counts (all, size_geom_mean >= 500, size_geom_mean < 500), sample metadata
# and the median particle size.
#
# Aggregations are given as string aliases ('mean', 'median') rather than the
# numpy callables: recent pandas versions emit a FutureWarning when np.mean /
# np.median are passed to groupby.agg, and the string form yields the same result.
mp_station = env_MP.groupby(['Sample']).agg(
        # Number of particles in the group; any always-present column works for
        # 'count' (groupby(...).size() would count rows without naming a column).
        Frequency=('Site_name', 'count'),
        # Particles at or above / below the 500 size threshold (boolean mask summed).
        FrequencyA500=('size_geom_mean', lambda x: (x>=500).sum()),
        FrequencyB500=('size_geom_mean', lambda x: (x<500).sum()),
        # The per-sample metadata below is expected to be constant within a
        # sample, so the mean simply recovers that value ('first' would too).
        Mass=('Sampling_weight_[kg]', 'mean'),
        GPS_LONs=('GPS_LON', 'mean'),
        GPS_LATs=('GPS_LAT', 'mean'),
        Split=('Fraction_analysed', 'mean'),
        # Median of the geometric-mean particle size per sample.
        MP_D50=('size_geom_mean', 'median')
 ).reset_index()

# Concentration = particle count divided by the analysed mass
# (sampling weight in kg times the analysed fraction), rounded to whole particles.
analysed_mass = mp_station['Mass'] * mp_station['Split']
for suffix in ('', 'A500', 'B500'):
    mp_station[f'Concentration{suffix}'] = (mp_station[f'Frequency{suffix}'] / analysed_mass).round()

mp_station.head(1)

Unnamed: 0,Sample,Frequency,FrequencyA500,FrequencyB500,Mass,GPS_LONs,GPS_LATs,Split,MP_D50,Concentration,ConcentrationA500,ConcentrationB500
0,Schlei_S10,297,4.0,293.0,0.25,9.60355,54.5253,1.0,85.732141,1188.0,16.0,1172.0


In [4]:
# Per-station, per-polymer summary: one row per (Sample, polymer_type) with the
# particle count and the sample metadata needed to turn it into a concentration.
#
# String aliases ('mean') are used instead of np.mean because recent pandas
# versions emit a FutureWarning for numpy callables in groupby.agg; the result
# is the same.
mp_poly_station = env_MP.groupby(['Sample','polymer_type']).agg(
        # Number of particles in the group; any always-present column works for
        # 'count' (groupby(...).size() would count rows without naming a column).
        Frequency=('Site_name', 'count'),
        # Per-sample metadata is expected to be constant within a sample, so the
        # mean simply recovers that value ('first' would too).
        Mass=('Sampling_weight_[kg]', 'mean'),
        GPS_LONs=('GPS_LON', 'mean'),
        GPS_LATs=('GPS_LAT', 'mean'),
        Split=('Fraction_analysed', 'mean')
 ).reset_index()

# Concentration = particle count divided by the analysed mass
# (sampling weight in kg times the analysed fraction), rounded to whole particles.
mp_poly_station['Concentration'] = (
    mp_poly_station['Frequency'] / (mp_poly_station['Mass'] * mp_poly_station['Split'])
).round()

mp_poly_station.head(1)#.shape


Unnamed: 0,Sample,polymer_type,Frequency,Mass,GPS_LONs,GPS_LATs,Split,Concentration
0,Schlei_S10,Poly (ethylene terephthalate),193,0.25,9.60355,54.5253,1.0,772.0


In [5]:
# Bar chart of MP concentration per station, coloured by polymer type.
# The legend is bound to a multi-selection on polymer_type, and the chart is
# filtered by that selection, so picking legend entries restricts the bars shown.

# Order in which the stations appear along the x axis.
Sample_order = [
    'Schlei_S1_15cm', 'Schlei_S2', 'Schlei_S3', 'Schlei_S5', 'Schlei_S8',
    'Schlei_S10', 'Schlei_S10_15cm', 'Schlei_S11', 'Schlei_S13', 'Schlei_S14',
    'Schlei_S15', 'Schlei_S17', 'Schlei_S19', 'Schlei_S22', 'Schlei_S23',
    'Schlei_S24', 'Schlei_S25', 'Schlei_S26', 'Schlei_S27', 'Schlei_S30',
    'Schlei_S31',
]

selection = alt.selection_multi(fields=['polymer_type'], bind='legend')

Poly_Dist = (
    alt.Chart(mp_poly_station)
    .mark_bar()
    .encode(
        x=alt.X('Sample', sort=Sample_order),
        y='Concentration',
        color='polymer_type',
        tooltip=['polymer_type', 'Concentration'],
    )
    .add_selection(selection)
    .transform_filter(selection)
)

# Variant with relative shares, kept for reference:
# Poly_Dist | Poly_Dist.encode(y=alt.Y('Concentration', stack='normalize'))
Poly_Dist

## look at specific polymer types

In [7]:
# Load the per-station sediment tables (each indexed by its first CSV column)
# and left-join them onto the per-station MP summary via the 'Sample' column.

# D50 values
sed_d50 = pd.read_csv('../csv/Schlei_Sed_D50_new.csv', index_col=0)
# NOTE(review): sed_63 is read from the same file as sed_d50 and is not used in
# this cell — confirm which file was intended.
sed_63 = pd.read_csv('../csv/Schlei_Sed_D50_new.csv', index_col=0)

# organic matter size, TOC, Hg data
sed_OM = pd.read_csv('../csv/Schlei_OM.csv', index_col=0)

# sampling log data
slogs = pd.read_csv('../csv/Schlei_sed_sampling_log.csv', index_col=0)

Dist_WWTP = pd.read_csv('../csv/Schlei_Sed_Dist_WWTP.csv', index_col=0)

# Merge one table at a time, in a fixed order; how='left' keeps every station
# of mp_station even when a table has no matching row.
mp_sedStats = mp_station
for station_table in (slogs, sed_d50, sed_OM, Dist_WWTP):
    mp_sedStats = pd.merge(mp_sedStats, station_table.reset_index(), on=['Sample'], how='left')

# Persist the combined table, then preview it.
mp_sedStats.to_csv('../csv/MP_Stats_SchleiSediments.csv')
mp_sedStats.head(2)

TypeError: Can only merge Series or DataFrame objects, a <class 'list'> was passed

In [None]:
# Manual assignment of every sample to a region label.
Regio_Sep =  {'Schlei_S1_15cm': 'inner',
              'Schlei_S2': 'inner',
              'Schlei_S3': 'inner',
              'Schlei_S5': 'river',
              'Schlei_S8': 'inner',
              'Schlei_S10': 'inner',
              'Schlei_S10_15cm': 'inner',
              'Schlei_S11': 'inner',
              'Schlei_S13': 'inner',
              'Schlei_S14': 'outlier',
              'Schlei_S15': 'inner',
              'Schlei_S17': 'inner',
              'Schlei_S19': 'outlier',
              'Schlei_S22': 'outer',
              'Schlei_S23': 'outer',
              'Schlei_S24': 'outer',
              'Schlei_S25': 'outer',
              'Schlei_S26': 'outer',
              'Schlei_S27': 'outer',
              'Schlei_S30': 'outer',
              'Schlei_S31': 'outer'}

# One-column lookup table indexed by sample name.
region_lookup = pd.DataFrame.from_dict(Regio_Sep, orient='index', columns=['Regio_Sep'])

# Attach the region label to the station table. Any 'Regio_Sep' column left over
# from a previous run of this cell is dropped first, so re-running the cell does
# not produce suffixed duplicates ('Regio_Sep_x' / 'Regio_Sep_y').
# The merge uses the default inner join: samples missing from the dictionary
# are dropped from mp_sedStats.
mp_sedStats = (
    mp_sedStats
    .drop(columns='Regio_Sep', errors='ignore')
    .merge(region_lookup, left_on='Sample', right_index=True)
)
#mp_sedStats.columns

In [None]:
# Scatter of MP concentration against D50, coloured by region, overlaid with a
# per-region regression line and a text annotation of the fitted parameters.

# One regression model for both the drawn line and the reported parameters, so
# the annotation describes the curve that is actually shown.
REG_METHOD = "exp"

MP_SED_Plot = alt.Chart(mp_sedStats).mark_point().encode(
    x='D50',
    y=alt.Y('Concentration', scale=alt.Scale(type='linear')),
    color='Regio_Sep',
    tooltip='Sample'
)

Reg_Line = MP_SED_Plot.transform_regression(
    'D50', 'Concentration',
    method=REG_METHOD,
    groupby=["Regio_Sep"],
).mark_line(color="red")

# Vega expression for the annotation text: r² rounded to 2 decimals, followed by
# the fitted curve. The exponential model is multiplicative, y = a * e^(b x),
# with a = coef[0] and b = coef[1].
Reg_Params_label = (
    '"r² = " + round(datum.rSquared * 100)/100 + '
    '"      y = " + round(datum.coef[0] * 10)/10 + " * e ^ (" + '
    'round(datum.coef[1] * 10000)/10000 + "x" + ")"'
)

# params=True makes the regression transform return the model parameters
# (coef, rSquared) per group instead of points along the fitted line.
# NOTE(review): every region's label is placed at the same fixed pixel
# position, so labels of several regions are drawn on top of each other.
Reg_Params = MP_SED_Plot.transform_regression(
    'D50', 'Concentration',
    method=REG_METHOD,
    groupby=["Regio_Sep"],
    params=True
).mark_text(align='left', lineBreak='\n').encode(
    x=alt.value(120),  # pixels from left
    y=alt.value(20),  # pixels from top
    text='params:N'
).transform_calculate(
    params=Reg_Params_label
)

#ParamsDF = altair_transform.extract_data(Reg_Params)

MP_SED_Plot + Reg_Line + Reg_Params

In [None]:
# Add the station-level TOC and region label to every particle row, rename two
# columns, and drop the particle-level columns listed below.
# NOTE: this cell rebinds env_MP from its previous value, so it is meant to run
# once after env_MP has been loaded; a second run would fail on the dropped columns.
env_MP = (
    env_MP
    .merge(mp_sedStats[['Sample', 'TOC', 'Regio_Sep']], on='Sample')
    .rename(columns={'TOC': 'TOCs', 'Sampling_weight_[kg]': 'Sampling_weight'})
    .drop(columns=[
        'Site_name', 'GPS_LON', 'GPS_LAT', 'Compartment',
        'Contributor', 'Project', 'Size_1_µm', 'Size_2_[µm]', 'Shape', 'Colour',
        'polymer_type', 'library_entry', 'lab_blank_ID', 'sample_ID',
    ])
)

In [None]:
# Linked view: per-sample size density curves on top, MP concentration vs. TOC
# below. Brushing a size range on the density plot restricts the particles that
# enter the concentration calculation.

# Size range over which the density curves are evaluated.
lower_size_limit = 0
upper_size_limit = 1000

# Interval selection along x; it selects on the x field of the chart it is
# added to (here: 'size').
brush = alt.selection(type='interval', encodings=['x'], resolve='global')

PDFs = alt.Chart(
    env_MP,
#    width=100,
#    height=80
).transform_density(
    'size_geom_mean', #Size_1_µm
    extent=[lower_size_limit, upper_size_limit],
    as_=['size', 'density'],
    groupby=['Sample'],
    steps=200,
    bandwidth=50,
    cumulative=False,
    counts=False
).mark_line().encode(
    x='size:Q',
    y=alt.Y('density:Q', stack=None),
    color='Sample',
    tooltip=['Sample']
#).facet(
#    'Sample:N',
#    columns=7
).add_selection(
    brush
)

# Transforms run in the order they are added, so the brush filter has to come
# BEFORE the aggregation: afterwards the individual particle sizes are gone and
# the filter would have nothing to test against.
MPconcTOC = alt.Chart(env_MP
).transform_calculate(
    # Expose the particle size under the field name the brush selects on.
    size='datum.size_geom_mean'
).transform_filter(
    brush
).transform_aggregate(
    Frequency='count(*)',
    Mass='mean(Sampling_weight)',
    Split='mean(Fraction_analysed)',
    TOC='mean(TOCs)',
    # Regio_Sep is carried through the aggregation so it is still available for
    # the colour encoding; it is constant per Sample, so the groups are unchanged.
    groupby=['Sample', 'Regio_Sep']
).transform_calculate(
    Conc='datum.Frequency / (datum.Mass * datum.Split)'
).mark_point().encode(
    x='TOC:Q',
    y='Conc:Q',
    color='Regio_Sep:N',
    tooltip=['Sample:N']
)

PDFs & MPconcTOC

In [None]:
def optimise_bandwidth(data, kernel, bandwidths=10**np.linspace(0,2,100)):
    """Select a kernel-density bandwidth by leave-one-out cross-validation.

    Parameters
    ----------
    data : 1-D numpy array
        Observations; reshaped to a single-feature column before fitting.
    kernel : str
        Kernel name handed to ``KernelDensity``.
    bandwidths : array-like, optional
        Candidate bandwidths. The default grid holds 100 values spaced
        logarithmically from 10**0 to 10**2.

    Returns
    -------
    The candidate bandwidth that the grid search reports as best.
    """
    search = GridSearchCV(
        KernelDensity(kernel=kernel),
        {'bandwidth': bandwidths},
        cv=LeaveOneOut(),
    )
    # The estimator expects a 2-D (n_samples, n_features) input.
    search.fit(data[:, None])
    return search.best_params_['bandwidth']

In [None]:
def calculate_kde(data, x_d=np.linspace(0,999,1000), optimise_bw=True , kernel='gaussian', bw=50):
    """Evaluate a kernel density estimate of ``data`` on the grid ``x_d``.

    Parameters
    ----------
    data : 1-D array-like
        Observations the density is estimated from.
    x_d : 1-D array-like, optional
        Discrete values at which the probability density is evaluated.
    optimise_bw : bool, optional
        If True, the bandwidth is chosen by ``optimise_bandwidth``;
        otherwise ``bw`` is used as given.
    kernel : str, optional
        Kernel name handed to ``KernelDensity``.
    bw : float, optional
        Bandwidth used when ``optimise_bw`` is False. Defaults to 50, the value
        that was previously hard-coded.

    Returns
    -------
    (kde_result, bandwidth)
        ``kde_result`` is the density at each point of ``x_d``; ``bandwidth``
        is the bandwidth that was actually used.
    """
    # Accept lists / pandas Series as well as numpy arrays.
    data = np.asarray(data)
    x_d = np.asarray(x_d)

    bandwidth = optimise_bandwidth(data, kernel) if optimise_bw else bw

    # instantiate and fit the KDE model (expects 2-D input: one feature column)
    kde = KernelDensity(bandwidth=bandwidth, kernel=kernel)
    kde.fit(data[:, None])
    # score_samples returns the log of the probability density
    logprob = kde.score_samples(x_d[:, None])
    kde_result = np.exp(logprob)

    return kde_result, bandwidth

#plt.fill_between(x_d, kde_result, alpha=0.5)
#plt.plot(x, np.full_like(x, -0.001), '|k', markeredgewidth=1)
#plt.xlim(0, 1000)

In [None]:
# Evaluate a fixed-bandwidth KDE of the particle sizes for every sample on a
# common grid, and collect the curves as columns of kde_results (one column per
# sample, plus the grid itself in 'x_d').
#x = env_MP.loc[env_MP.Sample == 'Schlei_S10', 'size_geom_mean'].values

kde_x = np.linspace(0,999,1000)
kde_results = pd.DataFrame({'x_d': kde_x})

# Group by the plain column name, not a one-element list: with a list, newer
# pandas versions yield 1-tuples as group keys, which would turn the column
# names (and the printed sample names) into tuples.
for SampleName, SampleGroup in env_MP.groupby('Sample'):
    x = SampleGroup.size_geom_mean.values
    # Pass the grid explicitly so the curve always matches the 'x_d' column.
    kde_result, bw = calculate_kde(x, x_d=kde_x, optimise_bw=False)

    kde_results[SampleName] = kde_result

    print(f'{SampleName}:    bandwidth is {round(bw,2)}')

In [None]:
# Reshape the KDE table to long form (one row per grid point and sample; melt
# names the columns 'variable' and 'value') and draw one line per sample.
kde_long = kde_results.melt(id_vars=['x_d'])

alt.Chart(kde_long).mark_line().encode(
    x='x_d',
    y='value',
    color='variable',
    tooltip='variable',
).interactive()

In [None]:
# MP concentration vs. TOC per sample, restricted to particles whose
# size_geom_mean lies between two slider-controlled limits.

# Each bound single-selection exposes its value as the signal
# '<name>_<field>', i.e. lower_size_size_geom_mean / upper_size_size_geom_mean.
slider_low = alt.binding_range(min=0, max=20000, step=10)
select_low = alt.selection_single(name="lower_size", fields=['size_geom_mean'],
                                  bind=slider_low, init={'size_geom_mean': 0})
slider_up = alt.binding_range(min=0, max=20000, step=10)
select_up = alt.selection_single(name="upper_size", fields=['size_geom_mean'],
                                  bind=slider_up, init={'size_geom_mean': 20000})

# Transforms run in the order they are added. The size filters must therefore
# come BEFORE the aggregation: once the rows are aggregated per sample,
# size_geom_mean no longer exists and a filter on it would remove every row.
MPconcTOC = alt.Chart(env_MP
).transform_filter(
    'datum.size_geom_mean >= lower_size_size_geom_mean'
).transform_filter(
    'datum.size_geom_mean <= upper_size_size_geom_mean'
).transform_aggregate(
    Frequency='count(*)',
    Mass='mean(Sampling_weight)',
    Split='mean(Fraction_analysed)',
    TOC='mean(TOCs)',
    # Regio_Sep is carried through the aggregation so it is still available for
    # the colour encoding; it is constant per Sample, so the groups are unchanged.
    groupby=['Sample', 'Regio_Sep']
).transform_calculate(
    Conc='datum.Frequency / (datum.Mass * datum.Split)'
).mark_point().encode(
    x='TOC:Q',
    y='Conc:Q',
    color='Regio_Sep:N'
).add_selection(
    select_low
).add_selection(
    select_up
)

MPconcTOC

In [None]:
# Minimal reproduction of the slider-filtered aggregate chart on the iris demo
# data set: density curves per species next to an aggregated scatter whose input
# rows are restricted by two sliders on sepalWidth.
import altair as alt
from vega_datasets import data
source = data.iris()

# Each bound single-selection exposes its value as the signal
# '<name>_<field>', i.e. lowerSize_sepalWidth / upperSize_sepalWidth.
#brush = alt.selection(type='interval', encodings=['x'])
slider_low = alt.binding_range(min=0, max=10, step=0.1)
select_low = alt.selection_single(name="lowerSize", fields=['sepalWidth'],
                                  bind=slider_low, init={'sepalWidth': 0})
slider_up = alt.binding_range(min=0, max=10, step=0.1)
select_up = alt.selection_single(name="upperSize", fields=['sepalWidth'],
                                  bind=slider_up, init={'sepalWidth': 10})


PDFs = alt.Chart(source
).transform_density(
    'sepalWidth',
    as_=['size','density'],
    groupby=['species']
).mark_line().encode(
    x='size:Q',
    y='density:Q',
    color='species'
#).add_selection(
#    brush
)


# Transforms run in the order they are added: filter the raw rows on sepalWidth
# first, then aggregate per species. After the aggregation only the groupby key
# and the aggregated fields exist, so the axes use those fields
# (sepalL_mean and Value), not the raw sepalWidth column.
Scatter = alt.Chart(source
).transform_filter(
    'datum.sepalWidth >= lowerSize_sepalWidth'
).transform_filter(
    'datum.sepalWidth <= upperSize_sepalWidth'
).transform_aggregate(
    Frequency='count()',
    petalL_mean='mean(petalLength)',
    petalW_mean='mean(petalWidth)',
    sepalL_mean='mean(sepalLength)',
    groupby=['species']
).transform_calculate(
    Value='datum.Frequency / (datum.petalL_mean * datum.petalW_mean)'
).mark_point().encode(
    x='sepalL_mean:Q',
    y='Value:Q',
    color='species:N'
).add_selection(
    select_low
).add_selection(
    select_up
)

PDFs | Scatter