In [1]:
import pandas as pd
import altair as alt
import KDE_prepare_data
import KDE_utils
import correlations
from KDE_settings import Config
from scipy import stats
# Select the 'altair_viewer' renderer for displaying charts.
alt.renderers.enable('altair_viewer')
# Lift Altair's row limit for data embedded in charts. This call is the cell's
# last expression, so its return value is what the cell displays as output.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## MP particle size correlations

*This calculates probability density distributions for MP sizes. These are used to estimate the abundances of MP in specific size ranges, which can then be investigated for correlations with environmental parameters such as TOC or sediment grain size properties.*
**Before running, check the computational parameters in KDE_settings.py**

In [2]:
# Load the cleaned MP list; the first CSV column becomes the DataFrame index.
pdd_MP = pd.read_csv(
    '../csv/env_MP_clean_list_SchleiSediments.csv',
    index_col=0,
)


In [3]:
# Data preparation pipeline; each step consumes the result of the previous one,
# so the order must not change.
# NOTE(review): the semantics of these helpers live in KDE_prepare_data and are
# not visible here; presumably "sdd" is per-sample data and "pdd" per-particle
# data - confirm against that module.
sdd_MP = KDE_prepare_data.aggregate_SDD(pdd_MP)
# Later cells read the columns 'Sample', 'Concentration' and 'TOC' from this frame.
sdd_MP_sed = KDE_prepare_data.add_sediment(sdd_MP)
pdd_sdd_MP = KDE_prepare_data.sdd2pdd(sdd_MP_sed, pdd_MP)

In [4]:
# Compute the size distributions; the call prints the bandwidth used for every
# sample. Later cells treat the result as a table with an `x_d` column plus
# further columns (presumably one density column per sample - confirm in KDE_utils).
size_pdfs = KDE_utils.per_sample_kde(pdd_sdd_MP)

Schlei_S10:    bandwidth is 50
Schlei_S10_15cm:    bandwidth is 50
Schlei_S11:    bandwidth is 50
Schlei_S13:    bandwidth is 50
Schlei_S14:    bandwidth is 50
Schlei_S15:    bandwidth is 50
Schlei_S17:    bandwidth is 50
Schlei_S19:    bandwidth is 50
Schlei_S1_15cm:    bandwidth is 50
Schlei_S2:    bandwidth is 50
Schlei_S22:    bandwidth is 50
Schlei_S23:    bandwidth is 50
Schlei_S24:    bandwidth is 50
Schlei_S25:    bandwidth is 50
Schlei_S26:    bandwidth is 50
Schlei_S27:    bandwidth is 50
Schlei_S3:    bandwidth is 50
Schlei_S30:    bandwidth is 50
Schlei_S31:    bandwidth is 50
Schlei_S5:    bandwidth is 50
Schlei_S8:    bandwidth is 50


In [12]:
# Run the correlation search implemented in the external `correlations` module.
# It returns three values, unpacked here as the lower and upper bound of the
# best size range and a results table `df_r`.
# NOTE(review): presumably the same computation as the in-notebook test cell that
# follows, which reassigns all three names - confirm against correlations.py.
bestLower, bestUpper, df_r = correlations.range_conc_correlation(size_pdfs, sdd_MP_sed)

In [12]:
#####USED for testing external function inside NB: #####
# In-notebook version of the size-range/TOC correlation, restricted to a single
# hard-coded size range. It reassigns `df_r`, `bestLower` and `bestUpper`, i.e.
# it replaces the values returned by correlations.range_conc_correlation, and it
# leaves `range_conc` behind for the scatter plot further down.

# Size span divided by the number of KDE steps; used to turn summed densities
# into a probability (presumably the spacing of the `x_d` grid - confirm in KDE_utils).
step = (Config.upper_size_limit - Config.lower_size_limit) / Config.kde_steps
df_r = pd.DataFrame(columns=['lower_size', 'upper_size', 'r', 'p'])

# Loop-invariant: per-sample table indexed by sample name.
samples = sdd_MP_sed.set_index('Sample')

for i in [10]:
    for j in [1000]:
        # Sum the density of every column over the grid points inside [i, j).
        in_range = (size_pdfs.x_d >= i) & (size_pdfs.x_d < j)
        size_sum = size_pdfs.loc[in_range].sum().drop('x_d')
        range_prob = size_sum * step
        # Label-aligned product: the result may be ordered differently from `samples`.
        range_conc = range_prob * samples.Concentration

        # pearsonr pairs values by position, not by label, so put TOC into the
        # same sample order as range_conc before correlating.
        toc = samples.TOC.reindex(range_conc.index)
        r_value, p_value = stats.pearsonr(range_conc, toc)
        df_r.loc[len(df_r)] = [i, j, r_value, p_value]
        print(f'Correlating TOC with size range            [{i},        {j}]                ', end="\r", flush=True)

# Terminate the carriage-return progress line so the table below is not
# printed on top of it.
print()

# Row(s) with the highest correlation coefficient; the first one defines the best range.
best = df_r.loc[df_r.r == df_r.r.max()]
print(best)
bestLower, bestUpper = best.iloc[0, 0:2]

   lower_size  upper_size         r         p0,        1000]                
0        10.0      1000.0  0.535529  0.012352


In [None]:
# Reshape to long form (x_d / variable / value) and draw one line per variable.
size_pdfs_long = size_pdfs.melt(id_vars=['x_d'])
pdf_lines = alt.Chart(size_pdfs_long).mark_line().encode(
    x='x_d', y='value', color='variable', tooltip='variable',
)
# Last expression of the cell: displays the zoomable/pannable chart.
pdf_lines.interactive()

In [20]:
# Scatter plot of the size-range concentration against TOC (one point per
# sample) with a linear fit and its parameters as a text annotation.
# Requires `range_conc` from the in-notebook correlation test cell.
samples = sdd_MP_sed.set_index('Sample')
# Name the concentration series so the y field/axis is not called 'Unnamed 0'.
scatter_df = pd.DataFrame(
    [range_conc.rename('range_conc'), samples.TOC]
).T.reset_index()

base = alt.Chart(scatter_df)

kd = base.mark_point().encode(
    x='TOC',
    y='range_conc',
    color='index',
    tooltip='index'
)

# The fit layers are derived from `base`, not from `kd`: inheriting the
# color='index' encoding would override the mark colour, so the line would
# not be drawn in red.
Reg_Line = base.transform_regression(
    'TOC', 'range_conc', method="linear"
).mark_line(color="red").encode(
    x='TOC',
    y='range_conc'
)

# With params=True the regression transform emits the model parameters
# (coef, rSquared) instead of fitted points. For the linear method the model is
# y = coef[0] + coef[1] * x, which is what the label spells out.
Reg_Params = base.transform_regression(
    'TOC', 'range_conc', method="linear", params=True
).transform_calculate(
    params=(
        '"r² = " + round(datum.rSquared * 100)/100'
        ' + "      y = " + round(datum.coef[0] * 10)/10'
        ' + " + " + round(datum.coef[1] * 10000)/10000 + " x"'
    )
).mark_text(align='left', lineBreak='\n').encode(
    x=alt.value(120),  # pixels from left
    y=alt.value(20),  # pixels from top
    text='params:N'
)

kd + Reg_Line + Reg_Params