In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import altair as alt
from scipy import stats

# On the phy-server the local analysis modules (prepare_data, KDE_utils, ...) are
# only importable when their directory is on sys.path. The csv files below are
# read via relative paths, so that directory is also made the working directory.
# On any other machine the directory does not exist and nothing is changed.
import os
import sys

ANALYSIS_DIR = "/silod7/lenz/MPSchleiSediments/analysis/"

if os.path.isdir(ANALYSIS_DIR):
    if ANALYSIS_DIR not in sys.path:  # re-running this cell must not pile up duplicate entries
        sys.path.append(ANALYSIS_DIR)
    try:
        os.chdir(ANALYSIS_DIR)
    except OSError as err:  # e.g. missing permissions: stay best-effort, but do not hide it
        print(f"Could not change working directory to {ANALYSIS_DIR}: {err}")

import prepare_data
import KDE_utils
import correlations
from pcoa import sed_pcoa
from pca import sed_pca
import glm
from settings import Config

# alt.renderers.enable('altair_viewer')  # use to display altair charts externally in browser instead of inline (only activate in non-vega-compatible IDE like pycharm)
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Load and prepare data
**Before running, check the computational parameters in settings.py**

Nomenclature:
- *pdd*: particle domain data = data is particle based, meaning one entry (row) corresponds to one particle and one feature (column) corresponds to one property observed for that particle
- *sdd*: sample domain data = data is sample based, meaning one entry (row) corresponds to one sample and one feature (column) corresponds to one property observed for that sample
- *mp*: data on microplastics
- *sed*: data on sediments

In [2]:
# What happened so far: DB extract and blank procedure. Now import the resulting MP data from csv.
# The first csv column becomes the row index (index_col=0).
# Both paths are relative to the current working directory.
mp_pdd = pd.read_csv('../csv/env_MP_clean_list_SchleiSediments.csv', index_col=0)

# Also import sediment data (sediment frequencies per size bin from master sizer export)
sed_sdd = pd.read_csv('../csv/sediment_grainsize.csv')

In [3]:
# Get the binning structure of the imported sediment data and optionally rebin it (make binning coarser) for faster computation.
# NOTE: sed_sdd is rebound to the prepared table here, so this cell may not be idempotent —
# re-run the csv import cell first when executing it again.
sed_sdd, sed_lower_boundaries = prepare_data.sediment_preps(sed_sdd)

In [4]:
# Data wrangling: aggregate the particle domain data (pdd) into sample domain data (sdd) for MP,
# then combine it with certain sediment aggregates.
mp_sdd = prepare_data.aggregate_SDD(mp_pdd)  # TODO: Schlei_S29 has a sampling_mass of 0 in MPDB. It is manually set to 0.25 kg in the imported csv file for now, but needs to be checked and corrected in the DB and the csv recreated from the DB.
mp_added_sed_sdd = prepare_data.add_sediment(mp_sdd)
# mp_added_metadata_pdd = prepare_data.sdd2pdd(mp_added_sed_sdd, mp_pdd)  # TODO: not used. Remove?

## MP particle size distributions

*This calculates probability density distributions for MP sizes. These are used to estimate the abundances of MP of specific size ranges, which can then be investigated for correlations to environmental parameters such as TOC or sediment grain size properties.*
**Before running, check the computational parameters in settings.py**

In [5]:
# Calculate per-sample KDEs for the probability of finding a MP particle in a specific size bin.
# The lower boundaries of the sediment size bins are passed along with the particle data.
# Computational parameters are taken from settings.py (see the note above this cell).
size_pdfs = KDE_utils.per_sample_kde(mp_pdd, sed_lower_boundaries)

Schlei_S8:    bandwidth is 50                        

In [6]:
# Get MP concentrations for each single-step size bin by combining the size
# probability densities with the sample domain data (mp_sdd).
mp_size_conc = KDE_utils.probDens2conc(size_pdfs, mp_sdd)

In [7]:
# MP and sediment abundances per size bin are equalised (only samples and bins contained in both of them are kept).
# They are then melted and merged into mp_sed_melt.
# NOTE: mp_size_conc and sed_sdd are rebound to their equalised versions here.
mp_size_conc, sed_sdd, mp_sed_melt = prepare_data.equalise_mp_and_sed(mp_size_conc, sed_sdd)

## Sediment size bin dimension reduction

In [9]:
# PCA on the sediment size bin data.
# sed_pca returns three objects, stored as sedpc, sedexp and sedload
# (presumably PC scores, explained variance and loadings — TODO confirm against pca.py).

sedpc, sedexp, sedload = sed_pca(sed_sdd)

# To see a loadings plot use the lines below.
# NOTE(review): `sed_size_freqs` is not defined in this notebook — presumably an earlier name of sed_sdd; verify before use.
# sedload_df = pd.DataFrame(sedload, columns=['PC1', 'PC2', 'PC3'], index = np.unique(np.concatenate(np.char.split(sed_size_freqs.index.values.astype(str), '_')).ravel().astype(int))[:-1]).T
# sedload_df.T.plot()  # x-axis corresponds to lower boundary of size bins

In [10]:
# PCoA on the sediment size bin data, keeping two coordinates.

sedpco = sed_pcoa(sed_sdd, num_coords = 2)

# Scatter the first principal coordinate ('PC1') against the 'smaller63' column, coloured by D50.
# The coordinates are joined to the sample table via the sedpco index and the 'Sample' column.
# NOTE(review): this used to be labelled "Plot PCo1 and PCo2" — switch y to 'PC2' if that plot is what is wanted.
alt.Chart(sedpco.merge(mp_added_sed_sdd, left_index=True, right_on='Sample').reset_index()).mark_point().encode(
    x='PC1',
    y='smaller63',
    color='D50',
    tooltip='index'
)

  warn(


Proportion explained: 
 PC1    0.784746
PC2    0.113954
dtype: float64    Total: exp.sum 0.8987002342375902


## Modelling

In [11]:
# Create a generalized linear model (GLM) to predict the microplastic concentration from the predictors

# Model input: the sample table with the PCoA coordinates joined on ('Sample' column matched to the sedpco index).
glm_input = mp_added_sed_sdd.merge(sedpco, left_on='Sample', right_index=True)

# Fit on all samples and show the regression summary.
glm_res = glm.glm(glm_input)
print(glm_res.summary())
# Quick check: prediction for the first row of the model input (shown as the cell output).
glm_res.predict(glm_input.iloc[[0]])


<statsmodels.genmod.families.family.Gamma object at 0x7fe466edfd00>
                 Generalized Linear Model Regression Results                  
Dep. Variable:          Concentration   No. Observations:                   31
Model:                            GLM   Df Residuals:                       28
Model Family:                   Gamma   Df Model:                            2
Link Function:          inverse_power   Scale:                          1.7661
Method:                          IRLS   Log-Likelihood:                -279.02
Date:                Sat, 18 Dec 2021   Deviance:                       42.195
Time:                        22:28:35   Pearson chi2:                     49.5
No. Iterations:                     8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------



0    2453.555881
dtype: float64

In [17]:
from statsmodels.sandbox.tools.cross_val import LeaveOneOut, split

In [22]:
# Leave-one-out cross-validation of the GLM: refit the model once per sample on
# all remaining samples and predict the concentration of the sample left out.
#
# The splitter has to be sized to the frame that is actually indexed (glm_input).
# The merge that builds glm_input can drop samples, so sizing it from
# mp_added_sed_sdd could yield masks of the wrong length.
# assumes LeaveOneOut yields positional boolean train/test masks of length n — TODO confirm
loo = LeaveOneOut(glm_input.shape[0])
loo_records = []
for train_index, test_index in loo:
    # this could be used for doing loo-cross-val, if using an array-based instead of a formula-and-dataframe based model:
    # X_train, X_test, y_train, y_test = split(train_index, test_index, X, y)

    # Separate name on purpose: the model fitted on all samples (glm_res) must not be
    # silently replaced by the model of the last fold.
    fold_res = glm.glm(glm_input.loc[train_index, :])
    held_out = glm_input.loc[test_index, :]
    loo_records.append(pd.DataFrame({
        'Sample': held_out['Sample'].to_numpy(),
        'observed': held_out['Concentration'].to_numpy(),
        'predicted': np.asarray(fold_res.predict(held_out)),
    }))

# One row per left-out sample; build the frame once instead of growing it in the loop.
loo_pred = (pd.concat(loo_records, ignore_index=True) if loo_records
            else pd.DataFrame(columns=['Sample', 'observed', 'predicted']))

# Root mean squared error of the left-out predictions (NaN if there were no folds).
loo_rmse = float(np.sqrt(np.mean((loo_pred['predicted'] - loo_pred['observed']).astype(float) ** 2)))
print(f'LOO-CV RMSE: {loo_rmse:.2f}')
loo_pred

## MP and sediment size range combination correlations

In [None]:
# Sum abundances in size bins for all possible combinations. Basically create all possible rebinnings and stack them into one DF.
# NOTE(review): the MP result is transposed back (.T) while the sediment result is not, so MPext and SEDext
# may end up with different orientations — confirm which one the cells below (np.corrcoef, merge_size_ranges) expect.
MPext = prepare_data.combination_sums(mp_size_conc.copy().T).T  # TODO: using transposed df's here, because combination_sums is not yet turned around: it takes features in rows and samples in columns...
SEDext= prepare_data.combination_sums(sed_sdd.copy().T)

### Altair size range correlation (MP~Sed)

In [None]:
# Merge the extended (all size range combinations) MP and sediment tables into one frame for plotting.
# The labels 'MP' and 'SED' match the field names used by the chart below (MP, SED, lower_MP, upper_SED, ...).
# cart_prod=True presumably pairs every MP size range with every sediment size range — TODO confirm in prepare_data.
MPsedExt = prepare_data.merge_size_ranges(MPext, 'MP', SEDext, 'SED', cart_prod=True)

In [None]:
# Interactive MP~sediment scatter plot per size range.
# Layout: the MP size profile on the left (brush along y selects the MP size range), the sediment
# size profile at the bottom (brush along x selects the sediment size range), and in the centre
# the scatter of MP against SED for the brushed ranges with a linear regression line and its
# parameters as text.
brush1 = alt.selection_interval(name="brush1", encodings=['y'])
brush2 = alt.selection_interval(name="brush2", encodings=['x'])

# Distance between the first two lower bin boundaries; used to snap the brush extents
# outwards to bin edges (assumes equidistant bins).
step = sed_lower_boundaries[1] - sed_lower_boundaries[0]

# NOTE(review): Vega-Lite appears to key the interval selection signal by *field* name
# (here lower_MP / lower_SED), not by channel. Confirm that `brush1.y` / `brush2.x` are ever
# defined; otherwise the fallback value 1 is always used for b1l/b1u/b2l/b2u.
scatter = alt.Chart(MPsedExt
).transform_calculate(
    b1l=f'floor((isDefined(brush1.y) ? (brush1.y[0][0]) : 1) / {step}) * {step}',
    b1u=f'ceil((isDefined(brush1.y) ? (brush1.y[0][1]) : 1) / {step}) * {step}',
    b2l=f'floor((isDefined(brush2.x) ? (brush2.x[0][0]) : 1) / {step}) * {step}',
    b2u=f'ceil((isDefined(brush2.x) ? (brush2.x[0][1]) : 1) / {step}) * {step}',
).mark_point().encode(
    x = 'SED',
    y = 'MP',
    tooltip = 'sample:N'
).transform_filter(
    # keep only size ranges that lie completely inside both brushed intervals
    '(datum.lower_MP >= datum.b1l) &&'
    '(datum.upper_MP <= datum.b1u) &&'
    '(datum.lower_SED >= datum.b2l) &&'
    '(datum.upper_SED <= datum.b2u)'
)


# Linear regression line through the currently filtered points
RegLine = scatter.transform_regression(
    'SED', 'MP', method="linear",
).mark_line(
    color="red"
)


# Regression parameters (r², slope, intercept) rendered as text at a fixed pixel position
RegParams = scatter.transform_regression(
    'SED', 'MP', method="linear", params=True
).mark_text(align='left', lineBreak='\n').encode(
    x=alt.value(120),  # pixels from left
    y=alt.value(20),  # pixels from top
    text='params:N'
).transform_calculate(
    params='"r² = " + round(datum.rSquared * 100)/100 + \
    "      y = " + round(datum.coef[1] * 100)/100 + "x" + " + " + round(datum.coef[0] * 10)/10'
)


# MP profile: mean MP per lower size boundary, restricted to rows where the MP and sediment
# size ranges coincide. Carries the y-brush.
MP = alt.Chart(MPsedExt).mark_line().encode(
    x = 'mean(MP)',
    y = alt.Y('lower_MP', scale=alt.Scale(type='linear'))  # was alt.X: the y channel belongs in alt.Y
).transform_filter(
    '(datum.lower_MP == datum.lower_SED) && \
     (datum.upper_MP == datum.upper_SED)'
).add_selection(
    brush1
).properties(
    height = 300,
    width = 100
)


# Sediment profile: mean SED per lower size boundary on the same subset. Carries the x-brush.
sed = alt.Chart(MPsedExt).mark_line().encode(
    x = alt.X('lower_SED', scale=alt.Scale(type='linear')),
    y = 'mean(SED)'
).transform_filter(
    '(datum.lower_MP == datum.lower_SED) &&'
    '(datum.upper_MP == datum.upper_SED)'
).add_selection(
    brush2
).properties(
    height = 100,
    width = 400
)


# `&` binds tighter than `|`; the parentheses only make the existing grouping explicit:
# MP profile on the left, (scatter layers above the sediment profile) on the right.
MP | ((scatter + RegLine + RegParams) & sed)

# chart = MP | scatter & sed
# chart#.save('chart.json')

### Size range correlation matrix (MP~Sed)

In [None]:
# Calculate a correlation matrix containing Pearson correlation coefficients for all combinations
# of any original or summed bins of MP and sediments.
# np.corrcoef(x, y) stacks the rows of y below the rows of x, so the result is a square matrix of
# size (n_mp + n_sed). The MP~SED block is therefore rows [:n_mp] and columns [n_mp:].
# Slicing the columns at len(SEDext) was only correct when both inputs had the same number of rows.
n_mp = len(MPext)
corrMat = np.corrcoef(MPext, SEDext)
corrMat = corrMat[:n_mp, n_mp:]  # only take upper right quadrant of correlation matrix

corrMatDF = pd.DataFrame(corrMat, index=MPext.index, columns=SEDext.index)  # turn np array into df
# corrMatDF.rename('MP_{}'.format, axis=0, inplace=True)  # add a prefix for 'MP' to each row label
# corrMatDF.rename('SED_{}'.format, axis=1, inplace=True)  # add a prefix for 'sediment' to each column label
corrMatDF.index.name = 'MP'
corrMatDF.columns.name = 'SED'

### Cross-Correlation

In [None]:
# Run a cross-correlation for each sample between MP and sediment size distribution curve shapes:
# the sediment curve is shifted by each lag and rank-correlated (Spearman) with the MP curve;
# the lag(s) with the highest coefficient are kept per sample.
# NOTE(review): df_range_conc is not defined in any visible cell of this notebook — it has to
# exist in the kernel (one column per sample, same bins as sed_sdd) before this cell can run.
lags = range(-50, 0)
# lags = range(-int(len(datax)/2), int(len(datax)/2)+1)  # alternative: symmetric lag window
best_rows = []

for label, content in df_range_conc.items():
    datax = np.asarray(content)
    datay = sed_sdd[label]
    # best = correlations.crosscorr(datax, datay)

    df_r = pd.DataFrame(lags, columns=['shifted'])

    # spearmanr returns (correlation, p-value); keep only the coefficient so that the column is
    # numeric and .max() compares coefficients rather than result tuples.
    df_r['spearman_r'] = [
        stats.spearmanr(datax, np.asarray(datay.shift(lag)), nan_policy='omit')[0]
        for lag in lags
    ]

    best = df_r.loc[df_r['spearman_r'] == df_r['spearman_r'].max()].copy()
    best['Sample'] = label
    best_rows.append(best)

# Concatenate once: DataFrame.append in a loop is quadratic and no longer exists in pandas >= 2.0.
crosscorr_results = pd.concat(best_rows) if best_rows else pd.DataFrame()

## MP-to-Scalar correlation (e.g. TOC)

In [None]:
# One scatter panel per scalar predictor: the MP concentration (y) against each of the columns
# listed in `repeat`, laid out side by side and made zoomable/pannable.
# NOTE(review): the colour field is spelled 'Regio_Sep' here, while the 3D scatter further down
# uses 'regio_sep' — confirm which spelling matches the column in mp_added_sed_sdd.
(
    alt.Chart(mp_added_sed_sdd)
    .mark_point()
    .encode(
        x=alt.X(alt.repeat("column"), type='quantitative'),
        y=alt.Y('Concentration'),
        color=alt.Color('Regio_Sep'),
        tooltip=alt.Tooltip('Sample'),
    )
    .repeat(column=['D50', 'TOC', 'Dist_WWTP', 'Mass', 'Split'])
    .interactive()
)

In [13]:
import plotly.express as px
# 3D scatter of the MP concentration over distance to the WWTP and D50;
# colour encodes TOC, the marker symbol encodes the 'regio_sep' column.
fig = px.scatter_3d(
    mp_added_sed_sdd,
    x='Dist_WWTP',
    y='D50',
    z='Concentration',
    color='TOC',
    symbol='regio_sep',
    hover_name="Sample",
    color_continuous_scale=px.colors.sequential.turbid_r,
)

# Smaller markers with a dark outline (applied to marker traces only)
fig.update_traces(
    marker={'size': 4, 'line': {'width': 2, 'color': 'DarkSlateGrey'}},
    selector={'mode': 'markers'},
)

# Fixed figure size; colour bar moved to the top left
fig.update_layout(
    autosize=False,
    width=1000,
    height=800,
    coloraxis_colorbar={'yanchor': "top", 'y': 1, 'x': 0, 'ticks': "outside"},
)
fig.show()

In [None]:
# Calculate correlation between all size bins (or their summed up combinations) of MP abundances and a scalar predictor.
# Possible predictors are: 'TOC', 'Hg', 'Dist_WWTP' or different sediment grain size properties (D50, below 63, etc.). See column labels of mp_added_sed_sdd for a full list.
# NOTE(review): df_range_conc is not defined in any visible cell of this notebook — it has to exist in the kernel before this cell can run.
bestLower, bestUpper, df_r = correlations.predictorcorr(df_range_conc, mp_added_sed_sdd.set_index('Sample'), 'TOC')

In [None]:
# Correlation coefficients from the single predictor correlation, drawn as one point per
# (lower_size, upper_size) size range and coloured by r.
# The colour scale domain is fixed to r in [0.5, 0.6].
alt.renderers.enable('html')

(
    alt.Chart(df_r)
    .mark_point()
    .encode(
        x=alt.X('lower_size'),
        y=alt.Y('upper_size'),
        color=alt.Color("r", scale=alt.Scale(domain=[0.5, 0.6])),
        tooltip=['r', 'lower_size', 'upper_size'],
    )
    .properties(width=800, height=800)
)

In [None]:
# Scratch cell, only used for testing the external function (correlations.predictorcorr) inside the notebook.
# It correlates TOC with the MP concentration in a single size range [i, j).
# NOTE: `step` and `df_r` are rebound here; `step` was the sediment bin width in the size range
# chart cell, so re-run that cell before using the chart again.
step = (Config.upper_size_limit - Config.lower_size_limit) / Config.kde_steps  # width of one KDE evaluation step
df_r = pd.DataFrame(columns=['lower_size', 'upper_size', 'r', 'p'])

for i in [10]:  # lower size boundary of the tested range
    for j in [1000]:  # upper size boundary of the tested range
        # sum the probability densities of all KDE support points inside [i, j) per sample
        size_sum = size_pdfs.loc[(size_pdfs.x_d >= i) & (size_pdfs.x_d < j)].sum()
        size_sum.drop('x_d', inplace=True)  # x_d is the size axis, not a sample
        range_prob = size_sum * step  # density sum times step width: probability of the size range
        range_conc = range_prob * mp_added_sed_sdd.set_index('Sample').Concentration  # aligned on the sample labels

        r = stats.pearsonr(range_conc, mp_added_sed_sdd.set_index('Sample').TOC)  # (coefficient, p-value)
        df_r.loc[len(df_r)] = [i, j, r[0], r[1]]
        print(f'Correlating TOC with size range            [{i},        {j}]                ', end="\r", flush=True)

# Report the size range with the highest correlation coefficient and keep its boundaries
print(df_r.loc[df_r.r == df_r.r.max()])
bestLower, bestUpper = df_r.loc[df_r.r == df_r.r.max()].iloc[0, 0:2]

In [None]:
# Property-property plot of the MP concentration in the tested size range (range_conc) against a
# single predictor (TOC), including a linear regression line and its parameters.
# range_conc carries no name; the chart addresses its column as 'Unnamed 0'.
kd = alt.Chart(pd.DataFrame([range_conc, mp_added_sed_sdd.set_index('Sample').TOC]).T.reset_index()).mark_point().encode(
    x='TOC',
    y='Unnamed 0',
    color='index',
    tooltip='index'
)

Reg_Line = kd.transform_regression(
    'TOC', 'Unnamed 0', method="linear",
).mark_line(color="red")

# The fit is linear, so the label has to read "y = slope * x + intercept".
# It used to be formatted as "a + e ^ (bx)" and contained a raw line break that made the
# Vega expression invalid.
Reg_Params = kd.transform_regression(
    'TOC', 'Unnamed 0', method="linear", params=True
).mark_text(align='left', lineBreak='\n').encode(
    x=alt.value(120),  # pixels from left
    y=alt.value(20),  # pixels from top
    text='params:N'
).transform_calculate(
    params='"r² = " + round(datum.rSquared * 100)/100 + '
           '"      y = " + round(datum.coef[1] * 10000)/10000 + "x" + " + " + round(datum.coef[0] * 10)/10'
)

kd + Reg_Line + Reg_Params

In [None]:
# Bring sediment and MP size distributions into long format (one row per size bin and sample)
# and stack them into one frame with a 'category' column telling them apart.
# 'size' is the part of the bin label before the first '_' (bin labels look like '<lower>_<upper>').
# NOTE(review): df_range_conc is not defined in any visible cell of this notebook. The chart cells
# below also expect a 'sample' column, which melt only produces if the column index is named 'sample'.
daf = sed_sdd.reset_index().melt(id_vars='index', value_name='value')
daf['size'] = daf['index'].str.split('_').str[0]
daf['category'] = 'sediment'

daff = df_range_conc.reset_index().melt(id_vars='index', value_name='value')
# Derive the size from daff's own bin labels. It used to be read from daf (copy-paste slip), which
# only gave the right sizes when both frames happened to be row-aligned.
# astype(str) keeps this working if the MP bin labels are numeric.
daff['size'] = daff['index'].astype(str).str.split('_').str[0]
daff['category'] = 'MP'

df = pd.concat([daf, daff]).drop(columns=['index'])
# df = daf.merge(daff, on =['sample', 'size', 'index']).drop(columns=['index'])
# df = daf.merge(daff, on =['sample', 'size', 'index']).drop(columns=['index'])

In [None]:
# MP and sediment size distributions of one sample, chosen via a dropdown.
# Both curves share the x axis (size) but get independent y scales.
input_dropdown = alt.binding_select(
    options=list(df['sample'].unique()),
    name='select sample ',
)
selection = alt.selection_single(
    fields=['sample'],
    bind=input_dropdown,
    # init={'select_': 'Schlei_S8'}
)

# Common base: value over size, restricted to sizes between 50 and 990
base = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x='size:Q',
        y='value:Q',
        color='category',
    )
    .transform_filter(alt.FieldRangePredicate(field='size', range=[50, 990]))
)

# One layer per category
mps = base.mark_line(color='blue').transform_filter(
    alt.FieldEqualPredicate(field='category', equal='MP')
)
seds = base.mark_line(color='yellow').transform_filter(
    alt.FieldEqualPredicate(field='category', equal='sediment')
)

# Layer both curves, attach the dropdown and filter to the selected sample
(
    alt.layer(mps, seds)
    .resolve_scale(y='independent')
    .add_selection(selection)
    .transform_filter(selection)
)


In [None]:
# Small multiples (one facet per sample, 6 per row) with the MP and sediment size curves layered
# on independent y axes.
# NOTE(review): the layers encode the fields 'MP' and 'sediment', but df as built by the concat
# cell above is in long format (columns 'value' and 'category'). This cell presumably needs a wide
# frame with one 'MP' and one 'sediment' column per (sample, size) — confirm before use.
# NOTE: base, mps and seds from the previous chart cell are rebound here.
base = alt.Chart(df).encode(
    alt.X('size:Q', axis=alt.Axis(title=None))
).properties(
    width=180,
    height=180
)

# MP curve; axis title coloured like the line
mps = base.mark_line(stroke='#57A44C', interpolate='monotone').encode(
    alt.Y('MP:Q', axis=alt.Axis(title='MP', titleColor='#57A44C'))
)

# Sediment curve; axis title coloured like the line
seds = base.mark_line(stroke='#5276A7', interpolate='monotone').encode(
    alt.Y('sediment:Q', axis=alt.Axis(title='sediment', titleColor='#5276A7'))
)

# The first resolve_scale decouples the two layers' y axes, the second one decouples the y axes
# between facets.
alt.layer(mps, seds).resolve_scale(
    y = 'independent'
).properties(
    width=180,
    height=180
).facet(
    facet='sample',
    columns=6
).resolve_scale(
    y='independent'
)

In [None]:
# Scatter of sediment against MP values per sample (one facet each, independent axes), coloured by size.
# daf.shift(100) moves every row of the sediment table (keys included) down by 100 positions
# before the merge. After the merge, value_x comes from the sediment table (daf) and value_y from
# the MP table (daff).
# NOTE: df is rebound here, so the chart cells above need the concat cell re-run before they are
# executed again.
df = daf.shift(100).merge(daff, on =['sample', 'size', 'index']).drop(columns=['index'])
alt.Chart(df).mark_point().encode(
    x=alt.X('value_x', scale=alt.Scale(type='linear')),
    y=alt.Y('value_y', scale=alt.Scale(type='linear')),  # was alt.X: the y channel belongs in alt.Y
    color=alt.Color('size:Q', scale=alt.Scale(scheme="viridis"))
).properties(
    width=150,
    height=150
).facet(
    facet='sample',
    columns=6
).resolve_scale(
    y='independent',
    x='independent'
)

In [None]:
# Minimal example: a line whose y values are multiplied by the width of an interval brush
# along x (factor 1 while nothing is brushed).
data = pd.DataFrame({
    'x': list(range(9)),
    'y': [1, 2, 3, 4, 5, 4, 3, 2, 1],
})

brush = alt.selection_interval(name="brush", encodings=['x'])

(
    alt.Chart(data)
    .mark_line()
    .encode(
        x=alt.X('x:Q'),
        y=alt.Y('scaled_by_brush_width:Q'),
    )
    .transform_calculate(
        scaled_by_brush_width='datum.y * (isDefined(brush.x) ? (brush.x[1] - brush.x[0]) : 1)'
    )
    .add_selection(brush)
)