In [280]:
import pandas as pd
import altair as alt
import numpy as np
import scipy.stats as stats

In [281]:
def prepare_df(df, protein_id='sp|P01024|CO3_HUMAN'):
    """Melt the link columns of a wide frame and attach link identifiers.

    Only columns whose name contains ``'MonoHydro_'`` or ``'XL_'`` are kept.
    They are melted to long format (``variable`` / ``value``), square
    brackets are removed from ``variable`` and the cleaned name is split on
    ``'_'``: part 0 becomes ``link_type`` and parts 2 and 4 are used as the
    site labels of the identifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding one column per link.
    protein_id : str, optional
        Prefix written in front of every site label. The default is the
        value that used to be hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        Long frame with the columns ``variable``, ``value``, ``link_type``,
        ``uID`` and ``uID_rev``. For rows with ``link_type == 'XL'`` the
        ``uID`` is ``'<protein_id>:<part 2>:x:<protein_id>:<part 4>'`` and
        ``uID_rev`` has the two parts swapped; for every other row both
        columns are ``'<protein_id>:<part 2>'``.
    """
    link_columns = [col for col in df.columns if 'MonoHydro_' in col or 'XL_' in col]
    df = pd.melt(df[link_columns])
    # Raw string: '\[' is not a valid escape sequence in a normal literal.
    df['variable'] = df['variable'].str.replace(r'\[|\]', '', regex=True)
    split = df['variable'].str.split("_", expand=True)
    # A frame without cross-link columns splits into fewer than five parts.
    # Pad with all-null columns so the positional lookups below cannot raise
    # a KeyError; the padded values are only ever used on the XL branch.
    for position in range(5):
        if position not in split.columns:
            split[position] = pd.Series(np.nan, index=split.index, dtype=object)
    df["link_type"] = split[0]
    is_xl = df['link_type'] == 'XL'
    first_site = protein_id + ':' + split[2]
    second_site = protein_id + ':' + split[4]
    # Both orientations are stored so callers can match an external table
    # regardless of the order in which it lists the two sites.
    df['uID'] = np.where(is_xl, first_site + ':x:' + second_site, first_site)
    df['uID_rev'] = np.where(is_xl, second_site + ':x:' + first_site, first_site)
    return df


In [282]:
def get_log2_df(df_c3, df_c3b):
    """Join two long frames link by link and add ``log2(value_c3b / value_c3)``.

    The frames are inner-joined on ``variable``, ``link_type``, ``uID`` and
    ``uID_rev``; every other shared column is suffixed with ``_c3`` or
    ``_c3b``. Infinite entries are converted to NaN, rows that contain a NaN
    in any column are dropped, and the index is renumbered from zero.
    """
    join_keys = ['variable', 'link_type', 'uID', 'uID_rev']
    merged = df_c3.merge(df_c3b, on=join_keys, suffixes=['_c3', '_c3b'])
    ratio = merged['value_c3b'] / merged['value_c3']
    merged['log2ratio'] = np.log2(ratio)
    # Zero values on either side produce +/-inf; treat those like missing data.
    cleaned = merged.replace([np.inf, -np.inf], np.nan).dropna()
    return cleaned.reset_index(drop=True)


In [283]:
def get_delta_dist(x, ref_exp='c3', exp='c3b', metric='SASD'):
    """Return ``metric`` of ``exp`` minus ``metric`` of ``ref_exp`` for one group.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must provide an ``exp_name`` column and the
        ``metric`` column.
    ref_exp : str, optional
        ``exp_name`` value of the reference row.
    exp : str, optional
        ``exp_name`` value of the row compared against the reference.
    metric : str, optional
        Name of the column whose difference is returned.

    Returns
    -------
    scalar or None
        ``x[metric]`` of the ``exp`` row minus that of the ``ref_exp`` row.
        ``None`` is returned when the group does not have exactly two rows
        or when either experiment is missing from it.
    """
    if len(x) != 2:
        return None
    ref_values = x.loc[x['exp_name'] == ref_exp, metric].values
    exp_values = x.loc[x['exp_name'] == exp, metric].values
    # Two rows of the same experiment used to raise IndexError here, which
    # aborted the whole groupby.apply; such groups are now skipped instead.
    if ref_values.size == 0 or exp_values.size == 0:
        return None
    return exp_values[0] - ref_values[0]

In [320]:
# Root of the local project checkout. Every absolute input path below is
# derived from it, so only this value has to change on another machine.
PROJECT_DIR = '/home/kai/Projects/c3_v2'

# Per-state input tables, read relative to the notebook's working directory.
df_c3 = pd.read_csv('c3_final_frame_asa.csv')
df_c3b = pd.read_csv('c3b_final_frame_asa.csv')
# Table with the columns used for the distance comparison further down.
df_dist = pd.read_csv(PROJECT_DIR + '/xlink_energies/energy_dist_combined.csv')
# Alternative df_xtract source, kept for reference (currently disabled):
#df_xtract = pd.read_csv(PROJECT_DIR + '/xtract_results_ld28/KK_c3_v2.analyzer.quant.xls', delimiter='\t').rename(columns={'type': 'link_type'})
#df_xtract =df_xtract[df_xtract['sign'] == '==']
df_xtract = pd.read_csv(PROJECT_DIR + '/xtract_results_ld28/xtract_out_from_bagcontainer_sky.csv')

In [321]:
# Bring both state tables into the long format produced by prepare_df.
df_c3_melt, df_c3b_melt = (prepare_df(frame) for frame in (df_c3, df_c3b))

In [322]:
# Label every row with the state it came from, then stack the two frames.
df_c3_melt = df_c3_melt.assign(exp='c3')
df_c3b_melt = df_c3b_melt.assign(exp='c3b')
# No ignore_index: each frame keeps its own row labels in the result.
df_concat = pd.concat([df_c3_melt, df_c3b_melt])

In [323]:
df_concat.head()

Unnamed: 0,variable,value,link_type,uID,uID_rev,exp
0,MonoHydro_C3_65,0.602385,MonoHydro,sp|P01024|CO3_HUMAN:65,sp|P01024|CO3_HUMAN:65,c3
1,MonoHydro_C3_66,0.523816,MonoHydro,sp|P01024|CO3_HUMAN:66,sp|P01024|CO3_HUMAN:66,c3
2,MonoHydro_C3_73,0.787114,MonoHydro,sp|P01024|CO3_HUMAN:73,sp|P01024|CO3_HUMAN:73,c3
3,MonoHydro_C3_97,1.0,MonoHydro,sp|P01024|CO3_HUMAN:97,sp|P01024|CO3_HUMAN:97,c3
4,MonoHydro_C3_100,1.0,MonoHydro,sp|P01024|CO3_HUMAN:100,sp|P01024|CO3_HUMAN:100,c3


In [324]:
# Stacked bars of `value` per link, coloured by state; one facet row per
# link_type, each with its own x and y scale.
(
    alt.Chart(df_concat)
    .mark_bar()
    .encode(
        x=alt.X('variable'),
        y=alt.Y('value', stack='zero'),
        row=alt.Row('link_type'),
        color=alt.Color('exp'),
    )
    .resolve_scale(x='independent', y='independent')
)

In [325]:
# Join the two per-state long frames and compute log2(value_c3b / value_c3)
# for every link. 'exp' is not one of the join keys, so it comes back as the
# suffixed columns exp_c3 / exp_c3b.
df_merge = get_log2_df(df_c3_melt, df_c3b_melt)
df_merge.head()

Unnamed: 0,variable,value_c3,link_type,uID,uID_rev,exp_c3,value_c3b,exp_c3b,log2ratio
0,MonoHydro_C3_65,0.602385,MonoHydro,sp|P01024|CO3_HUMAN:65,sp|P01024|CO3_HUMAN:65,c3,0.592872,c3b,-0.02296551
1,MonoHydro_C3_66,0.523816,MonoHydro,sp|P01024|CO3_HUMAN:66,sp|P01024|CO3_HUMAN:66,c3,0.208406,c3b,-1.329663
2,MonoHydro_C3_73,0.787114,MonoHydro,sp|P01024|CO3_HUMAN:73,sp|P01024|CO3_HUMAN:73,c3,1.0,c3b,0.3453548
3,MonoHydro_C3_97,1.0,MonoHydro,sp|P01024|CO3_HUMAN:97,sp|P01024|CO3_HUMAN:97,c3,1.0,c3b,5.857978e-11
4,MonoHydro_C3_100,1.0,MonoHydro,sp|P01024|CO3_HUMAN:100,sp|P01024|CO3_HUMAN:100,c3,1.0,c3b,5.857946e-11


In [10]:
# NOTE(review): this cell is identical to the cell that follows it and has an
# out-of-sequence execution count; one of the two can presumably be removed.
# Bars of log2ratio for the links with log2ratio above 1 or below -1, one
# facet row per link_type with independent x and y scales.
alt.Chart(df_merge[(df_merge['log2ratio'] > 1) | (df_merge['log2ratio'] < -1)]).mark_bar().encode(
    x='variable',
    y=alt.Y('log2ratio', stack='zero'),
    row='link_type',
).resolve_scale(x='independent', y='independent')

In [326]:
# Show only links whose log2ratio lies outside [-1, 1], i.e. more than a
# two-fold change in either direction; one facet row per link_type.
(
    alt.Chart(df_merge[df_merge['log2ratio'].abs() > 1])
    .mark_bar()
    .encode(
        x=alt.X('variable'),
        y=alt.Y('log2ratio', stack='zero'),
        row=alt.Row('link_type'),
    )
    .resolve_scale(x='independent', y='independent')
)

In [126]:
df_merge

Unnamed: 0,variable,value_c3,link_type,exp_c3,value_c3b,exp_c3b,log2ratio,uID,uID_rev
0,MonoHydro_C3_65,0.549918,MonoHydro,c3,0.556052,c3b,1.600443e-02,sp|P01024|CO3_HUMAN:65,sp|P01024|CO3_HUMAN:65
1,MonoHydro_C3_66,0.503554,MonoHydro,c3,0.136081,c3b,-1.887680e+00,sp|P01024|CO3_HUMAN:66,sp|P01024|CO3_HUMAN:66
2,MonoHydro_C3_73,0.746094,MonoHydro,c3,1.000000,c3b,4.225709e-01,sp|P01024|CO3_HUMAN:73,sp|P01024|CO3_HUMAN:73
3,MonoHydro_C3_97,1.000000,MonoHydro,c3,1.000000,c3b,-1.541377e-11,sp|P01024|CO3_HUMAN:97,sp|P01024|CO3_HUMAN:97
4,MonoHydro_C3_100,1.000000,MonoHydro,c3,1.000000,c3b,-1.692771e-11,sp|P01024|CO3_HUMAN:100,sp|P01024|CO3_HUMAN:100
...,...,...,...,...,...,...,...,...,...
159,XL_C3_1522_C3_1535,0.238768,XL,c3,0.135111,c3b,-8.214656e-01,sp|P01024|CO3_HUMAN:1522:x:sp|P01024|CO3_HUMAN...,sp|P01024|CO3_HUMAN:1535:x:sp|P01024|CO3_HUMAN...
160,XL_C3_1522_C3_1595,0.247315,XL,c3,0.112652,c3b,-1.134482e+00,sp|P01024|CO3_HUMAN:1522:x:sp|P01024|CO3_HUMAN...,sp|P01024|CO3_HUMAN:1595:x:sp|P01024|CO3_HUMAN...
161,XL_C3_1526_C3_1535,0.317638,XL,c3,0.320088,c3b,1.108789e-02,sp|P01024|CO3_HUMAN:1526:x:sp|P01024|CO3_HUMAN...,sp|P01024|CO3_HUMAN:1535:x:sp|P01024|CO3_HUMAN...
162,XL_C3_1551_C3_1599,0.566580,XL,c3,0.594380,c3b,6.910656e-02,sp|P01024|CO3_HUMAN:1551:x:sp|P01024|CO3_HUMAN...,sp|P01024|CO3_HUMAN:1599:x:sp|P01024|CO3_HUMAN...


In [233]:
df_xtract.head()

Unnamed: 0,uID,experiment,link_type,ms1_area_sum,ms1_area_sum_ref,log2ratio,log2avg,referenceexperiment,pvalue,FDR
0,sp|P01024|CO3_HUMAN:1036,c3b,monolink,22.996,32.5336,-9.53754,27.7648,c3,1.3e-05,7e-05
1,sp|P01024|CO3_HUMAN:1041:x:sp|P01024|CO3_HUMAN...,c3b,xlink,32.5444,31.4862,1.05819,32.0153,c3,0.000404,0.001121
2,sp|P01024|CO3_HUMAN:1041:x:sp|P01024|CO3_HUMAN...,c3b,xlink,23.7509,22.5223,1.22864,23.1366,c3,0.018367,0.034208
3,sp|P01024|CO3_HUMAN:104:x:sp|P01024|CO3_HUMAN:66,c3b,xlink,26.792,28.4879,-1.6959,27.6399,c3,1.9e-05,9.1e-05
4,sp|P01024|CO3_HUMAN:104:x:sp|P01024|CO3_HUMAN:97,c3b,xlink,24.9138,23.476,1.43778,24.1949,c3,0.044861,0.076832


In [212]:
# Match simulated ratios to the xtract ratios by identifier. The join is done
# once on uID and once on uID_rev (renamed to uID) so a cross-link is found
# whichever site order df_xtract uses; the two results are then combined.
dfm = df_merge[['uID', 'log2ratio']].merge(
    df_xtract[['uID', 'log2ratio', 'link_type']],
    on=['uID'],
    suffixes=['_sim', '_exp'],
)
dfm_rev = (
    df_merge[['uID_rev', 'log2ratio']]
    .rename(columns={'uID_rev': 'uID'})
    .merge(
        df_xtract[['uID', 'log2ratio', 'link_type']],
        on=['uID'],
        suffixes=['_sim', '_exp'],
    )
)
# Rows matched identically by both joins appear twice; keep one copy.
dfm = pd.concat([dfm, dfm_rev]).drop_duplicates().reset_index(drop=True)

In [215]:
dfm.head()

Unnamed: 0,uID,log2ratio_sim,log2ratio_exp,link_type
0,sp|P01024|CO3_HUMAN:65,0.01600443,0.196816,monolink
1,sp|P01024|CO3_HUMAN:66,-1.88768,0.204363,monolink
2,sp|P01024|CO3_HUMAN:104,-0.2706743,-0.186791,monolink
3,sp|P01024|CO3_HUMAN:155,0.2421696,-0.718886,monolink
4,sp|P01024|CO3_HUMAN:176,-1.693491e-11,-1.305498,monolink


In [216]:
# Long format with one row per (link, ratio source) for the grouped bar chart.
dfmm = dfm.melt(
    id_vars=['uID', 'link_type'],
    value_vars=['log2ratio_exp', 'log2ratio_sim'],
)

In [217]:
dfmm.head()

Unnamed: 0,uID,link_type,variable,value
0,sp|P01024|CO3_HUMAN:65,monolink,log2ratio_exp,0.196816
1,sp|P01024|CO3_HUMAN:66,monolink,log2ratio_exp,0.204363
2,sp|P01024|CO3_HUMAN:104,monolink,log2ratio_exp,-0.186791
3,sp|P01024|CO3_HUMAN:155,monolink,log2ratio_exp,-0.718886
4,sp|P01024|CO3_HUMAN:176,monolink,log2ratio_exp,-1.305498


In [218]:
# Stacked bars of both ratio columns per link, coloured by ratio source; one
# facet row per link_type with independent x and y scales.
(
    alt.Chart(dfmm)
    .mark_bar()
    .encode(
        x=alt.X('uID'),
        y=alt.Y('value', stack='zero'),
        color=alt.Color('variable'),
        row=alt.Row('link_type'),
    )
    .resolve_scale(x='independent', y='independent')
)

In [275]:
# Monolinks are kept only when |log2ratio_sim| exceeds 0.5; every cross-link
# is kept.
fil_mono = (dfm['link_type'] == 'monolink') & (dfm['log2ratio_sim'].abs() > 0.5)
fil_xl = dfm['link_type'] == 'xlink'
# Scatter of experimental against simulated ratio with a regression line
# layered on top, faceted into one row per link_type.
c = alt.Chart(dfm[fil_mono | fil_xl]).mark_point().encode(
    x=alt.X('log2ratio_sim'),
    y=alt.Y('log2ratio_exp'),
)
c = alt.layer(c, c.transform_regression('log2ratio_sim', 'log2ratio_exp').mark_line())
c.facet(row='link_type').resolve_scale(x='independent', y='independent')


In [276]:
# Linear fit of experimental on simulated log2 ratio for the filtered monolinks.
stats.linregress(
    dfm.loc[fil_mono, 'log2ratio_sim'],
    dfm.loc[fil_mono, 'log2ratio_exp'],
)

LinregressResult(slope=-0.7430302706721233, intercept=-2.311760594901846, rvalue=-0.24151285778388792, pvalue=0.15588651916281102, stderr=0.5120077256107524)

In [277]:
# Linear fit of experimental on simulated log2 ratio for all cross-links.
# Extra bounds that can be appended to the mask (currently disabled):
#   & (dfm['log2ratio_exp'] < 6)  & (dfm['log2ratio_exp'] > -4)
stats.linregress(
    dfm.loc[fil_xl, 'log2ratio_sim'],
    dfm.loc[fil_xl, 'log2ratio_exp'],
)

LinregressResult(slope=-0.8856287384927995, intercept=0.3236324695984335, rvalue=-0.5369910435462883, pvalue=0.0015304186713055177, stderr=0.25401209462421154)

In [222]:
# One delta per uxid group; groups for which get_delta_dist returns None are
# dropped. The unnamed result column (0) becomes 'delta_dist' and the group
# key is renamed to 'uID' so it can be joined with df_xtract.
df_delta_dist = (
    df_dist.groupby('uxid')
    .apply(get_delta_dist)
    .dropna()
    .to_frame()
    .reset_index()
    .rename(columns={'uxid': 'uID', 0: 'delta_dist'})
)


In [223]:
len(df_delta_dist)

57

In [224]:
# Attach the xtract log2 ratio and link type to every delta by identifier.
dfdd = df_delta_dist.merge(
    df_xtract[['uID', 'log2ratio', 'link_type']],
    on=['uID'],
)

In [225]:
# Scatter of log2ratio against delta_dist with a regression line layered on
# top, faceted into one row per link_type.
c = alt.Chart(dfdd).mark_point().encode(
    x=alt.X('delta_dist'),
    y=alt.Y('log2ratio'),
)
c = alt.layer(c, c.transform_regression('delta_dist', 'log2ratio').mark_line())
c.facet(row='link_type').resolve_scale(x='independent', y='independent')

In [226]:
# Linear regression of log2ratio on delta_dist over all rows of dfdd.
# NOTE(review): the chart in the preceding cell is faceted by link_type while
# this fit pools every link_type — confirm that pooling is intended.
stats.linregress(dfdd['delta_dist'], dfdd['log2ratio'])

LinregressResult(slope=-0.07154834514828232, intercept=0.3399420033049684, rvalue=-0.2000186334443865, pvalue=0.2352483948829394, stderr=0.05924190916497382)

In [234]:
df_concat.head(1)

Unnamed: 0,variable,value,link_type,uID,uID_rev,exp
0,MonoHydro_C3_65,0.549918,MonoHydro,sp|P01024|CO3_HUMAN:65,sp|P01024|CO3_HUMAN:65,c3


In [235]:
df_xtract.head(1)

Unnamed: 0,uID,experiment,link_type,ms1_area_sum,ms1_area_sum_ref,log2ratio,log2avg,referenceexperiment,pvalue,FDR
0,sp|P01024|CO3_HUMAN:1036,c3b,monolink,22.996,32.5336,-9.53754,27.7648,c3,1.3e-05,7e-05


In [316]:
# Pair every simulated value (both states) with the xtract MS1 areas. As for
# the ratio comparison, the join runs on uID and on uID_rev (renamed to uID)
# and the two results are combined without duplicates.
dfs = df_concat[['uID', 'value', 'exp']].merge(
    df_xtract[['uID', 'link_type', 'ms1_area_sum', 'ms1_area_sum_ref']],
    on=['uID'],
)
dfs_rev = (
    df_concat[['uID_rev', 'value', 'exp']]
    .rename(columns={'uID_rev': 'uID'})
    .merge(
        df_xtract[['uID', 'link_type', 'ms1_area_sum', 'ms1_area_sum_ref']],
        on=['uID'],
    )
)
dfs = pd.concat([dfs, dfs_rev]).drop_duplicates().reset_index(drop=True)
# Discard rows with value >= 0.99. The negated form is deliberate: rows with
# a missing value are kept, which a plain `< 0.99` test would not do.
keep_rows = ~(dfs['value'] >= 0.99)
dfs = dfs[keep_rows].reset_index(drop=True)

In [317]:
# One sub-frame per state; row labels of dfs are retained.
dfs_c3 = dfs.loc[dfs['exp'].eq('c3')]
dfs_c3b = dfs.loc[dfs['exp'].eq('c3b')]
dfs_c3.head()

Unnamed: 0,uID,value,exp,link_type,ms1_area_sum,ms1_area_sum_ref
0,sp|P01024|CO3_HUMAN:65,0.592877,c3,monolink,30.4247,30.1038
2,sp|P01024|CO3_HUMAN:66,0.503993,c3,monolink,30.3158,29.9674
4,sp|P01024|CO3_HUMAN:155,0.077073,c3,monolink,33.0811,33.6388
6,sp|P01024|CO3_HUMAN:176,0.963919,c3,monolink,23.3706,27.2277
8,sp|P01024|CO3_HUMAN:249,0.310477,c3,monolink,17.3588,17.4437


In [318]:
# Scatter of ms1_area_sum_ref against the simulated value with a regression
# line layered on top, faceted into one row per link_type.
# NOTE(review): dfs holds rows of both states ('c3' and 'c3b'), yet only the
# *_ref area is plotted. Presumably ms1_area_sum_ref belongs to the reference
# experiment listed in df_xtract — confirm that using it for the c3b rows as
# well is intended.
c = alt.Chart(dfs).mark_point().encode(
    x='value',
    y=alt.Y('ms1_area_sum_ref'),
)
c = c + c.transform_regression('value', 'ms1_area_sum_ref').mark_line()
c.facet(
    row='link_type'
).resolve_scale(x='independent', y='independent')


In [319]:
# Linear fit of ms1_area_sum_ref on the simulated value, cross-links of the
# c3 rows only.
# Extra bounds left disabled in the original mask:
#   & (dfm['log2ratio_exp'] < 6)  & (dfm['log2ratio_exp'] > -4)
fil = dfs_c3['link_type'].eq('xlink')
stats.linregress(
    dfs_c3.loc[fil, 'value'],
    dfs_c3.loc[fil, 'ms1_area_sum_ref'],
)

LinregressResult(slope=10.674928290896126, intercept=26.95082561901486, rvalue=0.4981695966092295, pvalue=0.0005790720995377208, stderr=2.8669629241204144)

In [248]:
# Linear fit of ms1_area_sum_ref on the simulated value, cross-links of the
# c3b rows only.
# NOTE(review): this regresses the c3b values on ms1_area_sum_ref, the same
# column used for the c3 rows in the previous cell. If ms1_area_sum_ref is the
# area of the reference experiment, ms1_area_sum may have been intended here —
# confirm before relying on the result.
fil = (dfs_c3b['link_type'] == 'xlink')# & (dfm['log2ratio_exp'] < 6)  & (dfm['log2ratio_exp'] > -4)
stats.linregress(dfs_c3b[fil]['value'], dfs_c3b[fil]['ms1_area_sum_ref'])

LinregressResult(slope=9.577474450839858, intercept=26.97064594798719, rvalue=0.4190477990704991, pvalue=0.004637064200470645, stderr=3.202075043861065)