In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from lmfit import Model
from lmfit.models import LinearModel
from scipy import stats
import sys
sys.path.append(os.path.join(os.getcwd(), "functions"))
import helper_functions as hfn
import plotting_functions as plotfn
import inputfraction_functions as fracfn
from scipy import stats

# define data and save paths

In [2]:
path = os.getcwd()
datapath = os.path.join(path,'data')

rawpath = os.path.join(datapath, 'raw')

cleanpath = os.path.join(datapath, 'clean')

savepath = os.path.join(path, 'input_fraction')
hfn.dir_check(savepath)

savepath_fig, savepath_df, savepath_csv = hfn.create_subfolders(savepath)

# import data

In [3]:
areas, areas_with_pre, df_raw, df_all, df_v1, df_pm = hfn.import_main_dataset(cleanpath)

In [4]:
df_all.head()

Unnamed: 0_level_0,name,pre,input,target area,strain,starter,ratio_target_all,area,keep,PTLp,...,RSPd,RSPv,AM,LD,LP,LGd,ORB,ACA,MOs,CLA
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,BRAC2704_6c,2860,2729,V1,RETRO,2,1.0,PM/V1,1,76,...,100,172,23,82,7,268,59,69,42,2
1,ERAD25_1a,2548,2451,V1,GLT,3,1.0,V1,1,5,...,85,219,11,158,100,168,14,11,15,5
2,ERAD25_1b,9059,8418,PM,GLT,8,1.0,PM,1,256,...,670,1003,177,411,1041,193,312,118,33,45
10,BRAC2704_6d,8465,7486,V1,RETRO,13,1.0,V1,1,289,...,602,901,163,205,61,249,137,377,153,42
9,ERAE26_6e,7223,6860,PM,RBP,16,1.0,V1,1,153,...,552,497,26,197,576,191,246,93,50,32


In [5]:
df_all_fraction = fracfn.make_input_frac(df_all, areas).dropna()
df_v1_fraction = fracfn.make_input_frac(df_v1, areas).dropna()
df_pm_fraction = fracfn.make_input_frac(df_pm, areas).dropna()

In [6]:
df_all_ci = fracfn.make_convergence_index(df_all, areas).dropna()
df_v1_ci = fracfn.make_convergence_index(df_v1, areas).dropna()
df_pm_ci = fracfn.make_convergence_index(df_pm, areas).dropna()

In [7]:
df_v1_fraction.head()

Unnamed: 0_level_0,name,pre,input,target area,strain,starter,ratio_target_all,area,keep,PTLp,...,RSPd,RSPv,AM,LD,LP,LGd,ORB,ACA,MOs,CLA
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,ERAD25_1a,2548,2451,V1,GLT,3,1.0,V1,1,0.00204,...,0.03468,0.089351,0.004488,0.064463,0.0408,0.068543,0.005712,0.004488,0.00612,0.00204
10,BRAC2704_6d,8465,7486,V1,RETRO,13,1.0,V1,1,0.038605,...,0.080417,0.120358,0.021774,0.027384,0.008149,0.033262,0.018301,0.050361,0.020438,0.00561
9,ERAE26_6e,7223,6860,PM,RBP,16,1.0,V1,1,0.022303,...,0.080466,0.072449,0.00379,0.028717,0.083965,0.027843,0.03586,0.013557,0.007289,0.004665
3,ERAD26_3d,8882,8735,PM,GLT,17,1.0,V1,1,0.02324,...,0.038122,0.065484,0.005839,0.044076,0.08609,0.03984,0.023469,0.009616,0.005266,0.002404
4,ERAB23_1a,6119,5801,V1,TLX,18,1.0,V1,1,0.006206,...,0.024479,0.031202,0.000345,0.042234,0.068781,0.113946,0.017583,0.005861,0.001551,0.002758


In [8]:
def p_adjust_bh(p):
    """Benjamini-Hochberg p-value correction for multiple hypothesis testing."""
    p = np.asfarray(p)
    by_descend = p.argsort()[::-1]
    by_orig = by_descend.argsort()
    steps = float(len(p)) / np.arange(float(len(p)), 0, -1)
    q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend]))
    return q[by_orig]

In [9]:
df_lowst_frac = df_all_fraction[df_all_fraction.starter<200]
df_highst_frac = df_all_fraction[df_all_fraction.starter>200]

In [10]:
df_lowst_frac_ci = df_all_ci[df_all_ci.starter<200]
df_highst_frac_ci = df_all_ci[df_all_ci.starter>200]

In [11]:
df_v1_frac = df_all_fraction[df_all_fraction.area =='V1']
df_pm_frac = df_all_fraction[df_all_fraction.area =='PM']

In [12]:
df_v1_frac_high = df_highst_frac[df_highst_frac.area =='V1' ]
df_pm_frac_high = df_highst_frac[df_highst_frac.area =='PM']

In [13]:
summary_df_mean = pd.concat([pd.DataFrame(df_lowst_frac.mean(axis = 0), columns = ['low_frac']),
           pd.DataFrame(df_highst_frac.mean(axis = 0), columns = ['high_frac'])], axis = 1)

  summary_df_mean = pd.concat([pd.DataFrame(df_lowst_frac.mean(axis = 0), columns = ['low_frac']),
  pd.DataFrame(df_highst_frac.mean(axis = 0), columns = ['high_frac'])], axis = 1)


In [14]:
summary_df_mean

Unnamed: 0,low_frac,high_frac
pre,13859.133333,59946.076923
input,12986.266667,54495.230769
starter,69.133333,813.615385
ratio_target_all,0.997291,0.985401
keep,1.0,1.0
PTLp,0.026154,0.033636
TEa,0.015169,0.026918
VIS,0.337719,0.302476
AUD,0.032768,0.053387
RSP,0.183545,0.167364


In [15]:
summary_df_sd = pd.concat([pd.DataFrame(df_lowst_frac.std(axis = 0), columns = ['low_frac']),
           pd.DataFrame(df_highst_frac.mean(axis = 0), columns = ['high_frac'])], axis = 1)

  summary_df_sd = pd.concat([pd.DataFrame(df_lowst_frac.std(axis = 0), columns = ['low_frac']),
  pd.DataFrame(df_highst_frac.mean(axis = 0), columns = ['high_frac'])], axis = 1)


In [16]:
summary_targ_mean = pd.concat([pd.DataFrame(df_v1_frac.mean(axis = 0), columns = ['V1']),
           pd.DataFrame(df_pm_frac.mean(axis = 0), columns = ['PM'])], axis = 1)
summary_targ_sd = pd.concat([pd.DataFrame(df_v1_frac.std(axis = 0), columns = ['V1']),
           pd.DataFrame(df_pm_frac.std(axis = 0), columns = ['PM'])], axis = 1)

  summary_targ_mean = pd.concat([pd.DataFrame(df_v1_frac.mean(axis = 0), columns = ['V1']),
  pd.DataFrame(df_pm_frac.mean(axis = 0), columns = ['PM'])], axis = 1)
  summary_targ_sd = pd.concat([pd.DataFrame(df_v1_frac.std(axis = 0), columns = ['V1']),
  pd.DataFrame(df_pm_frac.std(axis = 0), columns = ['PM'])], axis = 1)


In [17]:
summary_targ_high_mean = pd.concat([pd.DataFrame(df_v1_frac_high.mean(axis = 0), columns = ['V1']),
           pd.DataFrame(df_pm_frac_high.mean(axis = 0), columns = ['PM'])], axis = 1)
summary_targ_high_sd = pd.concat([pd.DataFrame(df_v1_frac_high.std(axis = 0), columns = ['V1']),
           pd.DataFrame(df_pm_frac_high.std(axis = 0), columns = ['PM'])], axis = 1)

  summary_targ_high_mean = pd.concat([pd.DataFrame(df_v1_frac_high.mean(axis = 0), columns = ['V1']),
  pd.DataFrame(df_pm_frac_high.mean(axis = 0), columns = ['PM'])], axis = 1)
  summary_targ_high_sd = pd.concat([pd.DataFrame(df_v1_frac_high.std(axis = 0), columns = ['V1']),
  pd.DataFrame(df_pm_frac_high.std(axis = 0), columns = ['PM'])], axis = 1)


In [18]:
summary_targ_high_mean

Unnamed: 0,V1,PM
pre,64317.25,57474.285714
input,58923.75,51989.428571
starter,1355.5,589.571429
ratio_target_all,0.999216,0.974954
keep,1.0,1.0
PTLp,0.03598,0.035
TEa,0.030513,0.025569
VIS,0.332326,0.279095
AUD,0.063271,0.050547
RSP,0.125519,0.174985


In [19]:
summary_df_sd

Unnamed: 0,low_frac,high_frac
pre,10265.983266,59946.076923
input,9543.58943,54495.230769
starter,69.518617,813.615385
ratio_target_all,0.007329,0.985401
keep,0.0,1.0
PTLp,0.014283,0.033636
TEa,0.010818,0.026918
VIS,0.086429,0.302476
AUD,0.0164,0.053387
RSP,0.055593,0.167364


In [20]:
summary_df_mean_ci = pd.concat([pd.DataFrame(df_lowst_frac_ci.mean(axis = 0), columns = ['low_frac']),
           pd.DataFrame(df_highst_frac_ci.mean(axis = 0), columns = ['high_frac'])], axis = 1)
summary_df_sd_ci = pd.concat([pd.DataFrame(df_lowst_frac_ci.std(axis = 0), columns = ['low_frac']),
           pd.DataFrame(df_highst_frac_ci.std(axis = 0), columns = ['high_frac'])], axis = 1)

  summary_df_mean_ci = pd.concat([pd.DataFrame(df_lowst_frac_ci.mean(axis = 0), columns = ['low_frac']),
  pd.DataFrame(df_highst_frac_ci.mean(axis = 0), columns = ['high_frac'])], axis = 1)
  summary_df_sd_ci = pd.concat([pd.DataFrame(df_lowst_frac_ci.std(axis = 0), columns = ['low_frac']),
  pd.DataFrame(df_highst_frac_ci.std(axis = 0), columns = ['high_frac'])], axis = 1)


In [21]:
summary_df_mean_ci 

Unnamed: 0,low_frac,high_frac
pre,13859.133333,59946.076923
input,12986.266667,54495.230769
starter,69.133333,813.615385
ratio_target_all,0.997291,0.985401
keep,1.0,1.0
PTLp,10.869184,2.758009
TEa,5.081349,2.343353
VIS,163.626987,25.588575
AUD,13.501844,4.371175
RSP,82.207382,15.707534


In [22]:
summary_df_sd_ci

Unnamed: 0,low_frac,high_frac
pre,10265.983266,25396.496128
input,9543.58943,22143.229274
starter,69.518617,534.222416
ratio_target_all,0.007329,0.025759
keep,0.0,0.0
PTLp,11.531166,1.971763
TEa,4.54888,1.655258
VIS,161.716707,16.047523
AUD,14.613373,2.778197
RSP,74.563152,12.101406


In [23]:
df_all_fraction.to_csv(os.path.join(savepath_csv, 'df_all_fraction.csv'))
df_v1_fraction.to_csv(os.path.join(savepath_csv, 'df_v1_fraction.csv'))
df_pm_fraction.to_csv(os.path.join(savepath_csv, 'df_pm_fraction.csv'))

In [24]:
df_all_ci.to_csv(os.path.join(savepath_csv, 'df_all_ci.csv'))
df_v1_ci.to_csv(os.path.join(savepath_csv, 'df_v1_ci.csv'))
df_pm_ci.to_csv(os.path.join(savepath_csv, 'df_pm_ci.csv'))