# Create a table of the number of seed genes and genomic loci per system

In [1]:
# import matplotlib

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import sys
import os

# use the Arial sans-serif font for text in graphs
import matplotlib as mpl
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
import community
import ndex2
from tqdm import tqdm

from scipy.stats import mannwhitneyu

import mygene
# MyGene.info client object (not referenced again in the cells shown in this notebook)
mg = mygene.MyGeneInfo()

# seaborn theme with fonts scaled by 1.5
sns.set(font_scale=1.5)

# switch the seaborn style to 'white' after the theme has been set
sns.set_style('white')


In [3]:
# Despite its name, `cwd` is the directory two levels above the notebook's
# working directory; the rest of the notebook uses it as the project root.
notebook_dir = os.getcwd()
cwd = os.path.split(os.path.split(notebook_dir)[0])[0]

# DATADIR keeps its trailing slash because later cells build paths by string
# concatenation; FIGDIR is a file-name prefix rather than a directory.
DATADIR, FIGDIR = (os.path.join(cwd, sub) for sub in ("Data/", "Figures/rerun_"))

In [4]:
# Make the project root importable so the shared helper module can be found.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if cwd not in sys.path:
    sys.path.append(cwd)

# Import the helper used below by name instead of `import *`, so it is clear
# where it comes from and nothing else silently shadows notebook variables.
from analysis_functions import load_pcnet

# load PCNet

In [18]:
# load_pcnet is provided by analysis_functions (imported above). The output
# recorded under this cell shows the node and edge counts of the network.
# pc_nodes is used below to restrict the human gene scores to network genes.
# NOTE(review): Gint is presumably the network graph object — confirm in analysis_functions.
pc_nodes, Gint = load_pcnet()

number of nodes:
18820

number of edges:
2693109


# Parse mapped genes from human and rat

In [19]:
# Gene-level PASCAL scores for human BMI (GIANT), keyed by gene symbol
# (the gene_symbol column is kept as a regular column as well).
pascal_path = DATADIR + 'inputs/GIANT_BMI_pascal.sum.genescores.txt'
BMI_GIANT_pascal = pd.read_csv(pascal_path, sep='\t').set_index('gene_symbol', drop=False)

# Keep only the genes that are also nodes of PCNet.
genes_in_pcnet = np.intersect1d(BMI_GIANT_pascal.index.tolist(), pc_nodes).tolist()
BMI_GIANT_pascal = BMI_GIANT_pascal.loc[genes_in_pcnet]

# Bonferroni threshold over the genes retained above.
bonf_p = .05/len(BMI_GIANT_pascal)

# Human BMI seed genes: gene p-value below the Bonferroni threshold.
is_significant = BMI_GIANT_pascal['pvalue'] < bonf_p
h_bmi_genes = BMI_GIANT_pascal.loc[is_significant].index.tolist()
print(len(h_bmi_genes))


1958


In [20]:
# Rows of the PASCAL gene-score table for the Bonferroni-significant human BMI genes
human_bmi_seed_df = BMI_GIANT_pascal.loc[h_bmi_genes]
human_bmi_seed_df.head()

Unnamed: 0_level_0,chromosome,start,end,strand,gene_id,gene_symbol,numSnps,pvalue,Status
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A3GALT2,chr1,33772366,33786699,-,127550,A3GALT2,171,1.284589e-07,DAVIES_SUCCESS
AADAT,chr4,170981372,171011372,-,51166,AADAT,269,7.579147e-07,DAVIES_SUCCESS
AARS,chr16,70286296,70323412,-,16,AARS,31,1.125766e-13,DAVIES_LOWPRECISION_FAREBROTHER_SUCCESS
AATK,chr17,79091095,79139872,-,9625,AATK,386,1e-12,DAVIES_LOWPRECISION_FAREBROTHER_FAIL
ABCB6,chr2,220074487,220085174,-,10058,ABCB6,112,4.647868e-07,DAVIES_SUCCESS


In [6]:
# --- analysis parameters ---
ratThresh = 'relaxed'       # rat seed-gene threshold: 'relaxed' or 'stringent'
num_reps = 1000
mapper = 'PASCAL'           # type of gene mapper: 'PASCAL' or 'PREDIXCAN'
adj_type = 'bonf'           # multiple-testing adjustment: 'bonf' or 'BH'
excl_rat_seeds = False      # if True, exclude rat seed genes from relevant phenotype

# --- rat BMI seed genes matching the threshold chosen above ---
# The seed file is tab-separated; the genes are read from its column labelled '0'.
rat_seed_file = DATADIR + 'inputs/ratBMI_seed_' + ratThresh + '.txt'
rat_seed_table = pd.read_csv(rat_seed_file, sep='\t')
rat_bmi_genes = rat_seed_table['0'].tolist()
print(len(rat_bmi_genes))

295


In [41]:
# Rat BMI gene-level summary statistics, re-keyed by the HumanGene column
# (HumanGene is kept as a regular column as well).
rat_stats_path = DATADIR + "inputs/rat_BMI_summary_stats_wo_tail.tsv"
rat_bmi_df = (
    pd.read_csv(rat_stats_path, index_col=0, sep="\t")
    .set_index('HumanGene', drop=False)
)
rat_bmi_df.head()

Unnamed: 0_level_0,Gene,Chrom,Start-End,nSNPS,TopSNP P-Value,TopSNP Position,Binarized Heat,Negative Log,Size Adjusted Heat,HumanGene,Unnamed: 11
HumanGene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PRLHR,Prlhr,chr1,281754471-281756159,23.0,7.31e-12,281756885.0,1.0,11.136172,4.861444,PRLHR,
FAM204A,Fam204a,chr1,281343691-281390632\t281343691-281403699\t2813...,141.0,3.47e-11,281411373.0,1.0,10.459144,4.861444,FAM204A,
,LOC108349713,chr1,281397475-281406785,62.0,3.47e-11,281411373.0,1.0,10.459144,4.861444,,
,LOC102556108,chr1,281289719-281301391\t281299202-281301391,33.0,3.7e-11,281303363.0,1.0,10.431224,4.861444,,
,LOC102556164,chr1,281227922-281318201\t281227922-281318201\t2812...,186.0,3.7e-11,281303363.0,1.0,10.431224,4.861444,,


In [8]:
# Rows of the rat summary table for the rat seed genes, looked up through the HumanGene index
rat_bmi_seed_df = rat_bmi_df.loc[rat_bmi_genes]
print(len(rat_bmi_seed_df))
rat_bmi_seed_df

295


Unnamed: 0_level_0,Gene,Chrom,Start-End,nSNPS,TopSNP P-Value,TopSNP Position,Binarized Heat,Negative Log,Size Adjusted Heat,HumanGene,Unnamed: 11
HumanGene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PRLHR,Prlhr,chr1,281754471-281756159,23.0,7.310000e-12,281756885.0,1.0,11.136172,4.861444,PRLHR,
FAM204A,Fam204a,chr1,"281343691-281390632,281343691-281403699,281347...",141.0,3.470000e-11,281411373.0,1.0,10.459144,4.861444,FAM204A,
RAB11FIP2,Rab11fip2,chr1,281065345-281101163,141.0,1.700000e-10,281109613.0,1.0,9.770121,4.861444,RAB11FIP2,
CACUL1,Cacul1,chr1,"281814225-281874675,281816405-281874511",84.0,2.250000e-10,281811420.0,1.0,9.647775,4.861444,CACUL1,
ARSG,Arsg,chr10,"97722549-97859975,97733458-97859975,97733459-9...",355.0,7.620000e-10,97797154.0,1.0,9.117885,4.861444,ARSG,
...,...,...,...,...,...,...,...,...,...,...,...
PKIA,Pkia,chr2,"96593184-96608706,96593184-96608708,96593184-9...",283.0,9.330000e-05,96656161.0,0.0,4.029900,3.919947,PKIA,
GK5,Gk5,chr8,103929092-104001916,59.0,9.460000e-05,103969544.0,0.0,4.024207,3.916477,GK5,
MRC2,Mrc2,chr10,"93520271-93581599,93520435-93580474",259.0,9.680000e-05,93546093.0,0.0,4.013937,3.917748,MRC2,
ABR,Abr,chr10,64565120-64657079,48.0,9.830000e-05,64611901.0,0.0,4.007652,3.880712,ABR,


# Load the clustering results

In [31]:
# Read the hierarchy (one row per cluster, indexed by cluster id)
hierarchy_path = DATADIR + "outputs/BMI_hierarchy_data.tsv"
clust_df = pd.read_csv(hierarchy_path, sep="\t", index_col=0)

# CD_MemberList is a space-delimited string of gene names; its size is the token count
clust_df["CD_MemberList_Size"] = [len(members.split(" ")) for members in clust_df["CD_MemberList"]]
print(len(clust_df))
clust_df.head()

61


Unnamed: 0,CD_MemberList,frac_d1_seeds,frac_d2_seeds,CD_MemberList_Size
C877,HELZ EMX2 HIST1H3A RAD51 SYT12 GYPC KDM3B ERC1...,0.353583,0.17757,642
C898,STX4 LSAMP SPOCK1 SGCB STXBP5 TRP-AGG2-6 GPRC5...,0.363636,0.181818,11
C904,BCL11B PKIB PKIA LMO1 NR2F1 LDB2 ZFPM2 GATA3 A...,0.3,0.2,10
C906,PROM2 SPINT1 TNIK PKP3 SSH3 FEZ2 CORIN WWC1 CAPN1,0.333333,0.111111,9
C911,ZNF629 ZKSCAN5 LINGO1 ZNF202 ZKSCAN2 ZNF263 ZN...,0.571429,0.142857,7


# Identify which gene pairs lie in the same genomic locus (within ~250 kb of each other)

In [10]:
# Inspect the rat seed-gene table used by the locus computation below;
# the Start-End column can hold several start-end intervals for one gene.
rat_bmi_seed_df

Unnamed: 0_level_0,Gene,Chrom,Start-End,nSNPS,TopSNP P-Value,TopSNP Position,Binarized Heat,Negative Log,Size Adjusted Heat,HumanGene,Unnamed: 11
HumanGene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
PRLHR,Prlhr,chr1,281754471-281756159,23.0,7.310000e-12,281756885.0,1.0,11.136172,4.861444,PRLHR,
FAM204A,Fam204a,chr1,"281343691-281390632,281343691-281403699,281347...",141.0,3.470000e-11,281411373.0,1.0,10.459144,4.861444,FAM204A,
RAB11FIP2,Rab11fip2,chr1,281065345-281101163,141.0,1.700000e-10,281109613.0,1.0,9.770121,4.861444,RAB11FIP2,
CACUL1,Cacul1,chr1,"281814225-281874675,281816405-281874511",84.0,2.250000e-10,281811420.0,1.0,9.647775,4.861444,CACUL1,
ARSG,Arsg,chr10,"97722549-97859975,97733458-97859975,97733459-9...",355.0,7.620000e-10,97797154.0,1.0,9.117885,4.861444,ARSG,
...,...,...,...,...,...,...,...,...,...,...,...
PKIA,Pkia,chr2,"96593184-96608706,96593184-96608708,96593184-9...",283.0,9.330000e-05,96656161.0,0.0,4.029900,3.919947,PKIA,
GK5,Gk5,chr8,103929092-104001916,59.0,9.460000e-05,103969544.0,0.0,4.024207,3.916477,GK5,
MRC2,Mrc2,chr10,"93520271-93581599,93520435-93580474",259.0,9.680000e-05,93546093.0,0.0,4.013937,3.917748,MRC2,
ABR,Abr,chr10,64565120-64657079,48.0,9.830000e-05,64611901.0,0.0,4.007652,3.880712,ABR,


In [15]:
# Per-cluster counts of rat seed genes and of the genomic loci they occupy.
#
# For every cluster in clust_df:
#   * num_rat_seeds receives the number of rat seed genes among the cluster members
#   * num_rat_loci receives the number of loci: seed genes on the same chromosome whose
#     genomic spans are less than LOCUS_MAX_DIST bp apart are linked, linked genes are
#     grouped with Louvain community detection, and each unlinked gene is its own locus.
# Clusters without any rat seed gene get 0 for both. Both lists follow clust_df.index order.
LOCUS_MAX_DIST = 250000  # bp; gene pairs closer than this are treated as sharing a locus
LOUVAIN_SEED = 0         # fixed seed: Louvain is stochastic, so unseeded locus counts can vary between runs

num_rat_loci=[]
num_rat_seeds=[]
for focal_clust in clust_df.index.tolist():
    print(focal_clust)

    focal_genes = clust_df['CD_MemberList'].loc[focal_clust].split(' ')

    if len(np.intersect1d(focal_genes,rat_bmi_genes))>0: # only compute loci if there are any seed genes in the cluster
        rat_bmi_seed_focal=rat_bmi_seed_df[rat_bmi_seed_df.index.isin(focal_genes)].dropna(subset=['HumanGene'])
        num_rat_seeds.append(len(rat_bmi_seed_focal))

        # Collapse each gene's (possibly multiple) intervals to one span: min start, max end.
        # The columns are iterated directly instead of using .loc[g] per gene, which is slow
        # and returns a frame rather than a row when a gene symbol occurs more than once.
        start_list = []
        end_list = []
        for start_end in rat_bmi_seed_focal['Start-End']:
            # The intervals are comma-separated in one rendering of this table and
            # tab-separated ('\t') in another, so normalise both to commas.
            # NOTE(review): the display cannot distinguish a real tab from a literal
            # backslash-t, hence both are handled — confirm against the input file.
            normalized = str(start_end).replace('\\t', ',').replace('\t', ',')
            bounds = [interval.split('-') for interval in normalized.split(',') if interval.strip()]
            start_list.append(min(int(b[0]) for b in bounds))
            end_list.append(max(int(b[1]) for b in bounds))
        chr_list = rat_bmi_seed_focal['Chrom'].tolist()
        genes_temp = rat_bmi_seed_focal.index.tolist()

        df_temp = pd.DataFrame({'start':start_list,'end':end_list,'chr':chr_list},index=genes_temp)

        # loop over all gene pairs, compute the gap between their spans when on the same chromosome
        g1list = []
        g2list = []
        dist_list = []
        chrom1list = []
        chrom2list = []
        for i in range(len(genes_temp)-1):
            for j in range(i+1,len(genes_temp)):
                if chr_list[i]==chr_list[j]:
                    # gap between the two spans; overlapping or touching spans give a
                    # non-positive value, which is clamped to a distance of 0
                    gap = max(start_list[i],start_list[j]) - min(end_list[i],end_list[j])
                    dist_list.append(max(gap,0))
                else:
                    dist_list.append(np.nan) # nans if they are diff chrom

                g1list.append(genes_temp[i])
                g2list.append(genes_temp[j])
                chrom1list.append(chr_list[i])
                chrom2list.append(chr_list[j])

        # pairwise table, kept for inspection after the loop
        dist_df_temp = pd.DataFrame({'g1':g1list,'g2':g2list,
                                     'chr1':chrom1list,'chr2':chrom2list,
                                     'dist':dist_list})
        dist_df_temp['within_250KB']=dist_df_temp['dist']<LOCUS_MAX_DIST

        # edges between genes that share a locus (a NaN distance never passes the comparison)
        elist_temp = [(g1,g2) for g1,g2,dist in zip(g1list,g2list,dist_list) if dist<LOCUS_MAX_DIST]

        # make a network from this
        G_dist_temp = nx.from_edgelist(elist_temp)

        # NOTE(review): Louvain may split a long chain of linked genes into several
        # communities; use connected components instead if a locus should be a maximal linked group.
        if G_dist_temp.number_of_edges()>0:
            partition_temp = community.best_partition(G_dist_temp, random_state=LOUVAIN_SEED)
        else:
            partition_temp = {} # no linked genes, nothing to partition

        # number communities + number genes not in a shared locus ==> total number of loci
        num_shared_loci = len(set(partition_temp.values()))
        num_single_gene_loci = len(set(genes_temp) - set(G_dist_temp.nodes()))
        num_rat_loci.append(num_shared_loci + num_single_gene_loci)
    else:
        num_rat_loci.append(0)
        num_rat_seeds.append(0)



C877
C898
C904
C906
C911
C913
C914
C915
C924
C925
C926
C930
C931
C933
C935
C937
C878
C880
C882
C885
C887
C888
C889
C890
C892
C895
C883
C879
C932
C901
C902
C907
C909
C884
C893
C894
C896
C929
C899
C934
C903
C881
C916
C919
C921
C920
C905
C922
C910
C923
C886
C912
C897
C891
C908
C900
C928
C917
C918
C927
C936


In [23]:
# Preview the human seed-gene table (chromosome, start, end) used for the human locus computation
human_bmi_seed_df.head()

Unnamed: 0_level_0,chromosome,start,end,strand,gene_id,gene_symbol,numSnps,pvalue,Status
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A3GALT2,chr1,33772366,33786699,-,127550,A3GALT2,171,1.284589e-07,DAVIES_SUCCESS
AADAT,chr4,170981372,171011372,-,51166,AADAT,269,7.579147e-07,DAVIES_SUCCESS
AARS,chr16,70286296,70323412,-,16,AARS,31,1.125766e-13,DAVIES_LOWPRECISION_FAREBROTHER_SUCCESS
AATK,chr17,79091095,79139872,-,9625,AATK,386,1e-12,DAVIES_LOWPRECISION_FAREBROTHER_FAIL
ABCB6,chr2,220074487,220085174,-,10058,ABCB6,112,4.647868e-07,DAVIES_SUCCESS


In [24]:
# NOTE(review): within this notebook human_bmi_seed_focal is first assigned inside the
# per-cluster loop of the following cell, so on a fresh kernel this cell is expected to
# raise NameError unless analysis_functions happens to export that name. The output
# recorded below appears to come from an earlier, out-of-order run — confirm and
# move or remove this cell.
human_bmi_seed_focal

Unnamed: 0_level_0,chromosome,start,end,strand,gene_id,gene_symbol,numSnps,pvalue,Status
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AATK,,,,,,AATK,,,
ABCB8,,,,,,ABCB8,,,
ABL1,,,,,,ABL1,,,
ADORA2B,,,,,,ADORA2B,,,
AGAP1,,,,,,AGAP1,,,
...,...,...,...,...,...,...,...,...,...
ZKSCAN5,,,,,,ZKSCAN5,,,
ZNF142,,,,,,ZNF142,,,
ZNF629,,,,,,ZNF629,,,
ZNF768,,,,,,ZNF768,,,


In [26]:
# compute same metrics for human loci
#
# For every cluster in clust_df:
#   * num_human_seeds receives the number of human seed genes among the cluster members
#   * num_human_loci receives the number of loci: seed genes on the same chromosome whose
#     start-end spans are less than LOCUS_MAX_DIST bp apart are linked, linked genes are
#     grouped with Louvain community detection, and each unlinked gene is its own locus.
# Clusters without any human seed gene get 0 for both. Both lists follow clust_df.index order.
LOCUS_MAX_DIST = 250000  # bp; gene pairs closer than this are treated as sharing a locus
LOUVAIN_SEED = 0         # fixed seed: Louvain is stochastic, so unseeded locus counts can vary between runs

num_human_loci=[]
num_human_seeds = []
for focal_clust in clust_df.index.tolist():
    print(focal_clust)

    focal_genes = clust_df['CD_MemberList'].loc[focal_clust].split(' ')

    if len(np.intersect1d(focal_genes,h_bmi_genes))>0: # only compute loci if there are any seed genes in the cluster
        human_bmi_seed_focal=human_bmi_seed_df[human_bmi_seed_df.index.isin(focal_genes)].dropna(subset=['gene_symbol'])
        num_human_seeds.append(len(human_bmi_seed_focal))

        # Gene coordinates, read column-wise instead of via .loc[g] per gene, which is slow
        # and returns a frame rather than a row when a gene symbol occurs more than once.
        start_list = [int(start) for start in human_bmi_seed_focal['start']]
        end_list = [int(end) for end in human_bmi_seed_focal['end']]
        chr_list = human_bmi_seed_focal['chromosome'].tolist()
        genes_temp = human_bmi_seed_focal.index.tolist()

        df_temp = pd.DataFrame({'start':start_list,'end':end_list,'chr':chr_list},index=genes_temp)

        # loop over all gene pairs, compute the gap between their spans when on the same chromosome
        g1list = []
        g2list = []
        dist_list = []
        chrom1list = []
        chrom2list = []
        for i in range(len(genes_temp)-1):
            for j in range(i+1,len(genes_temp)):
                if chr_list[i]==chr_list[j]:
                    # gap between the two spans; overlapping or touching spans give a
                    # non-positive value, which is clamped to a distance of 0
                    gap = max(start_list[i],start_list[j]) - min(end_list[i],end_list[j])
                    dist_list.append(max(gap,0))
                else:
                    dist_list.append(np.nan) # nans if they are diff chrom

                g1list.append(genes_temp[i])
                g2list.append(genes_temp[j])
                chrom1list.append(chr_list[i])
                chrom2list.append(chr_list[j])

        # pairwise table, kept for inspection after the loop
        dist_df_temp = pd.DataFrame({'g1':g1list,'g2':g2list,
                                     'chr1':chrom1list,'chr2':chrom2list,
                                     'dist':dist_list})
        dist_df_temp['within_250KB']=dist_df_temp['dist']<LOCUS_MAX_DIST

        # edges between genes that share a locus (a NaN distance never passes the comparison)
        elist_temp = [(g1,g2) for g1,g2,dist in zip(g1list,g2list,dist_list) if dist<LOCUS_MAX_DIST]

        # make a network from this
        G_dist_temp = nx.from_edgelist(elist_temp)

        # NOTE(review): Louvain may split a long chain of linked genes into several
        # communities; use connected components instead if a locus should be a maximal linked group.
        if G_dist_temp.number_of_edges()>0:
            partition_temp = community.best_partition(G_dist_temp, random_state=LOUVAIN_SEED)
        else:
            partition_temp = {} # no linked genes, nothing to partition

        # number communities + number genes not in a shared locus ==> total number of loci
        num_shared_loci = len(set(partition_temp.values()))
        num_single_gene_loci = len(set(genes_temp) - set(G_dist_temp.nodes()))
        num_human_loci.append(num_shared_loci + num_single_gene_loci)
    else:
        num_human_loci.append(0)
        num_human_seeds.append(0)



C877
C898
C904
C906
C911
C913
C914
C915
C924
C925
C926
C930
C931
C933
C935
C937
C878
C880
C882
C885
C887
C888
C889
C890
C892
C895
C883
C879
C932
C901
C902
C907
C909
C884
C893
C894
C896
C929
C899
C934
C903
C881
C916
C919
C921
C920
C905
C922
C910
C923
C886
C912
C897
C891
C908
C900
C928
C917
C918
C927
C936


# Compile everything into a table

In [27]:
# One row per cluster; the four lists were filled in clust_df.index order by the loops above
per_cluster_counts = {
    'num_human_seeds': num_human_seeds,
    'num_human_loci': num_human_loci,
    'num_rat_seeds': num_rat_seeds,
    'num_rat_loci': num_rat_loci,
}
clust_loci_df = pd.DataFrame(per_cluster_counts, index=list(clust_df.index))
clust_loci_df.head()

Unnamed: 0,num_human_seeds,num_human_loci,num_rat_seeds,num_rat_loci
C877,227,162,114,55
C898,4,4,2,2
C904,3,3,2,2
C906,3,3,1,1
C911,4,3,1,1


In [32]:
# Add the cluster size; the assignment aligns on the cluster-id index that
# clust_loci_df took from clust_df when it was built.
clust_loci_df['num_genes_in_cluster']=clust_df['CD_MemberList_Size']
clust_loci_df.head()

Unnamed: 0,num_human_seeds,num_human_loci,num_rat_seeds,num_rat_loci,num_genes_in_cluster
C877,227,162,114,55,642
C898,4,4,2,2,11
C904,3,3,2,2,10
C906,3,3,1,1,9
C911,4,3,1,1,7


In [33]:
# Ratio of loci to seed genes per cluster, for each species
clust_loci_df['human_loci_per_seed'] = clust_loci_df['num_human_loci'].div(clust_loci_df['num_human_seeds'])
clust_loci_df['rat_loci_per_seed'] = clust_loci_df['num_rat_loci'].div(clust_loci_df['num_rat_seeds'])

# Show the 25 largest clusters first
clust_loci_df.sort_values(by='num_genes_in_cluster', ascending=False).head(25)

Unnamed: 0,num_human_seeds,num_human_loci,num_rat_seeds,num_rat_loci,num_genes_in_cluster,human_loci_per_seed,rat_loci_per_seed
C877,227,162,114,55,642,0.713656,0.482456
C878,71,67,36,31,214,0.943662,0.861111
C879,57,55,31,27,169,0.964912,0.870968
C880,60,49,27,14,157,0.816667,0.518519
C881,31,27,12,7,68,0.870968,0.583333
C882,16,15,11,10,53,0.9375,0.909091
C883,9,8,3,3,34,0.888889,1.0
C884,10,10,5,5,32,1.0,1.0
C885,11,11,6,6,31,1.0,1.0
C886,8,8,6,5,27,1.0,0.833333


In [35]:
# annotate with the community names
# index_col=1 makes the file's second column the index (shown as `community` in the
# output below), so the annotations can be joined onto the per-cluster table by cluster id.
annotations = pd.read_csv(DATADIR + "outputs/BMI_hierarchy_curated_annotations.tsv", sep="\t", index_col=1)
annotations.head()

Unnamed: 0_level_0,annotation
community,Unnamed: 1_level_1
C877,Conserved BMI Network (root)
C878,Nervous System Development
C879,Synaptic Signaling
C880,Chromosome Organization
C881,mRNA Metabolic Process


In [40]:
# Attach the annotations, order the rows by cluster id and write the table as TSV
loci_table = clust_loci_df.join(annotations)
loci_table = loci_table.sort_index()
loci_table.to_csv(DATADIR + 'outputs/loci_per_community.tsv', sep="\t")