In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import time

In [2]:
# Per-gene score table: the first column is named 'Gene' and used as the index,
# the remaining two columns are named 'z' and 'pLI'.
zs = pd.read_table(
    'cnv_constraint_zscores_20190430.tsv',
    index_col='Gene',
    names=['Gene', 'z', 'pLI'],
)
zs.head()

Unnamed: 0_level_0,z,pLI
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1
BRCA2,3.402038,0.991096
BRCA1,2.570469,0.984025
APC,2.085995,0.945645
ATM,1.242243,0.989235
MSH2,1.240663,0.988295


In [3]:
# HPO phenotype-to-gene annotations: one row per (HPO term, gene) pair.
sets = pd.read_table(
    '/oak/stanford/groups/jpriest/cnv_ukb/resources/HPO_pheno_to_gene.txt',
    names=['hpoID', 'hpolabel', 'geneID', 'Gene'],
    skiprows=0,  # nothing is skipped; column names come from `names`
)
sets.head()

Unnamed: 0,hpoID,hpolabel,geneID,Gene
0,HP:0001459,1-3 toe syndactyly,2737,GLI3
1,HP:0006088,1-5 finger complete cutaneous syndactyly,64327,LMBR1
2,HP:0010708,1-5 finger syndactyly,6469,SHH
3,HP:0010708,1-5 finger syndactyly,64327,LMBR1
4,HP:0010713,1-5 toe syndactyly,2737,GLI3


In [4]:
# Shape of the per-term annotation counts, i.e. how many distinct HPO IDs exist.
(
    sets['hpoID']
    .value_counts()
    .shape
)

(8524,)

In [5]:
load = True  # True: read the cached enrichment table; False: recompute it and rewrite the cache
HPO_ENRICHMENT_TSV = "cnv_constraint_hpo_enrichment_20190430.tsv"


def hpo_enrichment(sets, zs, report_every=200):
    """Test, for every HPO term, whether its genes' z-scores are shifted.

    Parameters
    ----------
    sets : pandas.DataFrame
        Annotation table with columns 'hpoID', 'hpolabel' and 'Gene', one row
        per (term, gene) pair.
    zs : pandas.DataFrame
        Score table indexed by gene with a 'z' column.
    report_every : int
        Print the number of finished terms and the elapsed seconds each time
        this many result rows have been collected.

    Returns
    -------
    pandas.DataFrame
        Columns 'hpoID', 'hpoLabel', 'deltaZ', 'pValue', 'nGenes'; one row per
        term that has at least one annotation whose gene is in ``zs`` and a
        non-empty comparison group. Terms appear in ``value_counts`` order of
        'hpoID'. An empty frame (with these columns) is returned when no term
        qualifies.
    """
    columns = ['hpoID', 'hpoLabel', 'deltaZ', 'pValue', 'nGenes']

    # Loop-invariant: only annotations whose gene has a z-score are ever used.
    annotated = sets[sets['Gene'].isin(zs.index)]

    # Collect plain rows and build the frame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas >= 2.0.
    records = []
    t = time.time()
    for hpo in sets['hpoID'].value_counts().index:
        # select rows
        is_term = annotated['hpoID'] == hpo
        in_set = annotated[is_term]
        # NOTE(review): the comparison group is every annotation row of the
        # other terms, so a gene annotated to several terms is counted once per
        # term, and genes of the current term re-enter through their other
        # annotations. Kept as in the original analysis so the cached table
        # stays comparable; confirm whether a true "genes not in this term"
        # background was intended.
        out_of_set = annotated[~is_term]
        if len(in_set) == 0 or len(out_of_set) == 0:
            continue

        # select genes
        in_set_z = zs.loc[in_set['Gene'], 'z']
        out_of_set_z = zs.loc[out_of_set['Gene'], 'z']

        # compute mean shift, Welch t-test p-value (NaN z-scores are omitted)
        deltaZ = in_set_z.mean() - out_of_set_z.mean()
        _, p = ttest_ind(in_set_z.values, out_of_set_z.values,
                         equal_var=False, nan_policy='omit')

        # float(p) keeps the column numeric even if scipy hands back a
        # non-plain-float scalar.
        records.append((hpo, in_set['hpolabel'].iloc[0], deltaZ, float(p), len(in_set_z)))

        if len(records) % report_every == 0:
            print(len(records), time.time() - t)
            t = time.time()

    # Rows get a fresh 0..n-1 index instead of the repeated label 0 that
    # per-row appending produced.
    return pd.DataFrame(records, columns=columns)


if load:
    hpo_t = pd.read_table(HPO_ENRICHMENT_TSV)
else:
    hpo_t = hpo_enrichment(sets, zs)
    # Same file name, separator and (default) index column as before.
    hpo_t.to_csv(HPO_ENRICHMENT_TSV, sep='\t')

hpo_t.head()

Unnamed: 0,hpoID,hpoLabel,deltaZ,pValue,nGenes
0,HP:0000707,Abnormality of the nervous system,-0.007672,0.009988,2619
1,HP:0012638,Abnormality of nervous system physiology,-0.007689,0.014391,2433
2,HP:0000152,Abnormality of head or neck,-0.004238,0.233858,2115
3,HP:0000234,Abnormality of the head,-0.004255,0.235575,2094
4,HP:0000924,Abnormality of the skeletal system,-0.006123,0.088247,2047


In [6]:
# Replace the 'pValue' column with a float-typed copy before the cells below
# sort and filter on it.
# NOTE(review): presumably needed because a freshly computed table can hold
# p-values in a non-float (object) column — confirm; for a table read back from
# the TSV this is likely a no-op.
hpo_t['pValue'] = hpo_t['pValue'].astype(float)

In [7]:
# HPO terms with at least 5 genes and a positive mean shift,
# most significant first; show the top 20.
(
    hpo_t
    .query('nGenes >= 5 and deltaZ > 0')
    .sort_values('pValue')
    .head(20)
)

Unnamed: 0,hpoID,hpoLabel,deltaZ,pValue,nGenes
1178,HP:0002034,Abnormality of the rectum,0.091344,0.005468,74
910,HP:0100006,Neoplasm of the central nervous system,0.084712,0.005929,107
452,HP:0011792,Neoplasm by histology,0.065458,0.006319,228
549,HP:0002027,Abdominal pain,0.084144,0.006739,177
1321,HP:0100834,Neoplasm of the large intestine,0.206365,0.006946,67
1612,HP:0100273,Neoplasm of the colon,0.27743,0.007167,49
904,HP:0010787,Genital neoplasm,0.133163,0.008309,102
730,HP:0008069,Neoplasm of the skin,0.064637,0.008358,140
973,HP:0100242,Sarcoma,0.084586,0.008506,103
573,HP:0000953,Hyperpigmentation of the skin,0.079611,0.008726,178


In [8]:
# HPO terms with at least 5 genes and a negative mean shift,
# most significant first; show the top 50.
(
    hpo_t
    .query('nGenes >= 5 and deltaZ < 0')
    .sort_values('pValue')
    .head(50)
)

Unnamed: 0,hpoID,hpoLabel,deltaZ,pValue,nGenes
225,HP:0000648,Optic atrophy,-0.028272,5.787128e-13,430
651,HP:0012072,Aciduria,-0.036503,3.258807e-12,155
614,HP:0001344,Absent speech,-0.034383,2.544988e-10,158
208,HP:0012795,Abnormality of the optic disc,-0.026035,3.251769e-10,467
1019,HP:0009136,Duplication involving bones of the feet,-0.044283,4.473983e-09,92
832,HP:0003355,Aminoaciduria,-0.035638,7.014999e-09,115
1048,HP:0001829,Foot polydactyly,-0.04361,1.315878e-08,88
545,HP:0010442,Polydactyly,-0.032821,4.196554e-08,182
330,HP:0005280,Depressed nasal bridge,-0.02634,4.609781e-08,303
1119,HP:0001162,Postaxial hand polydactyly,-0.035859,6.691444e-08,80


In [9]:
# Positive shifts with p < 0.01 (no gene-count filter),
# largest deltaZ first; show the top 20.
(
    hpo_t
    .query('deltaZ > 0 and pValue < 0.01')
    .sort_values('deltaZ', ascending=False)
    .head(20)
)

Unnamed: 0,hpoID,hpoLabel,deltaZ,pValue,nGenes
6280,HP:0006778,Benign genitourinary tract neoplasm,1.165369,0.004459,2
6141,HP:0006719,Benign gastrointestinal tract tumors,1.165369,0.004459,2
6137,HP:0030410,Sebaceous gland carcinoma,1.165369,0.004459,2
5318,HP:0006758,Malignant genitourinary tract tumor,1.073453,0.007271,3
1612,HP:0100273,Neoplasm of the colon,0.27743,0.007167,49
1321,HP:0100834,Neoplasm of the large intestine,0.206365,0.006946,67
904,HP:0010787,Genital neoplasm,0.133163,0.008309,102
708,HP:0007379,Neoplasm of the genitourinary tract,0.098904,0.009468,139
1093,HP:0100568,Neoplasm of the endocrine system,0.098258,0.0095,86
1178,HP:0002034,Abnormality of the rectum,0.091344,0.005468,74


In [10]:
# Negative shifts with p < 0.01 (no gene-count filter),
# most negative deltaZ first; show the top 10.
(
    hpo_t
    .query('deltaZ < 0 and pValue < 0.01')
    .sort_values('deltaZ')
    .head(10)
)

Unnamed: 0,hpoID,hpoLabel,deltaZ,pValue,nGenes
4717,HP:0006549,Unilateral primary pulmonary dysgenesis,-0.17575,0.0,4
4742,HP:0011590,Double aortic arch,-0.17575,0.0,4
5626,HP:0006699,Premature atrial contractions,-0.17575,0.0,3
6223,HP:0040266,Proximal upper limb muscle hypertrophy,-0.175749,0.0,2
6210,HP:0040265,Upper limb muscle hypertrophy,-0.175749,0.0,2
6529,HP:0040278,Prolactinoma,-0.175749,0.0,2
6682,HP:0040025,Clinodactyly of the 4th finger,-0.175749,0.0,2
6098,HP:0040217,Elevated hemoglobin A1c,-0.175749,0.0,2
5512,HP:0012398,Peripheral edema,-0.169074,0.000428,3
3965,HP:0001664,Torsade de pointes,-0.158521,9.3e-05,7


In [11]:
# Positive shifts with p < 0.01, restricted to terms with fewer than 100 genes,
# largest deltaZ first; show the top 10.
(
    hpo_t
    .query('deltaZ > 0 and pValue < 0.01 and nGenes < 100')
    .sort_values('deltaZ', ascending=False)
    .head(10)
)

Unnamed: 0,hpoID,hpoLabel,deltaZ,pValue,nGenes
6137,HP:0030410,Sebaceous gland carcinoma,1.165369,0.004459,2
6141,HP:0006719,Benign gastrointestinal tract tumors,1.165369,0.004459,2
6280,HP:0006778,Benign genitourinary tract neoplasm,1.165369,0.004459,2
5318,HP:0006758,Malignant genitourinary tract tumor,1.073453,0.007271,3
1612,HP:0100273,Neoplasm of the colon,0.27743,0.007167,49
1321,HP:0100834,Neoplasm of the large intestine,0.206365,0.006946,67
1093,HP:0100568,Neoplasm of the endocrine system,0.098258,0.0095,86
1178,HP:0002034,Abnormality of the rectum,0.091344,0.005468,74
5364,HP:0011939,3-4 finger cutaneous syndactyly,0.076933,0.000902,3
5129,HP:0011784,Thyrotoxicosis with diffuse goiter,0.036587,0.009869,2


In [13]:
# GO enrichment table with explicitly named columns; any row containing a
# missing value is dropped.
go_t = (
    pd.read_table(
        'cnv_burden_go-enrichment_20190430.tsv',
        names=['goID', 'deltaZ', 'pValue', 'goLabel', 'nGenes'],
    )
    .dropna()
)
go_t.head()

Unnamed: 0,goID,deltaZ,pValue,goLabel,nGenes
3,GO:0018916,0.391665,0.000324,nitrobenzene metabolic process,5
4,GO:0034199,0.02187,0.43941,activation of protein kinase A activity,7
5,GO:0034198,0.051697,0.225169,cellular response to amino acid starvation,50
6,GO:0034197,0.054295,0.821962,triglyceride transport,9
8,GO:0098831,0.032418,0.397019,presynaptic active zone cytoplasmic component,14


In [15]:
# GO terms with at least 5 genes and a positive mean shift,
# most significant first; show the top 10.
(
    go_t
    .query('nGenes >= 5 and deltaZ > 0')
    .sort_values('pValue')
    .head(10)
)

Unnamed: 0,goID,deltaZ,pValue,goLabel,nGenes
6229,GO:0000137,0.408569,7.143106e-30,Golgi cis cisterna,26
3072,GO:0045095,0.259425,8.197704e-30,keratin filament,95
6877,GO:0031436,1.456164,2.18188e-28,BRCA1-BARD1 complex,9
15679,GO:0005515,0.070732,5.522635000000001e-23,protein binding,78231
12187,GO:0000800,0.464194,1.476577e-21,lateral element,14
15366,GO:0031424,0.181551,1.501647e-20,keratinization,182
855,GO:0032301,1.098655,7.979989e-20,MutSalpha complex,6
5029,GO:0008194,0.352454,1.064619e-18,UDP-glycosyltransferase activity,22
17039,GO:0052697,0.471476,4.533896e-18,xenobiotic glucuronidation,15
11256,GO:0070200,0.976704,1.5781910000000002e-17,establishment of protein localization to telomere,7


In [18]:
# Positive shifts with p < 0.01, restricted to GO terms with fewer than
# 100 genes, largest deltaZ first; show the top 10.
(
    go_t
    .query('deltaZ > 0 and pValue < 0.01 and nGenes < 100')
    .sort_values('deltaZ', ascending=False)
    .head(10)
)

Unnamed: 0,goID,deltaZ,pValue,goLabel,nGenes
8108,GO:0010484,1.721401,1.3263899999999999e-48,H3 histone acetyltransferase activity,2
15169,GO:1990426,1.708483,7.182819999999999e-48,mitotic recombination-dependent replication fo...,2
6877,GO:0031436,1.456164,2.18188e-28,BRCA1-BARD1 complex,9
9336,GO:0044818,1.307222,5.650977000000001e-28,mitotic G2/M transition checkpoint,2
4539,GO:0032137,1.232403,7.028652e-25,guanine/thymine mispair binding,2
855,GO:0032301,1.098655,7.979989e-20,MutSalpha complex,6
11256,GO:0070200,0.976704,1.5781910000000002e-17,establishment of protein localization to telomere,7
15436,GO:0051572,0.877742,1.83089e-18,negative regulation of histone H3-K4 methylation,3
11344,GO:0043009,0.876516,2.060368e-18,chordate embryonic development,3
5148,GO:0033600,0.844449,2.7414740000000003e-22,negative regulation of mammary gland epithelia...,4


In [22]:
# Positive shifts with p < 0.01, restricted to GO terms with more than
# 10 genes, largest deltaZ first; show the top 10.
(
    go_t
    .query('deltaZ > 0 and pValue < 0.01 and nGenes > 10')
    .sort_values('deltaZ', ascending=False)
    .head(10)
)

Unnamed: 0,goID,deltaZ,pValue,goLabel,nGenes
1698,GO:0070531,0.837759,3.654163e-14,BRCA1-A complex,19
8578,GO:0005131,0.529813,2.687257e-13,growth hormone receptor binding,17
3157,GO:0072425,0.506647,7.990895e-07,signal transduction involved in G2 DNA damage ...,16
17039,GO:0052697,0.471476,4.533896e-18,xenobiotic glucuronidation,15
12187,GO:0000800,0.464194,1.476577e-21,lateral element,14
4589,GO:0006298,0.464057,1.623888e-06,mismatch repair,58
15548,GO:0016446,0.429991,6.515977e-14,somatic hypermutation of immunoglobulin genes,11
17038,GO:0052696,0.4238,6.560641e-11,flavonoid glucuronidation,13
12826,GO:0042954,0.42332,9.526873e-15,lipoprotein transporter activity,11
6074,GO:0070383,0.419576,2.883237e-06,DNA cytosine deamination,16


In [16]:
# GO terms with at least 5 genes and a negative mean shift,
# most significant first; show the top 10.
(
    go_t
    .query('nGenes >= 5 and deltaZ < 0')
    .sort_values('pValue')
    .head(10)
)

Unnamed: 0,goID,deltaZ,pValue,goLabel,nGenes
561,GO:0070507,-0.006137,0.019911,regulation of microtubule cytoskeleton organiz...,31
2109,GO:0060307,-0.010503,0.039781,regulation of ventricular cardiac muscle cell ...,22
17161,GO:0038191,-0.004804,0.042861,neuropilin binding,20
14471,GO:0042297,-0.051704,0.044773,vocal learning,8
105,GO:0006627,-0.0572,0.05936,protein processing involved in protein targeti...,8
902,GO:0006346,-0.011886,0.071145,methylation-dependent chromatin silencing,13
7838,GO:0051247,-0.019965,0.076872,positive regulation of protein metabolic process,12
3455,GO:0004707,-0.015312,0.077174,MAP kinase activity,46
970,GO:0031731,-0.062286,0.078628,CCR6 chemokine receptor binding,6
9233,GO:0005007,-0.038676,0.089501,fibroblast growth factor-activated receptor ac...,18


In [23]:
# GO terms with at least 5 genes and a negative mean shift,
# most negative deltaZ first; show the top 10.
(
    go_t
    .query('nGenes >= 5 and deltaZ < 0')
    .sort_values('deltaZ')
    .head(10)
)

Unnamed: 0,goID,deltaZ,pValue,goLabel,nGenes
1621,GO:0003720,-0.096531,0.447253,telomerase activity,15
11367,GO:2001027,-0.084196,0.139595,negative regulation of endothelial cell chemot...,5
13633,GO:0004514,-0.080284,0.489368,nicotinate-nucleotide diphosphorylase (carboxy...,7
7771,GO:0000703,-0.079864,0.352417,oxidized pyrimidine nucleobase lesion DNA N-gl...,5
10134,GO:0004415,-0.079809,0.185115,hyalurononglucosaminidase activity,28
10784,GO:0004348,-0.077592,0.379018,glucosylceramidase activity,5
11309,GO:0061408,-0.075166,0.337678,positive regulation of transcription from RNA ...,12
5863,GO:0086011,-0.072555,0.098663,membrane repolarization during action potential,15
970,GO:0031731,-0.062286,0.078628,CCR6 chemokine receptor binding,6
105,GO:0006627,-0.0572,0.05936,protein processing involved in protein targeti...,8
