# Fisher's tests

Unfortunately I started this notebook thinking I would only test a few gene lists, so wasn't too worried about repeating code. However, I ended up testing more gene lists than expected and so there is a lot of repeated code here. I should have made this notebook cleaner with no repeated code.

# Do the features overlap with COSMIC?

In [3]:
# read in COSMIC
import pandas as pd
cosmic = pd.read_csv('Census_allFri Oct  8 15_32_31 2021.csv')
cosmic['ensembl_id'] = cosmic['Synonyms'].str.extract(r'(ENSG\d{11})') # extract ensembl id
cosmic

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),...,Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms,ensembl_id
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,11.23,yes,,melanoma,...,,E,,oncogene,Mis,,,,"29974,A1CF,ACF,ACF64,ACF65,APOBEC1CF,ASP,ENSG0...",ENSG00000148584
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,12.10,yes,,AML,...,,L,Dom,"TSG, fusion",T,KMT2A,,,"10006,ABI-1,ABI1,E3B1,ENSG00000136754.17,Q8IZP...",ENSG00000136754
2,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25.0,9:130713946-130885683,1,Yes,34.12,yes,,"CML, ALL, T-ALL",...,,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"25,ABL,ABL1,ENSG00000097007.17,JTK7,P00519,c-A...",ENSG00000097007
3,ABL2,"c-abl oncogene 2, non-receptor tyrosine kinase",27.0,1:179099327-179229601,1,,25.20,yes,,AML,...,,L,Dom,"oncogene, fusion",T,ETV6,,,"27,ABL2,ABLL,ARG,ENSG00000143322.19,P42684",ENSG00000143322
4,ACKR3,atypical chemokine receptor 3,57007.0,2:236569641-236582358,1,Yes,37.30,yes,,lipoma,...,,M,Dom,"oncogene, fusion",T,HMGA2,,,"57007,ACKR3,CMKOR1,CXCR7,ENSG00000144476.5,GPR...",ENSG00000144476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,ZNF429,zinc finger protein 429,353088.0,19:21505564-21538078,2,,12.00,yes,,GBM,...,,O,,,Mis,,,,"353088,ENSG00000197013.9,Q86V71,ZNF429",ENSG00000197013
719,ZNF479,zinc finger protein 479,90827.0,7:57119614-57139864,2,,11.20,yes,,"lung cancer, bladder carcinoma, prostate carci...",...,,E,,,Mis,,,,"90827,ENSG00000185177.12,KR19,Q96JC4,ZNF479",ENSG00000185177
720,ZNF521,zinc finger protein 521,25925.0,18:25061926-25352152,1,,11.20,yes,,ALL,...,,L,Dom,"oncogene, fusion",T,PAX5,,,"25925,EHZF,ENSG00000198795.10,Evi3,Q96K83,ZNF521",ENSG00000198795
721,ZNRF3,zinc and ring finger 3,84133.0,22:28883592-29057487,2,,12.10,yes,,"colorectal cancer, adrenocortical carcinoma, g...",...,,E,,TSG,"N, F, Mis",,,,"84133,BK747E2.3,ENSG00000183579.15,FLJ22057,KI...",ENSG00000183579


In [None]:
# read in kidney genes
kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes_ensembl.csv', header=None)[0]
kidney_features


In [None]:
# get overlapping genes
cosmic_overlap = cosmic[cosmic['ensembl_id'].isin(kidney_features)]

# also check for genes that didn't have an ensembl id
kidney_features_symbols = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes.csv', header=None)[0]
kidney_features_symbols

(cosmic[cosmic['ensembl_id'].isna()]['Gene Symbol'].isin(kidney_features_symbols)).sum() # none of them, so don't need to add any


In [None]:
cosmic_overlap.shape[0] # 190 kidney genes are in cosmic 
cosmic_overlap.shape[0]/cosmic.shape[0] # we capture over 25% of cosmic genes
cosmic_overlap.shape[0]/len(kidney_features) # around 3.7% of kidney genes are in cosmic

In [None]:
# now lets do a fishers test

# for this, we need the kidney gene background
kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_ensembl_ids.csv', header=None)[0]
kidney_background

# now lets make the 2x2 contingency matrix

kidney_features.isin(kidney_background).mean() # check, should be 1

len(kidney_background)
len(kidney_features)

len(cosmic['ensembl_id'])

cosmic_in_bg = cosmic['ensembl_id'][cosmic['ensembl_id'].isin(kidney_background)] # need just parts of cosmic that are in background
len(cosmic_in_bg)

cosmic_overlap['ensembl_id'].isin(kidney_background).mean() # should be 1


import numpy as np
cosmic_not_kidney = cosmic_in_bg[np.logical_not(cosmic_in_bg.isin(kidney_features))]

kidney_not_cosmic = kidney_features[np.logical_not(kidney_features.isin(cosmic_in_bg))]

not_cosmic_not_kidney = kidney_background[np.logical_not((kidney_background.isin(cosmic_in_bg)) | (kidney_background.isin(kidney_features)))]


contingency = pd.DataFrame({
                'cosmic': [len(cosmic_overlap['ensembl_id']), len(cosmic_not_kidney)], 
                'not_cosmic': [len(kidney_not_cosmic), len(not_cosmic_not_kidney)] 
              })
contingency.index = ['kidney', 'not_kidney']
contingency


from scipy.stats import fisher_exact
oddsr, p = fisher_exact(contingency, alternative='greater')
print('Oddsr is ', oddsr, ' and p is ', p)


In [None]:
# checking contingency maths is right
len(cosmic_in_bg)
190+463

len(kidney_features)
190+4821

len(kidney_background)
190+4821+463+22724

So yes, the features do have a significant overlap with COSMIC with p=4.676e-13

There are 190 genes in both kidney genes and COSMIC genes

# dbEMT overlap

In [32]:
import pandas as pd
dbemt = pd.read_csv('dbEMTv2.csv')

dbemt['ensembl_id'] = dbemt['Links'].str.extract(r'Ensembl:(ENSG\d{11})')[0]
# was_na = dbemt['ensembl_id'].isna()
# dbemt[was_na]

# manually add missing ensembl ids
dbemt.loc[dbemt['GeneSymbol'] == 'DUXAP9', 'ensembl_id'] = 'ENSG00000225210'
dbemt.loc[dbemt['GeneSymbol'] == 'HAS2-AS1', 'ensembl_id'] = 'ENSG00000248690'
dbemt.loc[dbemt['GeneSymbol'] == 'MALAT1', 'ensembl_id'] = 'ENSG00000251562'
dbemt.loc[dbemt['GeneSymbol'] == 'MUC2', 'ensembl_id'] = 'ENSG00000198788'
dbemt.loc[dbemt['GeneSymbol'] == 'NANOGP8', 'ensembl_id'] = 'ENSG00000255192'
dbemt.loc[dbemt['GeneSymbol'] == 'TUG1', 'ensembl_id'] = 'ENSG00000253352'
dbemt.loc[dbemt['GeneSymbol'] == 'TUSC7', 'ensembl_id'] = 'ENSG00000243197'
dbemt.loc[dbemt['GeneSymbol'] == 'ZEB1-AS1', 'ensembl_id'] = 'ENSG00000237036'

# dbemt[was_na]
dbemt['ensembl_id'].isna().sum() # should be 0



0

In [33]:
dbemt

Unnamed: 0,GeneID,GeneSymbol,Alias,Links,Cytoband,FullName,GeneType,Unnamed: 7,Unnamed: 8,Unnamed: 9,ensembl_id
0,5243,ABCB1,ABC20|CD243|CLCS|GP170|MDR1|P-GP|PGY1,MIM:171050|HGNC:HGNC:40|Ensembl:ENSG0000008556...,7q21.12,ATP binding cassette subfamily B member 1,protein-coding,,,http://dbemt.bioinfo-minzhao.org/,ENSG00000085563
1,8714,ABCC3,ABC31|EST90757|MLP2|MOAT-D|MRP3|cMOAT2,MIM:604323|HGNC:HGNC:54|Ensembl:ENSG0000010884...,17q21.33,ATP binding cassette subfamily C member 3,protein-coding,,,,ENSG00000108846
2,9429,ABCG2,ABC15|ABCP|BCRP|BCRP1|BMDP|CD338|CDw338|EST157...,MIM:603756|HGNC:HGNC:74|Ensembl:ENSG0000011877...,4q22.1,ATP binding cassette subfamily G member 2 (Jun...,protein-coding,,,,ENSG00000118777
3,51554,ACKR4,CC-CKR-11|CCBP2|CCR-11|CCR10|CCR11|CCRL1|CCX C...,MIM:606065|HGNC:HGNC:1611|Ensembl:ENSG00000129...,3q22.1,atypical chemokine receptor 4,protein-coding,,,,ENSG00000129048
4,59,ACTA2,AAT6|ACTSA|MYMY5,MIM:102620|HGNC:HGNC:130|Ensembl:ENSG000001077...,10q23.31,"actin, alpha 2, smooth muscle, aorta",protein-coding,,,,ENSG00000107796
...,...,...,...,...,...,...,...,...,...,...,...
1179,23613,ZMYND8,PRKCBP1|PRO2893|RACK7,MIM:615713|HGNC:HGNC:9397|Ensembl:ENSG00000101...,20q13.12,zinc finger MYND-type containing 8,protein-coding,,,,ENSG00000101040
1180,7702,ZNF143,SBF|STAF|pHZ-1,MIM:603433|HGNC:HGNC:12928|Ensembl:ENSG0000016...,11p15.4,zinc finger protein 143,protein-coding,,,,ENSG00000166478
1181,7764,ZNF217,ZABC1,MIM:602967|HGNC:HGNC:13009|Ensembl:ENSG0000017...,20q13.2,zinc finger protein 217,protein-coding,,,,ENSG00000171940
1182,155061,ZNF746,PARIS,MIM:613914|HGNC:HGNC:21948|Ensembl:ENSG0000018...,7q36.1,zinc finger protein 746,protein-coding,,,,ENSG00000181220


In [34]:
# read in kidney genes
kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes_ensembl.csv', header=None)[0]
kidney_features

0       ENSG00000103479
1       ENSG00000111364
2       ENSG00000128011
3       ENSG00000169962
4       ENSG00000117477
             ...       
5006    ENSG00000275888
5007    ENSG00000249628
5008    ENSG00000173548
5009    ENSG00000177971
5010    ENSG00000179284
Name: 0, Length: 5011, dtype: object

In [35]:
# get overlapping genes
dbemt_overlap = dbemt[dbemt['ensembl_id'].isin(kidney_features)]
dbemt_overlap

Unnamed: 0,GeneID,GeneSymbol,Alias,Links,Cytoband,FullName,GeneType,Unnamed: 7,Unnamed: 8,Unnamed: 9,ensembl_id
7,81,ACTN4,ACTININ-4|FSGS|FSGS1,MIM:604638|HGNC:HGNC:166|Ensembl:ENSG000001304...,19q13.2,actinin alpha 4,protein-coding,,,,ENSG00000130402
13,133,ADM,AM|PAMP,MIM:103275|HGNC:HGNC:259|Ensembl:ENSG000001489...,11p15.4,adrenomedullin,protein-coding,,,,ENSG00000148926
21,84962,AJUBA,JUB,MIM:609066|HGNC:HGNC:20250|Ensembl:ENSG0000012...,14q11.2,ajuba LIM protein,protein-coding,,,,ENSG00000129474
28,238,ALK,CD246|NBLST3,MIM:105590|HGNC:HGNC:427|Ensembl:ENSG000001710...,2p23.2-p23.1,ALK receptor tyrosine kinase,protein-coding,,,,ENSG00000171094
31,283,ANG,ALS9|HEL168|RAA1|RNASE4|RNASE5,MIM:105850|HGNC:HGNC:483|Ensembl:ENSG000002142...,14q11.2,angiogenin,protein-coding,,,,ENSG00000214274
...,...,...,...,...,...,...,...,...,...,...,...
1165,10413,YAP1,COB1|YAP|YAP2|YAP65|YKI,MIM:606608|HGNC:HGNC:16262|Ensembl:ENSG0000013...,11q22.1,Yes associated protein 1,protein-coding,,,,ENSG00000137693
1168,7532,YWHAG,14-3-3GAMMA|PPP1R170,MIM:605356|HGNC:HGNC:12852|Ensembl:ENSG0000017...,7q11.23,tyrosine 3-monooxygenase/tryptophan 5-monooxyg...,protein-coding,,,,ENSG00000170027
1174,9839,ZEB2,HSPC082|SIP-1|SIP1|SMADIP1|ZFHX1B,MIM:605802|HGNC:HGNC:14881|Ensembl:ENSG0000016...,2q22.3,zinc finger E-box binding homeobox 2,protein-coding,,,,ENSG00000169554
1175,100303491,ZEB2-AS1,ZEB2-AS|ZEB2AS|ZEB2NAT,HGNC:HGNC:37149|Ensembl:ENSG00000238057,2q22.3,ZEB2 antisense RNA 1,ncRNA,,,,ENSG00000238057


In [40]:
dbemt_overlap
dbemt_overlap.shape[0] # 294 kidney genes are in dbemt
dbemt_overlap.shape[0]/dbemt.shape[0] # we capture around 25% of dbemt genes
dbemt_overlap.shape[0]/len(kidney_features) # around 5-6% of kidney genes are in dbemt

Unnamed: 0,GeneID,GeneSymbol,Alias,Links,Cytoband,FullName,GeneType,Unnamed: 7,Unnamed: 8,Unnamed: 9,ensembl_id
7,81,ACTN4,ACTININ-4|FSGS|FSGS1,MIM:604638|HGNC:HGNC:166|Ensembl:ENSG000001304...,19q13.2,actinin alpha 4,protein-coding,,,,ENSG00000130402
13,133,ADM,AM|PAMP,MIM:103275|HGNC:HGNC:259|Ensembl:ENSG000001489...,11p15.4,adrenomedullin,protein-coding,,,,ENSG00000148926
21,84962,AJUBA,JUB,MIM:609066|HGNC:HGNC:20250|Ensembl:ENSG0000012...,14q11.2,ajuba LIM protein,protein-coding,,,,ENSG00000129474
28,238,ALK,CD246|NBLST3,MIM:105590|HGNC:HGNC:427|Ensembl:ENSG000001710...,2p23.2-p23.1,ALK receptor tyrosine kinase,protein-coding,,,,ENSG00000171094
31,283,ANG,ALS9|HEL168|RAA1|RNASE4|RNASE5,MIM:105850|HGNC:HGNC:483|Ensembl:ENSG000002142...,14q11.2,angiogenin,protein-coding,,,,ENSG00000214274
...,...,...,...,...,...,...,...,...,...,...,...
1165,10413,YAP1,COB1|YAP|YAP2|YAP65|YKI,MIM:606608|HGNC:HGNC:16262|Ensembl:ENSG0000013...,11q22.1,Yes associated protein 1,protein-coding,,,,ENSG00000137693
1168,7532,YWHAG,14-3-3GAMMA|PPP1R170,MIM:605356|HGNC:HGNC:12852|Ensembl:ENSG0000017...,7q11.23,tyrosine 3-monooxygenase/tryptophan 5-monooxyg...,protein-coding,,,,ENSG00000170027
1174,9839,ZEB2,HSPC082|SIP-1|SIP1|SMADIP1|ZFHX1B,MIM:605802|HGNC:HGNC:14881|Ensembl:ENSG0000016...,2q22.3,zinc finger E-box binding homeobox 2,protein-coding,,,,ENSG00000169554
1175,100303491,ZEB2-AS1,ZEB2-AS|ZEB2AS|ZEB2NAT,HGNC:HGNC:37149|Ensembl:ENSG00000238057,2q22.3,ZEB2 antisense RNA 1,ncRNA,,,,ENSG00000238057


294

0.2483108108108108

0.058670923967272

In [43]:
# now lets do a fishers test

# for this, we need the kidney gene background
kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_ensembl_ids.csv', header=None)[0]
kidney_background


# now lets make the 2x2 contingency matrix
kidney_features.isin(kidney_background).mean() # check, should be 1

len(kidney_background)
len(kidney_features)

len(dbemt['ensembl_id'])

dbemt_in_bg = dbemt['ensembl_id'][dbemt['ensembl_id'].isin(kidney_background)] # need just parts of cosmic that are in background
len(dbemt_in_bg)

dbemt_overlap['ensembl_id'].isin(kidney_background).mean() # should be 1


import numpy as np
dbemt_not_kidney = dbemt_in_bg[np.logical_not(dbemt_in_bg.isin(kidney_features))]

kidney_not_dbemt = kidney_features[np.logical_not(kidney_features.isin(dbemt_in_bg))]

not_dbemt_not_kidney = kidney_background[np.logical_not((kidney_background.isin(dbemt_in_bg)) | (kidney_background.isin(kidney_features)))]


contingency = pd.DataFrame({
                'dbemt': [len(dbemt_overlap['ensembl_id']), len(dbemt_not_kidney)], 
                'not_dbemt': [len(kidney_not_dbemt), len(not_dbemt_not_kidney)] 
              })
contingency.index = ['kidney', 'not_kidney']
contingency


from scipy.stats import fisher_exact
oddsr, p = fisher_exact(contingency, alternative='greater')
print('Oddsr is ', oddsr, ' and p is ', p)


0        ENSG00000103479
1        ENSG00000168405
2        ENSG00000079691
3        ENSG00000169714
4        ENSG00000111364
              ...       
28193    ENSG00000284585
28194    ENSG00000260121
28195    ENSG00000287982
28196    ENSG00000256695
28197    ENSG00000213065
Name: 0, Length: 28198, dtype: object

1.0

28198

5011

1184

1055

1.0

Unnamed: 0,dbemt,not_dbemt
kidney,294,4717
not_kidney,761,22426


Oddsr is  1.8367439381753643  and p is  1.0319735527032817e-16


In [44]:
# checking contingency maths is right
len(dbemt_in_bg)
294+761

len(kidney_features)
294+4717

len(kidney_background)
294+4717+761+22426

1055

1055

5011

5011

28198

28198

### HCMDB overlap


In [10]:
# read in HCMDB
import pandas as pd
hcmdb = pd.read_csv('Metastatic Genes from HCMDB - Sheet1.csv')
hcmdb = hcmdb['gene'].drop_duplicates().str.upper()
len(hcmdb)
list(hcmdb)


2240

['AAMP',
 'ABCA2',
 'ABCB1',
 'ABCC5',
 'ABCG2',
 'ABI1',
 'ABI2',
 'ABL1',
 'ABL2',
 'ACE',
 'ACE2',
 'ACKR1',
 'ACKR2',
 'ACKR3',
 'ACOT8',
 'ACP5',
 'ACTA2',
 'ACTB',
 'ACTL6A',
 'ACTN4',
 'ACTR2',
 'ACTR3',
 'ACVR1C',
 'ADAM10',
 'ADAM12',
 'ADAM15',
 'ADAM17',
 'ADAM23',
 'ADAM28',
 'ADAM8',
 'ADAM9',
 'ADAMTS1',
 'ADAMTS13',
 'ADAMTS18',
 'ADGRE5',
 'ADGRF5',
 'ADGRG1',
 'ADGRL3',
 'ADIPOQ',
 'ADM',
 'ADORA2B',
 'ADRB2',
 'AFAP1-AS1',
 'AFAP1L1',
 'AFAP1L2',
 'AFDN',
 'AFP',
 'AGER',
 'AGO1',
 'AGO2',
 'AGO4',
 'AGR2',
 'AGR3',
 'AGT',
 'AGTR1',
 'AGTR2',
 'AHR',
 'AIFM1',
 'AJUBA',
 'AKAP12',
 'AKR1C2',
 'AKR1C3',
 'AKT1',
 'AKT2',
 'AKT3',
 'ALCAM',
 'ALDH1A1',
 'ALDH1A3',
 'ALDH3A1',
 'ALDH7A1',
 'ALDOA',
 'ALK',
 'ALOX15',
 'ALOX5',
 'ALPL',
 'ALPP',
 'ALYREF',
 'AMACR',
 'AMBP',
 'AMFR',
 'AMOT',
 'ANGPT1',
 'ANGPT2',
 'ANGPTL2',
 'ANGPTL4',
 'ANGPTL6',
 'ANKRD12',
 'ANO1',
 'ANO9',
 'ANOS1',
 'ANTXR1',
 'ANXA1',
 'ANXA2',
 'ANXA2R',
 'ANXA3',
 'ANXA4',
 'ANXA7',
 'APAF1',
 

In [11]:
# read in kidney genes
kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes.csv', header=None)[0]
kidney_features = kidney_features.str.upper()
kidney_features


0            RBL2
1           DDX55
2           LRFN1
3          TAS1R3
4         CCDC181
          ...    
4211       CSNK1D
4212    LINC00942
4213        SNX33
4214         IMP3
4215        DAND5
Name: 0, Length: 4216, dtype: object

In [12]:
# get overlapping genes

hcmdb_overlap = hcmdb[hcmdb.isin(kidney_features)]
hcmdb_overlap

len(hcmdb_overlap)
len(hcmdb_overlap)/len(hcmdb) # 21% of hcmdb genes
len(hcmdb_overlap)/len(kidney_features) # 11% of kidney genes


11        ABL1
18       ACKR2
39        ACTB
41       ACTN4
47      ACVR1C
         ...  
6581    ZDHHC2
6601      ZEB2
6614     ZFHX3
6619    ZNF217
6626    ZNF703
Name: gene, Length: 478, dtype: object

478

0.21339285714285713

0.11337760910815939

In [13]:
# now lets do a fishers test

# for this, we need the kidney gene background
kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_symbols.csv', header=None)[0]
kidney_background = kidney_background.str.upper()
kidney_background


# now lets make the 2x2 contingency matrix

kidney_features.isin(kidney_background).mean() # check, should be 1

len(kidney_background)
len(kidney_features)

len(hcmdb)

hcmdb_in_bg = hcmdb[hcmdb.isin(kidney_background)] # need just parts of cosmic that are in background
len(hcmdb_in_bg)

hcmdb_overlap.isin(kidney_background).mean() # should be 1


import numpy as np
hcmdb_not_kidney = hcmdb_in_bg[np.logical_not(hcmdb_in_bg.isin(kidney_features))]

kidney_not_hcmdb = kidney_features[np.logical_not(kidney_features.isin(hcmdb_in_bg))]

not_hcmdb_not_kidney = kidney_background[np.logical_not((kidney_background.isin(hcmdb_in_bg)) | (kidney_background.isin(kidney_features)))]


contingency = pd.DataFrame({
                'hcmdb': [len(hcmdb_overlap), len(hcmdb_not_kidney)], 
                'not_hcmdb': [len(kidney_not_hcmdb), len(not_hcmdb_not_kidney)] 
              })
contingency.index = ['kidney', 'not_kidney']
contingency

from scipy.stats import fisher_exact
oddsr, p = fisher_exact(contingency, alternative='greater')
print('Oddsr is ', oddsr, ' and p is ', p)



0             RBL2
1            CMAHP
2          CARMIL1
3             CNBP
4            DDX55
           ...    
22005      CCDC200
22006    LINC00463
22007      SUZ12P1
22008    LINC02547
22009      MIR4722
Name: 0, Length: 22010, dtype: object

1.0

22010

4216

2240

1774

1.0

Unnamed: 0,hcmdb,not_hcmdb
kidney,478,3738
not_kidney,1296,16498


Oddsr is  1.627851924512349  and p is  4.141725963571837e-17


In [14]:
# checking contingency maths is right
len(hcmdb_in_bg)
478+1296

len(kidney_features)
478+3738

len(kidney_background)
478+3738+1296+16498

1774

1774

4216

4216

22010

22010

### TFcheckpoint overlap


In [13]:
# read in TFcheckpoint
import pandas as pd
tfcheck = pd.read_csv('TFCheckpoint_download_180515.txt', sep='\t')
tfcheck = tfcheck['gene_symbol'].drop_duplicates().str.upper()
len(tfcheck)
list(tfcheck)


3479

['ELF5',
 'TCF7L2',
 'PLAG1',
 'NKX2-5',
 'NRF1',
 'GATA2',
 'KLF4',
 'SPI1',
 'ESR2',
 'FOS',
 'VDR',
 'HNF1A',
 'EGR1',
 'NFATC2',
 'NHLH1',
 'SRY',
 'FOSL1',
 'FOXA1',
 'CEBPA',
 'RELA',
 'PAX5',
 'NOBOX',
 'GATA1',
 'CEBPB',
 'HIF1A',
 'CRX',
 'CREB1',
 'FLI1',
 'SOX5',
 'FOXC1',
 'EBF1',
 'JUN',
 'NR4A2',
 'NFYA',
 'ELF1',
 'SMAD4',
 'PAX2',
 'RUNX2',
 'FOXP2',
 'SRF',
 'SMAD2',
 'ESR1',
 'GABPA',
 'EHF',
 'BATF',
 'MEIS1',
 'TP63',
 'SOX9',
 'FOXP1',
 'INSM1',
 'MAF',
 'MYOG',
 'FOXA2',
 'NR3C1',
 'KLF5',
 'MYC',
 'FOXH1',
 'TLX1',
 'SOX2',
 'NFE2L2',
 'NKX3-2',
 'HNF4A',
 'SOX10',
 'ARID3A',
 'SP1',
 'USF2',
 'REST',
 'SOX17',
 'CTCF',
 'ATOH1',
 'HINFP',
 'PDX1',
 'TCF3',
 'FOXD1',
 'STAT1',
 'PAX6',
 'EGR2',
 'POU5F1',
 'PBX1',
 'IRF1',
 'NR5A2',
 'BHLHE40',
 'MYB',
 'MYOD1',
 'JUNB',
 'E2F4',
 'BCL6',
 'TP53',
 'SREBF1',
 'HAND1',
 'MEF2C',
 'TFAP2A',
 'PPARG',
 'NKX3-1',
 'HOXA5',
 'BACH1',
 'PAX4',
 'TFAP2C',
 'FOXO1',
 'NR1H2',
 'FOXO3',
 'MEF2A',
 'YY1',
 'STAT5B',
 'HNF1

In [14]:
# read in kidney genes
kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes.csv', header=None)[0]
kidney_features = kidney_features.str.upper().drop_duplicates()
kidney_features


0            RBL2
1           DDX55
2           LRFN1
3          TAS1R3
4         CCDC181
          ...    
4211       CSNK1D
4212    LINC00942
4213        SNX33
4214         IMP3
4215        DAND5
Name: 0, Length: 4216, dtype: object

In [15]:
# get overlapping genes

tfcheck_overlap = tfcheck[tfcheck.isin(kidney_features)]
tfcheck_overlap

len(tfcheck_overlap)
len(tfcheck_overlap)/len(tfcheck) # 20% of tfcheck genes
len(tfcheck_overlap)/len(kidney_features) # 16% of kidney genes


0         ELF5
1       TCF7L2
5        GATA2
8         ESR2
14       NHLH1
         ...  
3462     ZMYM4
3463     ZMYM5
3469    ZSWIM4
3473      ZHX1
3476      ZXDC
Name: gene_symbol, Length: 701, dtype: object

701

0.20149468237999424

0.16627134724857684

In [16]:
# now lets do a fishers test

# for this, we need the kidney gene background
kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_symbols.csv', header=None)[0]
kidney_background = kidney_background.str.upper()
kidney_background


# now lets make the 2x2 contingency matrix

kidney_features.isin(kidney_background).mean() # check, should be 1

len(kidney_background)
len(kidney_features)

len(tfcheck)

tfcheck_in_bg = tfcheck[tfcheck.isin(kidney_background)] # need just parts of cosmic that are in background
len(tfcheck_in_bg)

tfcheck_overlap.isin(kidney_background).mean() # should be 1


import numpy as np
tfcheck_not_kidney = tfcheck_in_bg[np.logical_not(tfcheck_in_bg.isin(kidney_features))]

kidney_not_tfcheck = kidney_features[np.logical_not(kidney_features.isin(tfcheck_in_bg))]

not_tfcheck_not_kidney = kidney_background[np.logical_not((kidney_background.isin(tfcheck_in_bg)) | (kidney_background.isin(kidney_features)))]


contingency = pd.DataFrame({
                'tfcheck': [len(tfcheck_overlap), len(tfcheck_not_kidney)], 
                'not_tfcheck': [len(kidney_not_tfcheck), len(not_tfcheck_not_kidney)] 
              })
contingency.index = ['kidney', 'not_kidney']
contingency

from scipy.stats import fisher_exact
oddsr, p = fisher_exact(contingency, alternative='greater')
print('Oddsr is ', oddsr, ' and p is ', p)



0             RBL2
1            CMAHP
2          CARMIL1
3             CNBP
4            DDX55
           ...    
22005      CCDC200
22006    LINC00463
22007      SUZ12P1
22008    LINC02547
22009      MIR4722
Name: 0, Length: 22010, dtype: object

1.0

22010

4216

3479

2840

1.0

Unnamed: 0,tfcheck,not_tfcheck
kidney,701,3515
not_kidney,2139,15655


Oddsr is  1.4596037685282537  and p is  3.583681364893711e-15


In [17]:
# checking contingency maths is right
len(tfcheck_in_bg)
701+2139

len(kidney_features)
701+3515

len(kidney_background)
701+2139+3515+15655

2840

2840

4216

4216

22010

22010

### Turajlic overlap


In [60]:
# read in Turajlic
import pandas as pd
turajlic = pd.read_csv('Turajlic 2018 list RCC driver genes.csv')
turajlic = turajlic['TARGET'].append(turajlic['ALLIAS']).append(turajlic['ALLIAS.1']).dropna()
turajlic = turajlic.str.split(';').explode().str.strip() # when multiple aliases are on the same line, split up
turajlic = turajlic.str.upper().drop_duplicates()
list(turajlic)

['VHL',
 'PBRM1',
 'SETD2',
 'BAP1',
 'KDM5C',
 'MTOR',
 'CSMD3',
 'TP53',
 'PTEN',
 'PIK3CA',
 'DNHD1',
 'ATM',
 'ARID1A',
 'FMN2',
 'SMARCA4',
 'TET2',
 'FBN2',
 'NAV3',
 'KEAP1',
 'M6PR',
 'MET',
 'KMT2C',
 'TCEB1',
 'TNF',
 'RHEB',
 'TSC1',
 'LRRK2',
 'TSC2',
 'STK11',
 'NF2',
 'KMT2D',
 'TNFRSF14',
 'MACF1',
 'COL11A1',
 'OLFML3',
 'VTCN1',
 'RFX5',
 'TCHH',
 'SPTA1',
 'TNFSF18',
 'TNFSF4',
 'SMG7',
 'HMCN1',
 'PTGS2',
 'IL10',
 'USH2A',
 'OBSCN',
 'SPTBN1',
 'VWA3B',
 'LRP2',
 'ADAM23',
 'OBSL1',
 'SPHKAP',
 'KIAA2018',
 'CD80',
 'CD86',
 'SI',
 'WDFY3',
 'FAT1',
 'TRIO',
 'FYB1',
 'MAST4',
 'FAT2',
 'FLT4',
 'ZNF451',
 'MDN1',
 'CCNC',
 'SYNE1',
 'RADIL',
 'IL6',
 'ABCA13',
 'PCLO',
 'AKAP9',
 'TRRAP',
 'MUC17',
 'ZNF800',
 'MGAM',
 'ZFHX4',
 'PKHD1L1',
 'PLEC',
 'CD274',
 'PDCD1LG2',
 'FREM1',
 'CDKN2B',
 'TRPM3',
 'ROR2',
 'TGFBR1',
 'ABCA1',
 'CUBN',
 'PLCE1',
 'MUC5B',
 'PAMR1',
 'TENM4',
 'FAT3',
 'CEP295',
 'NFRKB',
 'CD27',
 'CD4',
 'YBX3',
 'SMARCC2',
 'LRP1',
 'GCN1L1',

In [61]:
# read in kidney genes
kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes.csv', header=None)[0]
kidney_features = kidney_features.str.upper()
kidney_features


0            RBL2
1           DDX55
2           LRFN1
3          TAS1R3
4         CCDC181
          ...    
4211       CSNK1D
4212    LINC00942
4213        SNX33
4214         IMP3
4215        DAND5
Name: 0, Length: 4216, dtype: object

In [68]:
# get overlapping genes

turajlic_overlap = turajlic[turajlic.isin(kidney_features)]
turajlic_overlap

len(turajlic_overlap)
len(turajlic_overlap)/len(turajlic) # 25% of turajlic genes
len(turajlic_overlap)/len(kidney_features) # 1.3% of kidney genes


5         MTOR
7         TP53
10       DNHD1
14     SMARCA4
16        FBN2
20         MET
24        RHEB
27        TSC2
28       STK11
30       KMT2D
32       MACF1
33     COL11A1
37        TCHH
41        SMG7
43       PTGS2
46       OBSCN
47      SPTBN1
51       OBSL1
52      SPHKAP
54        CD80
55        CD86
58        FAT1
61       MAST4
63        FLT4
68       RADIL
73       TRRAP
79        PLEC
85        ROR2
90       MUC5B
92       TENM4
100       LRP1
109      CNOT1
110     ZNF469
113      STAT3
116     DNAH17
121      TSHZ3
122      TGFB1
126      MEGF6
129       TPM3
133        ALK
134       EML4
136      LRP1B
144      FGFR3
150        NNT
154     HIVEP1
155       CUL7
158        EZR
165     PITRM1
170     KCNMA1
180      AP5M1
182      SYNE2
184    TP53BP1
187     AKAP13
188       VMO1
190        NF1
191      ERBB2
194       CLTC
198       RYR1
199     SPTBN4
dtype: object

59

0.2565217391304348

0.013994307400379507

In [63]:
# now lets do a fishers test

# for this, we need the kidney gene background
kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_symbols.csv', header=None)[0]
kidney_background = kidney_background.str.upper()
kidney_background


# now lets make the 2x2 contingency matrix

kidney_features.isin(kidney_background).mean() # check, should be 1

len(kidney_background)
len(kidney_features)

len(turajlic)

turajlic_in_bg = turajlic[turajlic.isin(kidney_background)] # need just parts of cosmic that are in background
len(turajlic_in_bg)

turajlic_overlap.isin(kidney_background).mean() # should be 1


import numpy as np
turajlic_not_kidney = turajlic_in_bg[np.logical_not(turajlic_in_bg.isin(kidney_features))]

kidney_not_turajlic = kidney_features[np.logical_not(kidney_features.isin(turajlic_in_bg))]

not_turajlic_not_kidney = kidney_background[np.logical_not((kidney_background.isin(turajlic_in_bg)) | (kidney_background.isin(kidney_features)))]


contingency = pd.DataFrame({
                'turajlic': [len(turajlic_overlap), len(turajlic_not_kidney)], 
                'not_turajlic': [len(kidney_not_turajlic), len(not_turajlic_not_kidney)] 
              })
contingency.index = ['kidney', 'not_kidney']
contingency

from scipy.stats import fisher_exact
oddsr, p = fisher_exact(contingency, alternative='greater')
print('Oddsr is ', oddsr, ' and p is ', p)



0             RBL2
1            CMAHP
2          CARMIL1
3             CNBP
4            DDX55
           ...    
22005      CCDC200
22006    LINC00463
22007      SUZ12P1
22008    LINC02547
22009      MIR4722
Name: 0, Length: 22010, dtype: object

1.0

22010

4216

230

194

1.0

Unnamed: 0,turajlic,not_turajlic
kidney,59,4157
not_kidney,135,17659


Oddsr is  1.85654006183234  and p is  0.00010658052305057398


In [65]:
# checking contingency maths is right
len(turajlic_in_bg)
59+135

len(kidney_features)
59+4157

len(kidney_background)
59+4157+135+17659

194

194

4216

4216

22010

22010

In [None]:
### Turajlic overlap


# read in Turajlic
import pandas as pd
turajlic = pd.read_csv('Turajlic 2018 list RCC driver genes.csv')
turajlic = turajlic['TARGET'].append(turajlic['ALLIAS']).append(turajlic['ALLIAS.1']).dropna()
turajlic = turajlic.str.split(';').explode().str.strip() # when multiple aliases are on the same line, split up
turajlic = turajlic.str.upper().drop_duplicates()
list(turajlic)

# read in kidney genes
kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes.csv', header=None)[0]
kidney_features = kidney_features.str.upper()
kidney_features


# get overlapping genes

turajlic_overlap = turajlic[turajlic.isin(kidney_features)]
turajlic_overlap

len(turajlic_overlap)
len(turajlic_overlap)/len(turajlic) # 25% of turajlic genes
len(turajlic_overlap)/len(kidney_features) # 1.3% of kidney genes




# now lets do a fishers test

# for this, we need the kidney gene background
kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_symbols.csv', header=None)[0]
kidney_background = kidney_background.str.upper()
kidney_background


# now lets make the 2x2 contingency matrix

kidney_features.isin(kidney_background).mean() # check, should be 1

len(kidney_background)
len(kidney_features)

len(turajlic)

turajlic_in_bg = turajlic[turajlic.isin(kidney_background)] # need just parts of cosmic that are in background
len(turajlic_in_bg)

turajlic_overlap.isin(kidney_background).mean() # should be 1


import numpy as np
turajlic_not_kidney = turajlic_in_bg[np.logical_not(turajlic_in_bg.isin(kidney_features))]

kidney_not_turajlic = kidney_features[np.logical_not(kidney_features.isin(turajlic_in_bg))]

not_turajlic_not_kidney = kidney_background[np.logical_not((kidney_background.isin(turajlic_in_bg)) | (kidney_background.isin(kidney_features)))]


contingency = pd.DataFrame({
                'turajlic': [len(turajlic_overlap), len(turajlic_not_kidney)], 
                'not_turajlic': [len(kidney_not_turajlic), len(not_turajlic_not_kidney)] 
              })
contingency.index = ['kidney', 'not_kidney']
contingency

from scipy.stats import fisher_exact
oddsr, p = fisher_exact(contingency, alternative='greater')
print('Oddsr is ', oddsr, ' and p is ', p)



# checking contingency maths is right
len(turajlic_in_bg)
59+135

len(kidney_features)
59+4157

len(kidney_background)
59+4157+135+17659

# Testing out four more gene lists

In [49]:
# read in genelist
import pandas as pd
def read_genelist(path):    

    if 'EpiFactor DB.csv' in path:
        genelist = pd.read_csv(path)
        genelist = genelist['HGNC_symbol'].drop_duplicates().str.upper()
        
    if 'KEGG_hsa05211' in path:
        genelist = pd.read_csv(path, header=None)
        genelist = genelist[0].drop_duplicates().str.upper()
        
    if 'pRCC -harmonizome' in path:
        genelist = pd.read_csv(path)
        genelist = genelist['Symbol'].drop_duplicates().str.upper()
        
    if 'RCC harmonizome' in path:
        genelist = pd.read_csv(path)
        genelist = genelist['symbol'].drop_duplicates().str.upper()
        
    print(len(genelist))
    print(list(genelist))
    return(genelist)
    
genelist = read_genelist('Gene Lists - RCC harmonizome -diseases db.csv')

829
['IL2', 'VHL', 'MTOR', 'TFE3', 'CA9', 'IFNA1', 'PRCC', 'FLCN', 'FH', 'KRT7', 'ZNF135', 'DIRC2', 'VEGFA', 'PDPN', 'HIF1A', 'AMACR', 'SRMS', 'CDH6', 'TFEB', 'PBRM1', 'IFNA2', 'PAX8', 'CDH16', 'PAX2', 'TP53', 'MET', 'TSC2', 'TCEB1', 'CUL2', 'TCEB2', 'TPX2', 'EPAS1', 'SETD2', 'CD4', 'CDH1', 'AKT1', 'FLT1', 'EGFR', 'CD34', 'CRP', 'CSF2', 'CD70', 'NCAM1', 'PTEN', 'TPBG', 'MLANA', 'FHIT', 'PCNA', 'IL6', 'ASPSCR1', 'CCND1', 'FGF2', 'HLA-A', 'ABCB1', 'RBX1', 'CD6', 'KRT20', 'S100A1', 'HLA-B', 'TNFSF10', 'RNF139', 'B3GAT1', 'SMUG1', 'ALPP', 'ARPP21', 'ALPPL2', 'ALPL', 'MMP2', 'TNF', 'KDR', 'CLDN8', 'IFNA10', 'EGLN3', 'IL21', 'SDHB', 'PDGFRB', 'HBA1', 'WT1', 'MITF', 'HLA-C', 'PVALB', 'MIR210', 'HBA2', 'KDM5C', 'PARK2', 'TYMP', 'BIRC5', 'RASSF1', 'PTHLH', 'MMP9', 'CA12', 'SLC2A1', 'ZHX2', 'CD44', 'NNMT', 'COL18A1', 'PLIN2', 'SCHIP1', 'IFNA21', 'TGFB1', 'ENO2', 'IL4', 'KANK1', 'HAVCR1', 'ERBB2', 'JADE1', 'IFNA17', 'MUC1', 'LRCH4', 'FLT3', 'CD80', 'CTLA4', 'CD83', 'BAP1', 'SPAG4', 'B2M', 'FASLG'

In [16]:
# read in kidney genes
def read_in_kidney():
    kidney_features = pd.read_csv('../figs_xgboost/xgboost_models/features_mapped_to_genes.csv', header=None)[0]
    kidney_features = kidney_features.str.upper()
    return(kidney_features)

kidney_features = read_in_kidney()
kidney_features


0            RBL2
1           DDX55
2           LRFN1
3          TAS1R3
4         CCDC181
          ...    
4211       CSNK1D
4212    LINC00942
4213        SNX33
4214         IMP3
4215        DAND5
Name: 0, Length: 4216, dtype: object

In [17]:
# get overlapping genes
def get_overlap(genelist, kidney_features):
    genelist_overlap = genelist[genelist.isin(kidney_features)]
    print(genelist_overlap)

    print(len(genelist_overlap))
    print(len(genelist_overlap)/len(genelist))
    print(len(genelist_overlap)/len(kidney_features))
    return(genelist_overlap)

genelist_overlap = get_overlap(genelist, kidney_features)

2        ACTB
9        ADNP
11      AICDA
15     ANP32A
17     ANP32E
        ...  
710    ZNF217
711    ZNF516
712    ZNF532
715    ZNF687
718    ZRANB3
Name: HGNC_symbol, Length: 147, dtype: object
147
0.20416666666666666
0.03486717267552182


In [30]:
# now lets do a fishers test

def fishers(genelist, genelist_overlap, kidney_features):
    # for this, we need the kidney gene background
    kidney_background = pd.read_csv('../figs_xgboost/xgboost_models/input_features_for_background_gene_list_symbols.csv', header=None)[0]
    kidney_background = kidney_background.str.upper()

    # now lets make the 2x2 contingency matrix
    print('Should be 1: ', kidney_features.isin(kidney_background).mean()) # check, should be 1

    genelist_in_bg = genelist[genelist.isin(kidney_background)] # need just parts of cosmic that are in background
    len(genelist_in_bg)

    genelist_overlap.isin(kidney_background).mean() # should be 1


    import numpy as np
    genelist_not_kidney = genelist_in_bg[np.logical_not(genelist_in_bg.isin(kidney_features))]

    kidney_not_genelist = kidney_features[np.logical_not(kidney_features.isin(genelist_in_bg))]

    not_genelist_not_kidney = kidney_background[np.logical_not((kidney_background.isin(genelist_in_bg)) | (kidney_background.isin(kidney_features)))]


    contingency = pd.DataFrame({
                    'genelist': [len(genelist_overlap), len(genelist_not_kidney)], 
                    'not_genelist': [len(kidney_not_genelist), len(not_genelist_not_kidney)] 
                  })
    contingency.index = ['kidney', 'not_kidney']
    print(contingency)

    from scipy.stats import fisher_exact
    oddsr, p = fisher_exact(contingency, alternative='greater')
    print('Oddsr is ', oddsr, ' and p is ', p)

    return contingency, kidney_background, genelist_in_bg

contingency, kidney_background, genelist_in_bg = fishers(genelist, genelist_overlap, kidney_features)



Should be 1:  1.0
            genelist  not_genelist
kidney           147          4069
not_kidney       496         17298
Oddsr is  1.2599225854018186  and p is  0.00985775997962868


In [33]:
# check the contingency matrix
# checking contingency maths is right
def check_conting(contingency, genelist_in_bg, kidney_features, kidney_background):
    genelist_in_bg = genelist[genelist.isin(kidney_background)]
    assert len(genelist_in_bg) == contingency['genelist'].sum()

    assert len(kidney_features) == contingency.loc['kidney'].sum()

    assert len(kidney_background) == contingency.sum().sum()
    
check_conting(contingency, genelist_in_bg, kidney_features, kidney_background) # should not error

In [34]:
# running all gene lists through functions

genelist = read_genelist('Gene Lists - EpiFactor DB.csv')
kidney_features = read_in_kidney()
genelist_overlap = get_overlap(genelist, kidney_features)
contingency, kidney_background, genelist_in_bg = fishers(genelist, genelist_overlap, kidney_features)
check_conting(contingency, genelist_in_bg, kidney_features, kidney_background) # should not error

720
['HDGFL2', 'A1CF', 'ACTB', 'ACTL6A', 'ACTL6B', 'ACTR3B', 'ACTR5', 'ACTR6', 'ACTR8', 'ADNP', 'AEBP2', 'AICDA', 'AIRE', 'ALKBH1', 'ANKRD32', 'ANP32A', 'ANP32B', 'ANP32E', 'APBB1', 'APEX1', 'APOBEC1', 'APOBEC2', 'APOBEC3A', 'APOBEC3B', 'APOBEC3C', 'APOBEC3D', 'APOBEC3F', 'APOBEC3G', 'APOBEC3H', 'ARID1A', 'ARID1B', 'ARID2', 'ARID4A', 'ARID4B', 'ARNTL', 'ARRB1', 'ASF1A', 'ASF1B', 'ASH1L', 'ASH2L', 'ASXL1', 'ASXL2', 'ASXL3', 'ATAD2', 'ATAD2B', 'ATF2', 'ATF7IP', 'ATM', 'ATN1', 'ATR', 'ATRX', 'ATXN7', 'ATXN7L3', 'AURKA', 'AURKB', 'AURKC', 'BABAM1', 'BAHD1', 'BANP', 'BAP1', 'BARD1', 'BAZ1A', 'BAZ1B', 'BAZ2A', 'BAZ2B', 'BCOR', 'BCORL1', 'BMI1', 'BPTF', 'BRCA1', 'BRCA2', 'BRCC3', 'BRD1', 'BRD2', 'BRD3', 'BRD4', 'BRD7', 'BRD8', 'BRD9', 'BRDT', 'BRE', 'BRMS1', 'BRMS1L', 'BRPF1', 'BRPF3', 'BRWD1', 'BRWD3', 'BUB1', 'C11ORF30', 'C14ORF169', 'C17ORF49', 'CARM1', 'CBX1', 'CBX2', 'CBX3', 'CBX4', 'CBX5', 'CBX6', 'CBX7', 'CBX8', 'CCDC101', 'CDC6', 'CDC73', 'CDK1', 'CDK17', 'CDK2', 'CDK3', 'CDK5', 'CDK7

In [40]:
genelist = read_genelist('Gene Lists - KEGG_hsa05211_RCC.csv')
kidney_features = read_in_kidney()
genelist_overlap = get_overlap(genelist, kidney_features)
contingency, kidney_background, genelist_in_bg = fishers(genelist, genelist_overlap, kidney_features)
check_conting(contingency, genelist_in_bg, kidney_features, kidney_background) # should not error
# no significance with RCC KEGG pathway

69
['AKT1', 'AKT2', 'AKT3', 'ARAF', 'ARNT', 'ARNT2', 'BAD', 'BRAF', 'BUB1B-PAK6', 'CDC42', 'CDKN1A', 'CREBBP', 'CRK', 'CRKL', 'CUL2', 'EGLN1', 'EGLN2', 'EGLN3', 'ELOB', 'ELOC', 'EP300', 'EPAS1', 'ETS1', 'FH', 'FLCN', 'GAB1', 'GRB2', 'HGF', 'HIF1A', 'HRAS', 'JUN', 'KRAS', 'MAP2K1', 'MAP2K2', 'MAPK1', 'MAPK3', 'MET', 'NRAS', 'PAK1', 'PAK2', 'PAK3', 'PAK4', 'PAK5', 'PAK6', 'PDGFB', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PRCC', 'PTPN11', 'RAC1', 'RAF1', 'RAP1A', 'RAP1B', 'RAPGEF1', 'RBX1', 'SLC2A1', 'SOS1', 'SOS2', 'TFE3', 'TGFA', 'TGFB1', 'TGFB2', 'TGFB3', 'VEGFA', 'VHL']
6        BAD
9      CDC42
11    CREBBP
21     EPAS1
22      ETS1
26      GRB2
29      HRAS
30       JUN
36       MET
38      PAK1
41      PAK4
47    PIK3CD
49    PIK3R2
50    PIK3R3
52    PTPN11
64     TGFB1
65     TGFB2
Name: 0, dtype: object
17
0.2463768115942029
0.004032258064516129
Should be 1:  1.0
            genelist  not_genelist
kidney            17          4199
not_kidney        49       

In [45]:
genelist = read_genelist('Gene Lists - pRCC -harmonizome -Diseases db.csv')
kidney_features = read_in_kidney()
genelist_overlap = get_overlap(genelist, kidney_features)
contingency, kidney_background, genelist_in_bg = fishers(genelist, genelist_overlap, kidney_features)
check_conting(contingency, genelist_in_bg, kidney_features, kidney_background) # should not error

81
['AMACR', 'PRCC', 'KRT7', 'TFE3', 'FH', 'SCHIP1', 'MET', 'FBXO47', 'CA9', 'CDH16', 'LYPD1', 'PAX8', 'TCEB3', 'FLCN', 'MAD2L2', 'WT1', 'NAPSA', 'PTEN', 'TFEB', 'ASPSCR1', 'BAMBI', 'VIM', 'TFEC', 'B3GAT1', 'NONO', 'PAX2', 'HGF', 'CDC73', 'VHL', 'C1ORF174', 'STK10', 'CLEC2A', 'CPM', 'SLC34A2', 'MUC1', 'PLIN2', 'CDH1', 'RASSF1', 'ACAT1', 'KEAP1', 'S100A1', 'PDPN', 'SEPT2', 'VCL', 'NNMT', 'ELF3', 'ZNF135', 'EGLN2', 'HIF1A', 'CUL3', 'CLDN7', 'AKR1B10', 'SLC2A1', 'BRAF', 'SEPT5', 'PVALB', 'ZNF77', 'KRT20', 'SRMS', 'CDR2', 'KRT8', 'PGR', 'UBE2S', 'MIR210', 'CXCL16', 'FUT4', 'FBXW7', 'AQP3', 'RBPJ', 'MN1', 'HAVCR1', 'RET', 'CD82', 'CALCA', 'AQP1', 'HBA1', 'MMP11', 'TP53', 'IL2', 'HBA2', 'NCAM1']
0       AMACR
6         MET
16      NAPSA
19    ASPSCR1
23     B3GAT1
25       PAX2
30      STK10
37     RASSF1
38      ACAT1
40     S100A1
45       ELF3
50      CLDN7
60       KRT8
61        PGR
62      UBE2S
64     CXCL16
69        MN1
74       AQP1
77       TP53
Name: Symbol, dtype: object
19
0.23

In [50]:
genelist = read_genelist('Gene Lists - RCC harmonizome -diseases db.csv')
kidney_features = read_in_kidney()
genelist_overlap = get_overlap(genelist, kidney_features)
contingency, kidney_background, genelist_in_bg = fishers(genelist, genelist_overlap, kidney_features)
check_conting(contingency, genelist_in_bg, kidney_features, kidney_background) # should not error

829
['IL2', 'VHL', 'MTOR', 'TFE3', 'CA9', 'IFNA1', 'PRCC', 'FLCN', 'FH', 'KRT7', 'ZNF135', 'DIRC2', 'VEGFA', 'PDPN', 'HIF1A', 'AMACR', 'SRMS', 'CDH6', 'TFEB', 'PBRM1', 'IFNA2', 'PAX8', 'CDH16', 'PAX2', 'TP53', 'MET', 'TSC2', 'TCEB1', 'CUL2', 'TCEB2', 'TPX2', 'EPAS1', 'SETD2', 'CD4', 'CDH1', 'AKT1', 'FLT1', 'EGFR', 'CD34', 'CRP', 'CSF2', 'CD70', 'NCAM1', 'PTEN', 'TPBG', 'MLANA', 'FHIT', 'PCNA', 'IL6', 'ASPSCR1', 'CCND1', 'FGF2', 'HLA-A', 'ABCB1', 'RBX1', 'CD6', 'KRT20', 'S100A1', 'HLA-B', 'TNFSF10', 'RNF139', 'B3GAT1', 'SMUG1', 'ALPP', 'ARPP21', 'ALPPL2', 'ALPL', 'MMP2', 'TNF', 'KDR', 'CLDN8', 'IFNA10', 'EGLN3', 'IL21', 'SDHB', 'PDGFRB', 'HBA1', 'WT1', 'MITF', 'HLA-C', 'PVALB', 'MIR210', 'HBA2', 'KDM5C', 'PARK2', 'TYMP', 'BIRC5', 'RASSF1', 'PTHLH', 'MMP9', 'CA12', 'SLC2A1', 'ZHX2', 'CD44', 'NNMT', 'COL18A1', 'PLIN2', 'SCHIP1', 'IFNA21', 'TGFB1', 'ENO2', 'IL4', 'KANK1', 'HAVCR1', 'ERBB2', 'JADE1', 'IFNA17', 'MUC1', 'LRCH4', 'FLT3', 'CD80', 'CTLA4', 'CD83', 'BAP1', 'SPAG4', 'B2M', 'FASLG'