In [2]:
import argparse

import enrichment_stats
import gaf_parser
import genelist_importer
import obo_tools

In [3]:

background = genelist_importer.importBackground('../data/background.txt')
subset = genelist_importer.importSubset('../data/missing.txt')
if not genelist_importer.isValidSubset(subset, background):
    print("WARNING! Subset contains genes not present in background set!")
    print(subset - background)
    print('Terminating script.')
    exit()

gafDict = gaf_parser.importGAF('../data/go_data/goa_human.gaf', background)
gafSubset = gaf_parser.createSubsetGafDict(subset, gafDict)

GOterms = obo_tools.importOBO('../data/go_data/go.obo')

print('background\n', background)
print('interest\n', subset)

for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id', i, 'parents', GOterms[i].parents)

obo_tools.buildGOtree(GOterms)

for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id', i, 'parents', GOterms[i].parents)

Retrieved 146 background uniprot AC's from ../data/background.txt
Retrieved 26 subset uniprot AC's from ../data/missing.txt
Retrieved 145 annotated (background filtered) uniprot AC's from /media/pieter/DATA/github/ebola-go/data/go_data/goa_human.gaf

Not every uniprot AC that was provided in the background set was found in the GAF file:
['F8VVM2']

Not every uniprot AC that was provided in the gene subset was found in the GAF file:
['F8VVM2']

background
 {'P42677', 'P11142', 'Q9BTM1', 'P52597', 'O60814', 'F8VVM2', 'O43791', 'Q96CS3', 'P35268', 'P14866', 'P39687', 'P0C0S8', 'P51532', 'P05388', 'P10412', 'P27708', 'P31943', 'Q9UHD2', 'Q6NXT2', 'Q93008', 'Q14974', 'P31689', 'Q96KK5', 'Q08211', 'Q96AX9', 'Q99832', 'O75569', 'Q93077', 'P08238', 'Q15907', 'Q99877', 'P35637', 'P31942', 'Q5VTE0', 'P49755', 'P52294', 'Q93079', 'Q7L7L0', 'P04844', 'Q96A08', 'P45880', 'Q71U36', 'O00264', 'Q71UM5', 'Q15758', 'Q9UFW8', 'P33778', 'P68104', 'Q16695', 'P17987', 'P53992', 'P62807', 'Q96TA2', 'P68431',

In [4]:
pValues = enrichment_stats.enrichmentAnalysis(background, subset, GOterms, gafDict, gafSubset,
                       minGenes=0, threshold=-1)
len(pValues)

684

In [10]:
pValues

{'GO:0000028': 0.44725240119670062,
 'GO:0000050': 0.1780821917808165,
 'GO:0000075': 0.1780821917808165,
 'GO:0000077': 0.1780821917808165,
 'GO:0000118': 0.32546055739255081,
 'GO:0000122': 0.1418133386100433,
 'GO:0000166': 0.16177815565704054,
 'GO:0000723': 0.54775196461546527,
 'GO:0000786': 0.52841755972864601,
 'GO:0000788': 0.495013507141427,
 'GO:0000975': 0.30747413893775749,
 'GO:0000976': 0.15157153474836732,
 'GO:0000977': 0.15157153474836732,
 'GO:0000980': 0.069708940897624458,
 'GO:0001012': 0.15157153474836732,
 'GO:0001013': 0.1780821917808165,
 'GO:0001046': 0.32546055739255081,
 'GO:0001047': 0.44725240119670062,
 'GO:0001067': 0.30747413893775749,
 'GO:0001085': 0.44725240119670062,
 'GO:0001103': 0.1780821917808165,
 'GO:0001158': 0.069708940897624458,
 'GO:0001163': 0.1780821917808165,
 'GO:0001164': 0.1780821917808165,
 'GO:0001618': 0.32546055739255081,
 'GO:0001882': 0.17193812373647671,
 'GO:0001883': 0.17193812373647671,
 'GO:0001959': 0.32546055739255081,


In [5]:
import numpy as np
keys = np.array(list(pValues.keys()))
pvalues = np.array(list(pValues.values()))

In [8]:
import statsmodels.sandbox.stats.multicomp
fdr = statsmodels.sandbox.stats.multicomp.multipletests(pvalues)

In [14]:
output = np.column_stack((keys.flatten(),pvalues.flatten(),fdr[1].flatten()))
output

array([['GO:1903047', '0.7555209790207154', '1.0'],
       ['GO:0008201', '0.1780821917808165', '1.0'],
       ['GO:0044773', '0.1780821917808165', '1.0'],
       ..., 
       ['GO:0005635', '0.6305579429253946', '1.0'],
       ['GO:0070424', '0.1780821917808165', '1.0'],
       ['GO:0030331', '0.3254605573925508', '1.0']], 
      dtype='<U32')

In [52]:
backgroundTotal = len(background)
subsetTotal = len(subset)

baseGOids = [GOid for gene, GOids in gafSubset.items() for GOid in GOids if not GOterms[GOid].childs]


In [53]:
def finder(goid, smallGOterms):
    if not goid in smallGOterms:
        smallGOterms[goid] = GOterms[goid]
        for goid in GOterms[goid].parents:
            finder(goid, smallGOterms)
    
    
smallGOterms = {}
for goid in baseGOids:
    finder(goid, smallGOterms)



In [54]:
len(smallGOterms)

684

In [80]:
baseParents = set([parent for goid in baseGOids for parent in GOterms[goid].parents])
len(baseParents)


574

In [83]:
baseParents.update(set(baseGOids))
set(baseGOids).isdisjoint(baseParents)

False

In [84]:
len(baseParents)

684

In [61]:
smallGOterms.keys() - baseParents
'''{'GO:0000075',
 'GO:0000077',
 'GO:0000118',
 'GO:0000122','''
smallGOterms['GO:0000075'].parents

{'GO:0008150',
 'GO:0009987',
 'GO:0045786',
 'GO:0048519',
 'GO:0048523',
 'GO:0050789',
 'GO:0050794',
 'GO:0051726',
 'GO:0065007'}

In [57]:
altGOterms = {}

for goid in GOterms:
    if not goid in altGOterms:
        for basegoid in baseGOids:
            if goid


SyntaxError: invalid syntax (<ipython-input-57-486a4246d069>, line 6)