# GO enrichment tool
Copyright (c) 2017 Pieter Moris, based on code by Pieter Meysman

In [15]:
import pandas as pd
import argparse, re, os

In [16]:
def importGAF(path, geneSet):
    """
    Imports a GAF file (gene association format) and generates a dictionary mapping the gene uniprot AC to the GO ID.
    
    Information on the GAF 2.1 format can be found at 
        http://geneontology.org/page/go-annotation-file-gaf-format-21 
            
    Parameters
    ----------
    path : str
        The path to the file.
    geneSet : set
        A set containing the uniprot AC's of all the genes under consideration (background).

    Returns
    -------
    dict of str
        A dictionary mapping gene uniprot AC's to GO ID's.
        
        
    Possible improvements:
        Check for `is_obsolete` and `replaced_by`, although the replacement term should be in OBO file as an entry.
        
        Check for inclusion in provided geneset afterwards using:
            gafDict = {key: value for key, value in gafDict.items() if key in geneSet }
            
        To do: double dictionary? already filter based on gene list? what to do with NOT?
    """
    
    gafPath = os.path.abspath(path)
    
    if not geneSet:
        print('Empty gene set was provided during creation of GAF dictionary.')
        exit()
    
    with open(gafPath,'r') as gafFile:
        gafDict = {}
        for line in gafFile:
#             if not line.startswith('!'):                    # ignore comments in gaf file
            if line.startswith('UniProtKB'):
                splitLine = line.split('\t')                # split column-wise
                uniprotAC = splitLine[1]
                goTerm = splitLine[4]
                goQualifier = splitLine[3]
                if 'NOT' not in goQualifier:                # ignore annotations with "NOT"
                    if uniprotAC in geneSet:                # only keep genes present in background gene set
                        if uniprotAC not in gafDict:        # Create new key if AC does not already appear in dictionary
                            gafDict[uniprotAC] = {goTerm}   # make dictionary containing uniprot AC as key and set of GO's
                        else:
                            gafDict[uniprotAC].add(goTerm)
                            
    print('Retrieved',len(gafDict),'annotated (background filtered) uniprot AC\'s from',gafPath+'\n')
    
    if len(gafDict) != len(geneSet):
        print('WARNING!\nNot every uniprot AC that was provided in the background set was found in the GAF file:')
        [print(AC) for AC in geneSet if AC not in gafDict]
        print('WARNING!\n')
                            
    return gafDict

In [17]:
# https://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why

'''
https://stackoverflow.com/questions/3489071/in-python-when-to-use-a-dictionary-list-or-set
When you want to store some values which you'll be iterating over, 
Python's list constructs are slightly faster. 
However, if you'll be storing (unique) values in order to check for their existence, 
then sets are significantly faster.
'''

class goTerm:
    """
    GO term object.
    
    Stores the ID, name and domain of the GO term and contains dictionaries for child and parent nodes.

    Attributes
    ----------
    id : str
        The identifier of the GO term.
    altid : str
        Optional tag for an alternative id.
    name : str
        The GO term name.
    namespace : str
        The domain of the GO term (Cellular Component, Molecular Function or Biological Process).
    parents : set of str
        The parent terms of the GO term, as indicated by the `is_a` relationship.
    childs : set of str
        The child terms of the GO term, derived from other GO terms after a complete OBO file is processed initially.

not necessary... Methods
    -------
    returnID
        Returns the ID of the GO term.
    gamma(n=1.0)
        Change the photo's gamma exposure.  
    """
    
    goCount = 0
    
    __slots__ = ('id', 'name', 'altid', 'namespace', 'childs', 'parents')
    
    def __init__(self, goID):
        self.id = goID
        self.altid = []
        self.name = ''
        self.namespace = ''
        self.childs = set()
        self.parents = set()
        
        goTerm.goCount += 1
    
    def returnID(self):
        return self.id
    

In [18]:
def importOBO(path):
    """
    Imports an OBO file and generates a dictionary containing an OBO object for each GO term.
    
    Parameters
    ----------
    path : str
        The path to the file.

    Returns
    -------
    dict of OBO objects
        Keys of the format `GO-0000001` mapping to OBO objects.
        
        
    Possible improvements:
        Check for `is_obsolete` and `replaced_by`, although the replacement term should be in OBO file as an entry.
    """
    
    GOdict = {}
    
    path = os.path.abspath(path)
    with open(path,'r') as oboFile:
        entryPattern = re.compile('^\[.+\]')             # find re pattern to match '[Entry]'
        validEntry = False
        
        for line in oboFile:
            
            if entryPattern.search(line):                # Only parse entries preceded by [Entry], not [Typedef]
                if 'Term' in line:
                    validEntry = True
                else:
                    validEntry = False
                    
            elif validEntry:                             # if [Entry] was encountered previously, parse annotation
                if line.startswith('id'):                # and hierarchy from subsequent lines
                    goID = line.split(': ')[1].rstrip()  # Store ID for lookup of other attributes in next lines
                    
                    if not goID in GOdict:               # check if id is already stored as a key in dictionary
                        GOdict[goID] = goTerm(goID)      # if not, create new goID object as the value for this key

                elif line.startswith('name'):
                    GOdict[goID].name = line.split(': ')[1].rstrip()
                elif line.startswith('namespace'):
                    GOdict[goID].namespace = line.split(': ')[1].rstrip()
                elif line.startswith('alt_id'):
                    GOdict[goID].altid.append(line.split(': ')[1].rstrip())
                elif line.startswith('is_a'):
                    GOdict[goID].parents.add(line.split()[1].rstrip())
    return GOdict



In [19]:
def importBackground(path):
    """
    Imports the background set of genes (uniprot AC).
    
    Parameters
    ----------
    path : str
        The path to the file.

    Returns
    -------
    set
        A set of background uniprot AC's.
        
    Notes: Gene lists should not contain a header. One gene per line.
    Possible improvement: check for file structure and allow headers, comma separated lists, etc.
    """
    with open(path,'r') as inGenes:
        backgroundSet = set([line.rstrip() for line in inGenes][1:])
    print('Retrieved',len(backgroundSet),'background uniprot AC\'s from',path)
    return backgroundSet

def importSubset(path):
    """
    Imports the gene subset of interest (uniprot AC).
    
    Parameters
    ----------
    path : str
        The path to the file.

    Returns
    -------
    set
        A subset of uniprot ACs of interest.
        
    Notes: Gene lists should not contain a header. One gene per line.
    """
    with open(path,'r') as inGenes:
        geneSubset = set([line.rstrip() for line in inGenes][1:])
    print('Retrieved',len(geneSubset),'subset uniprot AC\'s from',path)
    return geneSubset

In [20]:
# # Import gene list
# inputPath = os.path.abspath('../data/genelistinput.txt')
# with open(inputPath,'r') as inGenes:
#     geneSet = set([line.rstrip() for line in inGenes][1:])

backgroundPath = os.path.abspath('../data/background.txt')
subsetPath = os.path.abspath('../data/missing.txt')

# # Import .obo file
oboPath = os.path.abspath('../data/go_data/gotest.obo')
# with open(oboPath,'r') as oboFile:
#     obo = oboFile.read()

gafPath = os.path.abspath('../data/go_data/goa_human.gaf')

minGenes = 5
threshold = 0.1

background = importBackground(backgroundPath)
interest = importSubset(subsetPath)

gafDict = importGAF(gafPath, background)
gterms = importOBO(oboPath)

Retrieved 146 background uniprot AC's from /media/pieter/DATA/github/ebola-go/data/background.txt
Retrieved 26 subset uniprot AC's from /media/pieter/DATA/github/ebola-go/data/missing.txt
Retrieved 145 annotated (background filtered) uniprot AC's from /media/pieter/DATA/github/ebola-go/data/go_data/goa_human.gaf

Not every uniprot AC that was provided in the background set was found in the GAF file:
F8VVM2



In [21]:
print(gafDict['P84243'])
print(len(gafDict['P84243']))

{'GO:0032200', 'GO:0045815', 'GO:0031047', 'GO:0090230', 'GO:0030307', 'GO:0048477', 'GO:0000228', 'GO:0042692', 'GO:0006997', 'GO:0007566', 'GO:0005654', 'GO:0005634', 'GO:0031508', 'GO:0045814', 'GO:0000784', 'GO:0000979', 'GO:0000980', 'GO:0044267', 'GO:0031509', 'GO:1902340', 'GO:0008584', 'GO:0007596', 'GO:0001649', 'GO:0005576', 'GO:0008283', 'GO:0043234', 'GO:0007286', 'GO:0006334', 'GO:0005515', 'GO:0046982', 'GO:0000788', 'GO:0070062', 'GO:0000183', 'GO:0001740', 'GO:0007338', 'GO:0042393', 'GO:0035264', 'GO:0031492', 'GO:0000786', 'GO:0006336'}
40


# Old version using base case and returning set in each recursion

In [22]:
gterms = importOBO(oboPath)
gterms['GO:0048308'] = goTerm('GO:0048308')
gterms['GO:0048308'].parents.add('GO:0000003')


def buildGOtree(GOdict):
    """
    Generates the entire GO tree by walking through the parent hierarchy of each GO entry.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Updated GO dict where parent attributes trace back over the full tree hierarchy.
        Keys are of the format `GO-0000001` and map to OBO objects.
    """
    for goID, goObj in GOdict.items():                                 # Process each GO term in the GO dictionary
        # baseParents = GOdict.get(goID).parents
        # if baseParents:                                 # Don't start propagation if GO term has no parents
        #     for parent in baseParents:
        #         propagateTree(parent, goID, GOdict, set())

        # won't work if first order parents have no parents, propagate tree will never reach steps to add these parents

        parentSet = set()
        completeHigherOrderParentSet = propagateTree(goID, goID, GOdict, parentSet)
        goObj.parents.update(completeHigherOrderParentSet)

def propagateTree(currentTerm, basegoID, GOdict, parentSet):
    """
    Propagates through the parent hierarchy of a provided GO term.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Updated GO dict where parent attributes trace back over the full tree hierarchy.
        Keys are of the format `GO-0000001` and map to OBO objects.
    """

    # If current term is not present in GO dictionary, print warning and end recursion
    if currentTerm not in GOdict:
        print('WARNING!\n' + currentTerm, 'was defined as a parent for',
              basegoID, ', but was not found in the OBO file.') 
        return parentSet
    
    # If current term has no further parents, stop the recursion and return the set
    parents = GOdict[currentTerm].parents
    if not parents:
        return parentSet

    else:
        # Add current term's parents to growing set
        # and recurse function for each
        for parent in parents:           
            parentSet.add(parent)
            return propagateTree(parent, basegoID, GOdict, parentSet)

gterms = importOBO(oboPath)
gterms['GO:0048308'] = goTerm('GO:0048308')
gterms['GO:0048308'].parents.add('GO:0000003')

for i in gterms:
    print('id',i,'parents',gterms[i].parents)

buildGOtree(gterms)

for i in gterms:
    print('id',i,'parents',gterms[i].parents)

id GO:0000002 parents {'GO:0007005'}
id GO:0000003 parents {'GO:0008150'}
id GO:0000001 parents {'GO:0048308', 'GO:0048311'}
id GO:0000005 parents set()
id GO:0048308 parents {'GO:0000003'}
GO:0007005 was defined as a parent for GO:0000002 , but was not found in the OBO file.
GO:0008150 was defined as a parent for GO:0000003 , but was not found in the OBO file.
GO:0008150 was defined as a parent for GO:0000001 , but was not found in the OBO file.
GO:0008150 was defined as a parent for GO:0048308 , but was not found in the OBO file.
id GO:0000002 parents {'GO:0007005'}
id GO:0000003 parents {'GO:0008150'}
id GO:0000001 parents {'GO:0048308', 'GO:0000003', 'GO:0008150', 'GO:0048311'}
id GO:0000005 parents set()
id GO:0048308 parents {'GO:0000003', 'GO:0008150'}


In [26]:
def buildGOtree(GOdict):
    """
    Generates the entire GO tree's parent structure by walking through the parent hierarchy of each GO entry.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().
        Keys are of the format `GO-0000001` and map to OBO objects.

    Returns
    -------
    None
        The input GO dictionary will be updated.
        Parent attributes now trace back over the full tree hierarchy.
    """

    # Process each GO term in the GO dictionary
    for goID, obj in GOdict.items():
        # Define new set to store higher order parents
        parentSet = set()
        # Call helper function to propagate through parents
        propagateTree(goID, goID, GOdict, parentSet)
        # Update GO term's parents attribute to include all higher order
        # parents
        obj.parents.update(parentSet)

    return None


def propagateTree(currentTerm, basegoID, GOdict, parentSet):
    """
    Propagates through the parent hierarchy of a provided GO term.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Updated GO dict where parent attributes trace back over the full tree hierarchy.
        Keys are of the format `GO-0000001` and map to OBO objects.
    """

    # If current term is not present in GO dictionary, print warning and end recursion
    if currentTerm not in GOdict:
        print('WARNING!\n' + currentTerm, 'was defined as a parent for',
              basegoID, ', but was not found in the OBO file.')
        return

    # If current term has no further parents will end and move
    # back up the stack, since there are no parents to iterate over
    parents = GOdict.get(currentTerm).parents
    for parent in parents:

#         # Check if parent is present in GO dictionary
#         if parent not in GOdict:
#             print('WARNING!\n' + parent, 'was defined as a parent for',
#                   basegoID, ', but was not found in the OBO file.')

        # Add current term's parents to growing set
        parentSet.add(parent)

        # and recurse function for each parent
        propagateTree(parent, basegoID, GOdict, parentSet)
        
        
gterms = importOBO(oboPath)
gterms['GO:0048308'] = goTerm('GO:0048308')
gterms['GO:0048308'].parents.add('GO:0000003')

for i in gterms:
    print('id',i,'parents',gterms[i].parents)

buildGOtree(gterms)

for i in gterms:
    print('id',i,'parents',gterms[i].parents)

id GO:0000002 parents {'GO:0007005'}
id GO:0000003 parents {'GO:0008150'}
id GO:0000001 parents {'GO:0048308', 'GO:0048311'}
id GO:0000005 parents set()
id GO:0048308 parents {'GO:0000003'}
GO:0007005 was defined as a parent for GO:0000002 , but was not found in the OBO file.
GO:0008150 was defined as a parent for GO:0000003 , but was not found in the OBO file.
GO:0008150 was defined as a parent for GO:0000001 , but was not found in the OBO file.
GO:0048311 was defined as a parent for GO:0000001 , but was not found in the OBO file.
GO:0008150 was defined as a parent for GO:0048308 , but was not found in the OBO file.
id GO:0000002 parents {'GO:0007005'}
id GO:0000003 parents {'GO:0008150'}
id GO:0000001 parents {'GO:0048308', 'GO:0008150', 'GO:0000003', 'GO:0048311'}
id GO:0000005 parents set()
id GO:0048308 parents {'GO:0000003', 'GO:0008150'}


In [24]:
s = set()
for k,v in gafDict.items():
    s.update(v)
print(len(s))
s = set()
for k,v in gterms.items():
    s.update(k)
print(len(s))
print(len(gterms))

1271
10
5


In [25]:
def buildGOtree(GOdict, gafDict):
    """
    Generates the entire GO tree's parent structure by walking through the parent hierarchy of each GO entry.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().
        Keys are of the format `GO-0000001` and map to OBO objects.

    Returns
    -------
    None
        The input GO dictionary will be updated.
        Parent attributes now trace back over the full tree hierarchy.
    """
    
    backgroundSetTerms = set()
    for gene, goids in gafDict.items():
        backgroundSetTerms.update(goids)
        
    newGOdict = {}
    
    for goid in backgroundSetTerms:
        parentSet = set()
        propagateTree(goID, goID, GOdict, newGOdict, parentSet)
        # wont work cause only adding base goids, not higher ups
    return newGOdict
    
    
    # Process each GO term in the GO dictionary
    for goID, obj in GOdict.items():
        # Define new set to store higher order parents
        parentSet = set()
        # Call helper function to propagate through parents
        propagateTree(goID, goID, GOdict, parentSet)
        # Update GO term's parents attribute to include all higher order
        # parents
        obj.parents.update(parentSet)

    return None


def propagateTree(currentTerm, basegoID, GOdict, parentSet):
    """
    Propagates through the parent hierarchy of a provided GO term.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Updated GO dict where parent attributes trace back over the full tree hierarchy.
        Keys are of the format `GO-0000001` and map to OBO objects.
    """

    # If current term is not present in GO dictionary, print warning and end recursion
    if currentTerm not in GOdict:
        print('WARNING!\n' + currentTerm, 'was defined as a parent for',
              basegoID, ', but was not found in the OBO file.')
        return

    # If current term has no further parents will end and move
    # back up the stack, since there are no parents to iterate over
    parents = GOdict.get(currentTerm).parents
    for parent in parents:

#         # Check if parent is present in GO dictionary
#         if parent not in GOdict:
#             print('WARNING!\n' + parent, 'was defined as a parent for',
#                   basegoID, ', but was not found in the OBO file.')

        # Add current term's parents to growing set
        parentSet.add(parent)

        # and recurse function for each parent
        propagateTree(parent, basegoID, GOdict, parentSet)
        
        
gterms = importOBO(oboPath)
gterms['GO:0048308'] = goTerm('GO:0048308')
gterms['GO:0048308'].parents.add('GO:0000003')

for i in gterms:
    print('id',i,'parents',gterms[i].parents)

buildGOtree(gterms)

for i in gterms:
    print('id',i,'parents',gterms[i].parents)

id GO:0000002 parents {'GO:0007005'}
id GO:0000003 parents {'GO:0008150'}
id GO:0000001 parents {'GO:0048308', 'GO:0048311'}
id GO:0000005 parents set()
id GO:0048308 parents {'GO:0000003'}


TypeError: buildGOtree() missing 1 required positional argument: 'gafDict'

In [40]:
# get children

for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id',i,'parents',gterms[i].parents,'children',gterms[i].childs)

for k, v in gterms.items():
    for parent in v.parents:
        gterms[parent].childs.add(k)

print('after')
for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id',i,'parents',gterms[i].parents,'children',gterms[i].childs)

id GO:0060373 parents {'GO:0065007', 'GO:0032844', 'GO:2000021', 'GO:0003254', 'GO:0050789', 'GO:0050794', 'GO:0042391', 'GO:0008150', 'GO:0065008'} children set()
id GO:0048245 parents {'GO:0006935', 'GO:0044699', 'GO:0002376', 'GO:0042330', 'GO:0050900', 'GO:0016477', 'GO:0040011', 'GO:0097529', 'GO:0044763', 'GO:0009987', 'GO:0097530', 'GO:0071621', 'GO:0030595', 'GO:0072677', 'GO:0048870', 'GO:0006928', 'GO:0060326', 'GO:0008150'} children set()
id GO:0044077 parents {'GO:0044068', 'GO:0065007', 'GO:0032879', 'GO:0044003', 'GO:0050789', 'GO:0051049', 'GO:0035821', 'GO:0048259', 'GO:0044419', 'GO:0060627', 'GO:0051128', 'GO:0051817', 'GO:0050794', 'GO:0051704', 'GO:0030100', 'GO:0051701', 'GO:0008150', 'GO:0065008'} children set()
id GO:0042925 parents {'GO:0042910', 'GO:0003674', 'GO:0005215'} children set()
id GO:1902361 parents {'GO:0044699', 'GO:0006810', 'GO:0015711', 'GO:1905039', 'GO:0006820', 'GO:0006848', 'GO:0006850', 'GO:0055085', 'GO:1902578', 'GO:0044765', 'GO:0051179',

In [43]:
print(gterms['GO:0044078'].parents)
for goid in gterms:
    if 'GO:0042925' in gterms[goid].parents:
        print(goid,'isparentof GO:0042925')

{'GO:0044068', 'GO:0065008', 'GO:0065007', 'GO:0045807', 'GO:0044077', 'GO:0044003', 'GO:0032879', 'GO:0050789', 'GO:0051049', 'GO:0035821', 'GO:0051050', 'GO:0044419', 'GO:0048259', 'GO:0060627', 'GO:0051128', 'GO:0048260', 'GO:0048518', 'GO:0051817', 'GO:0048522', 'GO:0050794', 'GO:0051704', 'GO:0030100', 'GO:0051701', 'GO:0008150', 'GO:0051130'}


In [29]:
backgroundPath = os.path.abspath('../data/background.txt')
subsetPath = os.path.abspath('../data/missing.txt')
oboPath = os.path.abspath('../data/go_data/go.obo')
gafPath = os.path.abspath('../data/go_data/goa_human.gaf')

minGenes = 5
threshold = 0.1

background = importBackground(backgroundPath)
interest = importSubset(subsetPath)
gafDict = importGAF(gafPath, background)
gterms = importOBO(oboPath)
print('background\n', background)
print('interest\n', interest)

for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id',i,'parents',gterms[i].parents)

buildGOtree(gterms)

for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id',i,'parents',gterms[i].parents)

Retrieved 146 background uniprot AC's from /media/pieter/DATA/github/ebola-go/data/background.txt
Retrieved 26 subset uniprot AC's from /media/pieter/DATA/github/ebola-go/data/missing.txt
Retrieved 145 annotated (background filtered) uniprot AC's from /media/pieter/DATA/github/ebola-go/data/go_data/goa_human.gaf

Not every uniprot AC that was provided in the background set was found in the GAF file:
F8VVM2

background
 {'P35637', 'Q00325', 'P09651', 'Q96HA1', 'Q92688', 'P68104', 'Q16778', 'Q969J3', 'P10412', 'P31943', 'Q00839', 'O43175', 'P45880', 'O15131', 'Q5QNW6', 'O15269', 'P21796', 'Q71UM5', 'Q8IXQ5', 'O60684', 'P51532', 'O95816', 'Q96B45', 'P05386', 'Q9P258', 'Q12906', 'P31942', 'P39687', 'P49755', 'P60866', 'P12273', 'P57053', 'P51571', 'P17987', 'Q15758', 'Q16531', 'P23527', 'P62879', 'P61978', 'Q96FJ2', 'Q14164', 'P27708', 'Q9UIG0', 'O75569', 'O43791', 'P58876', 'Q6NXT2', 'P0DMV9', 'Q96TA2', 'P22626', 'O95831', 'P04843', 'Q99880', 'P55795', 'P68431', 'P05141', 'Q99878', 'Q9UHD

In [30]:
def completeChildHierarchy(GOdict):
    for GOid, GOobj in GOdict.items():
        [GOdict[parent].childs.add(GOid) for parent in GOobj.parents]
        
for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id',i,'parents',gterms[i].parents,'children',gterms[i].childs)   
    
completeChildHierarchy(gterms)

print('completechildhierarchy')

for i in ['GO:0060373', 'GO:0048245', 'GO:0044077', 'GO:0042925', 'GO:1902361', 'GO:1902361', 'GO:1902361', 'GO:0000001', 'GO:0000002']:
    print('id',i,'parents',gterms[i].parents,'children',gterms[i].childs)

id GO:0060373 parents {'GO:0050789', 'GO:2000021', 'GO:0032844', 'GO:0065008', 'GO:0008150', 'GO:0050794', 'GO:0065007', 'GO:0042391', 'GO:0003254'} children set()
id GO:0048245 parents {'GO:0060326', 'GO:0002376', 'GO:0044763', 'GO:0097529', 'GO:0016477', 'GO:0040011', 'GO:0048870', 'GO:0050900', 'GO:0042330', 'GO:0097530', 'GO:0030595', 'GO:0071621', 'GO:0044699', 'GO:0009987', 'GO:0006928', 'GO:0072677', 'GO:0008150', 'GO:0006935'} children set()
id GO:0044077 parents {'GO:0051817', 'GO:0048259', 'GO:0051128', 'GO:0065008', 'GO:0051704', 'GO:0044003', 'GO:0044068', 'GO:0035821', 'GO:0050789', 'GO:0060627', 'GO:0050794', 'GO:0008150', 'GO:0030100', 'GO:0051049', 'GO:0032879', 'GO:0065007', 'GO:0051701', 'GO:0044419'} children set()
id GO:0042925 parents {'GO:0042910', 'GO:0005215', 'GO:0003674'} children set()
id GO:1902361 parents {'GO:0044765', 'GO:0006839', 'GO:0006850', 'GO:0006810', 'GO:0006820', 'GO:0051179', 'GO:1905039', 'GO:1990542', 'GO:0051234', 'GO:0044699', 'GO:0015849',

In [71]:
print(gafDict['P35637'])

for goid in gafDict['P35637']:
    for goid2 in gafDict['P35637']:
        if goid2 in gterms[goid].childs:
            print(goid2,'is a child of',goid)
            
for goid in gafDict['P35637']:
    for goid2 in gafDict['P35637']:
        if goid2 in gterms[goid].parents:
            print(goid2,'is a parent of',goid)
            
for goid in gafDict['P35637']:
    if not gterms[goid].childs:
        print(goid)

{'GO:0008270', 'GO:0035255', 'GO:1903506', 'GO:0000166', 'GO:0003677', 'GO:0005654', 'GO:0005634', 'GO:0005844', 'GO:0031489', 'GO:0042802', 'GO:0044822', 'GO:0071277', 'GO:0046965', 'GO:0003713', 'GO:0030331', 'GO:0044327', 'GO:0005515', 'GO:0046966', 'GO:0003723', 'GO:0048471', 'GO:0000398', 'GO:0043204'}
GO:0035255 is a child of GO:0005515
GO:0031489 is a child of GO:0005515
GO:0042802 is a child of GO:0005515
GO:0046965 is a child of GO:0005515
GO:0030331 is a child of GO:0005515
GO:0046966 is a child of GO:0005515
GO:0044822 is a child of GO:0003723
GO:0005515 is a parent of GO:0035255
GO:0005515 is a parent of GO:0031489
GO:0005515 is a parent of GO:0042802
GO:0003723 is a parent of GO:0044822
GO:0005515 is a parent of GO:0046965
GO:0005515 is a parent of GO:0030331
GO:0005515 is a parent of GO:0046966
GO:0008270
GO:0035255
GO:0005654
GO:0005844
GO:0031489
GO:0071277
GO:0046965
GO:0030331
GO:0044327
GO:0046966
GO:0043204


In [84]:
print(gafDict)

{'P35637': {'GO:0008270', 'GO:0035255', 'GO:1903506', 'GO:0000166', 'GO:0003677', 'GO:0005654', 'GO:0005634', 'GO:0005844', 'GO:0031489', 'GO:0042802', 'GO:0044822', 'GO:0071277', 'GO:0046965', 'GO:0003713', 'GO:0030331', 'GO:0044327', 'GO:0005515', 'GO:0046966', 'GO:0003723', 'GO:0048471', 'GO:0000398', 'GO:0043204'}, 'Q7L7L0': {'GO:0046982', 'GO:0006337', 'GO:0000788', 'GO:0070914', 'GO:0006342', 'GO:0070062', 'GO:0003677'}, 'P23396': {'GO:0010628', 'GO:0044390', 'GO:0005925', 'GO:0006614', 'GO:1902546', 'GO:0051018', 'GO:0030529', 'GO:0005829', 'GO:0032183', 'GO:0015631', 'GO:0031397', 'GO:0005743', 'GO:0003684', 'GO:0006364', 'GO:0019083', 'GO:0008017', 'GO:0019899', 'GO:0051879', 'GO:2001272', 'GO:0003677', 'GO:0005654', 'GO:0005634', 'GO:0005844', 'GO:0008134', 'GO:0006355', 'GO:0031012', 'GO:0042769', 'GO:0044822', 'GO:0032587', 'GO:0072686', 'GO:0070181', 'GO:0045739', 'GO:0005840', 'GO:0016020', 'GO:0032357', 'GO:0032079', 'GO:0003735', 'GO:2001235', 'GO:1902231', 'GO:0006351'

In [83]:
baseGOids = {gene:set() for gene in gafDict}
for gene, GOids in gafDict.items():
    for GOid in GOids:
        if not gterms[GOid].childs:
            baseGOids[gene].add(GOid)
print(baseGOids)

{'P35637': {'GO:0005654', 'GO:0030331', 'GO:0008270', 'GO:0005844', 'GO:0031489', 'GO:0044327', 'GO:0035255', 'GO:0046966', 'GO:0071277', 'GO:0046965', 'GO:0043204'}, 'Q7L7L0': {'GO:0046982', 'GO:0000788', 'GO:0070914', 'GO:0070062'}, 'P23396': {'GO:0005925', 'GO:0006614', 'GO:0005743', 'GO:0051879', 'GO:0005654', 'GO:0005844', 'GO:0032587', 'GO:0072686', 'GO:0070181', 'GO:0032357', 'GO:0032079', 'GO:0003735', 'GO:0070301', 'GO:0097100', 'GO:0005759', 'GO:0070062', 'GO:1905053', 'GO:0005730', 'GO:0061481', 'GO:0032358', 'GO:0030544', 'GO:0022627'}, 'P04844': {'GO:0004579', 'GO:0005791'}, 'P09651': {'GO:0005654', 'GO:0098505', 'GO:0061752', 'GO:0032212', 'GO:1903936', 'GO:0070062', 'GO:0032211'}, 'Q96HA1': {'GO:0007077', 'GO:0005635', 'GO:0017056', 'GO:0005643', 'GO:0008139'}, 'Q93008': {'GO:0004843', 'GO:0070410'}, 'Q9Y4I1': {'GO:0005516', 'GO:0032593', 'GO:0005794', 'GO:0070062', 'GO:0005524', 'GO:0032402', 'GO:0032869', 'GO:0042470', 'GO:0032433', 'GO:0001726', 'GO:0001750', 'GO:0017

In [92]:
baseGOids = [GOid for gene, GOids in gafDict.items() for GOid in GOids if not gterms[GOid].childs ]

baseGOids2 = []
for gene, GOids in gafDict.items():
    for GOid in GOids:
        if not gterms[GOid].childs:
            baseGOids2.append(GOid)
            
print(baseGOids)
print(baseGOids2)
baseGOids == baseGOids2

['GO:0008270', 'GO:0035255', 'GO:0005654', 'GO:0005844', 'GO:0031489', 'GO:0071277', 'GO:0046965', 'GO:0030331', 'GO:0044327', 'GO:0046966', 'GO:0043204', 'GO:0046982', 'GO:0000788', 'GO:0070914', 'GO:0070062', 'GO:0005925', 'GO:0006614', 'GO:0005743', 'GO:0051879', 'GO:0005654', 'GO:0005844', 'GO:0032587', 'GO:0072686', 'GO:0070181', 'GO:0032357', 'GO:0032079', 'GO:0003735', 'GO:0070301', 'GO:0097100', 'GO:0005759', 'GO:0070062', 'GO:1905053', 'GO:0005730', 'GO:0061481', 'GO:0032358', 'GO:0030544', 'GO:0022627', 'GO:0005654', 'GO:0046982', 'GO:0000788', 'GO:0070062', 'GO:0004843', 'GO:0070410', 'GO:0004579', 'GO:0005791', 'GO:0032211', 'GO:0005654', 'GO:1903936', 'GO:0098505', 'GO:0061752', 'GO:0032212', 'GO:0070062', 'GO:0008139', 'GO:0007077', 'GO:0005635', 'GO:0017056', 'GO:0005643', 'GO:0043209', 'GO:0098574', 'GO:0032587', 'GO:0005853', 'GO:0005525', 'GO:0005615', 'GO:0071364', 'GO:0003746', 'GO:0070062', 'GO:0005730', 'GO:0042470', 'GO:0017137', 'GO:0032402', 'GO:0032869', 'GO:0

True

In [74]:
print(gafDict['Q99879'])
print(gterms['GO:0006334'].parents)

for goid in gterms['GO:0006334'].parents:
    if goid not in gafDict['Q99879']:
        print(goid)

{'GO:0005654', 'GO:0046982', 'GO:0005634', 'GO:0000788', 'GO:0005737', 'GO:0070062', 'GO:0006334', 'GO:0000786', 'GO:0003677'}
{'GO:0009987', 'GO:0043933', 'GO:0034728', 'GO:0016043', 'GO:0008150', 'GO:0065003', 'GO:0065004', 'GO:0022607', 'GO:0071824', 'GO:0006325', 'GO:0034622', 'GO:0071840'}
GO:0009987
GO:0043933
GO:0034728
GO:0016043
GO:0008150
GO:0065003
GO:0065004
GO:0022607
GO:0071824
GO:0006325
GO:0034622
GO:0071840


In [14]:
for goid in gafDict['Q99879']:
    if goid not in gterms['GO:0006334'].parents:
        print(goid)

GO:0005654
GO:0006334
GO:0070062
GO:0000786
GO:0003677
GO:0005737
GO:0000788
GO:0046982
GO:0005634


In [25]:
for goid, goobj in gterms.items():
    if 'GO:0003254' in goobj.parents:
        print('isparent',goid)
print('done')

isparent GO:1900826
isparent GO:1904180
isparent GO:1905029
isparent GO:1905027
isparent GO:0060371
isparent GO:1905028
isparent GO:0051901
isparent GO:1904198
isparent GO:0051900
isparent GO:1900827
isparent GO:0060373
isparent GO:1900825
isparent GO:1904181
isparent GO:1904199
isparent GO:0051902
isparent GO:0098902
done


In [93]:
a = {}
print(type(a))

<class 'dict'>


In [26]:
for gene in gafDict:
    if 'GO:0003254' in gafDict[gene]:
        print(gene)

In [57]:
b = set(['a','b','c'])
s = set(['a'])
s.issubset(b)

True

In [156]:
gterms2 = importOBO(oboPath)

In [162]:
print(gterms2['GO:0006334'].parents)


{'GO:0065004', 'GO:0034728'}


In [None]:
def enrichmentOneSided(GOid, background, subset, GOdict, gafDict):
    """
    Performs a one-sided hypergeometric test for a given GO term.

    Parameters
    ----------
    GOid : str
        A GO identifier (key to the GO dictionary).
    background : set of str
        A set of background uniprot AC's.
    subset : set of str
        A subset of uniprot AC's of interest.
    GOdict : dict
        A dictionary of GO objects generated by importOBO().
        Keys are of the format `GO-0000001` and map to OBO objects.

    Returns
    -------
    float
        The p-value of the one-sided hypergeometric test.
    """

    backgroundTotal = len(background)
    subsetTotal = len(subset)
    
    backgroundGO = countGOassociations(GOid, background, GOdict)
    subsetGO = countGOassociations(GOid, subset, GOdict)
    
    # k or more successes (= GO associations = subsetGO) in N draws (= subsetTotal)
    # from a population of size M (backgroundTotal) containing n successes (backgroundGO)
    # k or more is the sum of the probability mass functions of k up to N successes
    # since cdf gives the cumulative probability up and including input (less or equal to k successes),
    # and we want P(k or more), we need to calculate 1 - P(less than k) =  1 - P(k-1 or less)
    # .sf is the survival function (1-cdf).
    pVal = hypergeom.sf(subsetGO-1, backgroundTotal, backgroundGO, subsetTotal)
    
    if pVal > args.threshold:
        pVal = recurseEnrichmentOneSided(GOid, background, subset, GOdict)
    
    return pVal

def countGOassociations(GOid, backgroundSet, GOdict, gafDict):
    """
    """
    
    GOcounter = 0
    
    validTerms = set(GOid)
    validTerms.update(GOdict['GOid'].childs)
    
    for gene in backgroundSet:
        if GOid in gafDict
        
        

def enrichmentTwoSided(currentTerm, basegoID, GOdict, parentSet):
    """
    Propagates through the parent hierarchy of a provided GO term.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Updated GO dict where parent attributes trace back over the full tree hierarchy.
        Keys are of the format `GO-0000001` and map to OBO objects.
    """

probability of drawing 4 or more white balls from the urn in 10 draws, from a total of 5 white and 45 black balls.

In [7]:
from scipy.stats import hypergeom
hypergeom.sf(3, 50, 5, 10, loc=0)


0.0040835205497556553

In [152]:
1-hypergeom.cdf(3, 50, 5, 10, loc=0)

0.0040835205497192772

In [153]:
hypergeom.sf(4, 50, 5, 10, loc=0)

0.00011893749174045793

In [146]:
from scipy.special import comb
( comb(5, 4, exact=True) * comb(45, 6, exact=True) ) / (comb(50, 10, exact=True)) +  ( 
    comb(5, 5, exact=True) * comb(45, 5, exact=True) ) / (comb(50, 10, exact=True))

0.004083520549755517

In [141]:
hypergeom.pmf(4,50,5,10) + hypergeom.pmf(5,50,5,10)

0.0040835205497556553

hypergeometric test function first

In [169]:
bgCount = 0
for gene in background:
    if gene in gafDict:
        if 'GO:0006334' in gafDict.get(gene):
            bgCount += 1
print(bgCount)
subCount = 0
for gene in interest:
    if gene in gafDict:
        if 'GO:0006334' in gafDict.get(gene):
            subCount += 1
print(subCount)



23
5


In [188]:
baseGOids = set()
for gene, goidSet in gafDict.items():
    baseGOids.update(goidSet)
    
print(len(baseGOids))

l=[]
for gene, goidSet in gafDict.items():
    l.extend(list(goidSet))
print(len(l))
print(len(set(l)))

1271
3117
1271


In [1]:
print(type(background))
print(background)
print(gafDict.keys())
for i in gafDict:
    if i not in background:
        print(i)

NameError: name 'background' is not defined

In [179]:
for i in background:
    if i not in gafDict:
        print(i)

F8VVM2


In [None]:

def buildGOtree(GOdict):
    """
    Generates the entire GO tree's parent structure by walking through the parent hierarchy of each GO entry.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().
        Keys are of the format `GO-0000001` and map to OBO objects.

    Returns
    -------
    None
        The input GO dictionary will be updated.
        Parent attributes now trace back over the full tree hierarchy.
    """

    # Process each GO term in the GO dictionary
    for goID, obj in GOdict.items():
        # Define new set to store higher order parents
        parentSet = set()
        # Call helper function to propagate through parents
        completeHigherOrderParentSet = propagateTree(goID, goID, GOdict, parentSet)
        # Update GO term's parents attribute to include all higher order parents
        obj.parents.update(parentSet)

    return None

def propagateTree(currentTerm, basegoID, GOdict, parentSet):
    """
    Propagates through the parent hierarchy of a provided GO term.

    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Updated GO dict where parent attributes trace back over the full tree hierarchy.
        Keys are of the format `GO-0000001` and map to OBO objects.
    """

    # If current term has no further parents will end and move
    # back up the stack, since there are no parents to iterate over
    parents = GOdict.get(currentTerm).parents
    for parent in parents:

        # Check if parent is present in GO dictionary
        if parent not in GOdict:
            print('WARNING!\n' + parent, 'was defined as a parent for',
                  basegoID, ', but was not found in the OBO file.')

        # Add current term's parents to growing set
        parentSet.add(parent)

        # and recurse function for each parent
        propagateTree(parent, basegoID, GOdict, parentSet)


In [130]:
# # Import .gaf file
# gafPath = os.path.abspath('../data/go_data/goa_human.gaf')
# with open(gafPath,'r') as gafFile:
#     gafDict = {}
#     for line in gafFile:
#         if not line.startswith('!'):                    # ignore comments in gaf file
#             splitLine = line.split('\t')                # split column-wise
#             uniprotAC = splitLine[1]
#             goTerm = splitLine[4]
#             goQualifier = splitLine[3]
#             if not 'NOT' in goQualifier:                # ignore annotations with "NOT"
#                 if uniprotAC in geneSet:                # only keep genes present in input gene set
#                     if not uniprotAC in gafDict:        # Create new key if AC does not already appear in dictionary
#                         gafDict[uniprotAC] = {goTerm}   # make dictionary containing uniprot AC as key and set of GO's
#                     else:
#                         gafDict[uniprotAC].add(goTerm)
# print(gafDict['O43175'])

In [131]:
# # Gaf pandas
# gafDf = pd.read_csv(gafPath, sep='\t', header=None, comment='!')
# gafDf = gafDf[gafDf[1].isin(geneSet)]    # only keep genes present in input gene set
# gafDf = gafDf[~gafDf[3].str.contains('NOT', na=False)]    # ignore annotations with "NOT"
# gafDf = gafDf.reset_index(drop=True)
# print(gafDf)

In [None]:
print(gterms)
gterms['GO:0000001'].id
print(gterms['GO:0000001'].parents)
gterms['GO:0000001'].childs

In [124]:
gterms = importOBO(oboPath)

def buildGOtree(GOdict):
    """
    Generates the entire GO tree by walking through the parent hierarchy.
    
    Parameters
    ----------
    GOdict : dict
        A dictionary of GO objects generated by importOBO().

    Returns
    -------
    dict of OBO objects
        Child and parent attributes are completed.
        Keys of the format `GO-0000001` mapping to OBO objects.
    """
    for goID in GOdict:
        
        # recursive call starts here
        
        for parent in GOdict[goID].parents:
            
            propagateTree(parent, goID, GOdict)
            
def propagateTree(parent, goID, GOdict):
    if goID not in GOdict[parent].child:
        GOdict[parent].child.add(goID)
        
    if GOdict[parent]:
        for parentsParent in GOdict[parent].parents: 
            propagateTree(parentsParent, goID, GOdict)
    else:
        return False

In [127]:
for goID in gterms:
    for parent in gterms[goID].parents:
        print(parent)
        
gterms['GO:0048308'].child

GO:0048308
GO:0048311
GO:0007005
GO:0008150


KeyError: 'GO:0048308'

In [None]:
from timeit import timeit
import re

def find(string, text):
    if string.find(text) > -1:
        pass

def re_find(string, text):
    if re.match(text, string):
        pass

def best_find(string, text):
    if text in string:
        pass

def third_find(string, text):
    if text.startswith(string):
        pass    

print(timeit("find(string, text)", "from __main__ import find; string='lookforme'; text='look'"))
print(timeit("re_find(string, text)", "from __main__ import re_find; string='lookforme'; text='look'")) 
print(timeit("best_find(string, text)", "from __main__ import best_find; string='lookforme'; text='look'")) 
print(timeit("third_find(string, text)", "from __main__ import third_find; string='lookforme'; text='look'")) 



In [None]:
# https://techoverflow.net/blog/2013/11/18/a-geneontology-obo-v1.2-parser-in-python/
# Import .obo file
oboPath = os.path.abspath('../data/go_data/go.obo')
# with open(oboPath,'r') as oboFile:
#     for line in oboFile:


def processGOTerm(goTerm):
    """
    In an object representing a GO term, replace single-element lists with
    their only member.
    Returns the modified object as a dictionary.
    """
    ret = dict(goTerm) #Input is a defaultdict, might express unexpected behaviour
    for key, value in ret.iteritems():
        if len(value) == 1:
            ret[key] = value[0]
    return ret

def parseGOOBO(filename):
    """
    Parses a Gene Ontology dump in OBO v1.2 format.
    Yields each 
    Keyword arguments:
        filename: The filename to read
    """
    with open(filename, "r") as infile:
        currentGOTerm = None
        for line in infile:
            line = line.strip()
            if not line: continue #Skip empty
            if line == "[Term]":
                if currentGOTerm: yield processGOTerm(currentGOTerm)
                currentGOTerm = defaultdict(list)
            elif line == "[Typedef]":
                #Skip [Typedef sections]
                currentGOTerm = None
            else: #Not [Term]
                #Only process if we're inside a [Term] environment
                if currentGOTerm is None: continue
                key, sep, val = line.partition(":")
                currentGOTerm[key].append(val.strip())
        #Add last term
        if currentGOTerm is not None:
            yield processGOTerm(currentGOTerm)

In [None]:
for goTerm in parseGOOBO(oboPath):
    termCounter += 1
