# Environment

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
!pip install goatools > /dev/null

from goatools.obo_parser import GODag 
from goatools.gosubdag.gosubdag import GoSubDag

from goatools.anno.gaf_reader import GafReader
from goatools.godag.go_tasks import get_go2parents

In [3]:
!git clone https://github.com/sinc-lab/exp2GO.git

Cloning into 'exp2GO'...
remote: Enumerating objects: 95, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 95 (delta 21), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (95/95), done.


In [4]:
# Data folder inside the cloned exp2GO repository; the cells below build every file name by prefixing it.
path = 'exp2GO/data/'

# Reference ontology and annotations

Ontologies downloaded from: ftp://ftp.geneontology.org/go/ontology-archive/

Annotation files (GAF), Arabidopsis: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/old/ARABIDOPSIS/

Annotation files (GAF), yeast: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/old/YEAST/

Annotation files (GAF), all UniProt: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/old/UNIPROT/


In [5]:
# Species selector. Active choice is 'ara'; the other options are 'dicty' and 'yeast'.
species = 'ara' # <<<<<<<<<<<<<<<<<<<<<<<<<<<<
#species = 'dicty'
#species = 'yeast'

# species_full is the name used inside the GAF file names; gafT_1 / gafT0 / gafT1 are
# the GAF release numbers (as strings) that the later cells append to those file names.
if species == 'ara':
    species_full = 'arabidopsis'
    gafT_1, gafT0, gafT1 = (
        '131',  # 2016-06-06 (156 full uniprot)
        '138',  # 2017-02-12 (162 full uniprot)
        '147',  # 2017-11-20 (172 full uniprot)
    )
else:
    # For every other species the short name is also the name used in the files.
    species_full = species
    gafT_1, gafT0, gafT1 = '59', '66', '75'

In [7]:
#gafnumb = gafT_1
gafnumb = gafT0
#gafnumb = gafT1

filename = path + 'goa_' + species_full + '.gaf.' + gafnumb + '.gz'
!gunzip $filename

In [9]:
# Sub-ontology (namespace) to work with; also used later to name the output file.
subont = 'BP'

# Path of the decompressed GAF file (same name as the archive, without '.gz').
gafname = path + 'goa_' + species_full + '.gaf.' + gafnumb

gaf = GafReader(gafname)
# get_ns2ntsanno() is keyed by namespace; keep only the annotations of the selected one.
annots = gaf.get_ns2ntsanno()[subont]

HMS:0:00:03.717834 150,495 annotations READ: exp2GO/data/goa_arabidopsis.gaf.138 


In [20]:
obo_date = '2016-06-01' # FIXED in T-1, best match with CAFA3 T-1 annotations
# All propagations must be made *with the same OBO*, otherwise changes are incorporated in the annotations due to differences in the parents.

filename = path + 'gene_ontology_edit_'+obo_date+'.obo.zip'
!unzip -o -d $path $filename

godag=GODag(path+'gene_ontology_edit_'+obo_date+'.obo', optional_attrs={'relationship'})

Archive:  exp2GO/data/gene_ontology_edit_2016-06-01.obo.zip
  inflating: exp2GO/data/gene_ontology_edit_2016-06-01.obo  
exp2GO/data/gene_ontology_edit_2016-06-01.obo: fmt(1.2) rel(2016-05-31) 44,636 GO Terms; optional_attrs(relationship)


# List of genes to process

In [22]:
# Read only the first column of the distance-matrix file and use it as the index;
# the index values are the gene identifiers.
distances_file = path + species + '_expression_dist_cosine.csv.zip'
genes = pd.read_csv(distances_file, usecols=[0], index_col=0)
gene_names = list(genes.index)
display(len(gene_names))

if species == 'ara':
    display(gene_names[:3])
    # Keep the first character and lower-case the rest (AT2G26510 -> At2g26510);
    # presumably this is the spelling used in the GAF synonyms - TODO confirm.
    gene_names = [name[0] + name[1:].lower() for name in gene_names]
    display(gene_names[:3])

1546

['AT2G26510', 'AT2G39795', 'AT2G39780']

['At2g26510', 'At2g39795', 'At2g39780']

# Annotations

In [23]:
def dict2csv(dt, file_name):
    """Write a gene -> terms mapping as a comma-separated text file.

    Each output row holds the gene name followed by its terms. Rows with fewer
    terms are padded with empty fields so that every row has the same number of
    columns.

    Parameters
    ----------
    dt : dict
        Maps each gene name to an iterable of terms (list, tuple, ...).
    file_name : str
        Path of the file to write.

    Notes
    -----
    An empty ``dt`` produces an empty file instead of raising ``ValueError``.
    Fields are written verbatim (no quoting), so values must not contain commas.
    """
    # gene name and terms in each row
    gene_terms_save = [[gene] + list(terms) for gene, terms in dt.items()]

    if not gene_terms_save:
        # Nothing to write: max() below would fail on an empty sequence.
        open(file_name, 'w').close()
        return

    maxlen = max(map(len, gene_terms_save))
    # append blanks to fulfill columns (Tiago)
    gene_terms_array = np.array(
        [row + [''] * (maxlen - len(row)) for row in gene_terms_save]
    )

    np.savetxt(file_name, gene_terms_array, delimiter=',', fmt='%s')

In [24]:
# Evidence codes that are accepted; an empty list disables the evidence filter.
experimental = ['EXP','IDA','IPI','IMP','IGI','IEP','TAS','IC']
#experimental = ['EXP']
#experimental = [] # << ignore the experimental evidence code
gene_terms_exp = {}  # gene name -> sorted list of unique GO IDs that passed the filter
notann = []          # genes that end up with no GO term at all
for gene in gene_names:
  gene_found = False
  terms = []
  for gene_anno in annots:
    if gene in gene_anno.DB_Synonym:
      # Set the flag only on a real match. It used to be set for every annotation,
      # so the "not found" warning below could never be printed.
      gene_found = True
      if experimental == [] or gene_anno.Evidence_Code in experimental:
        terms.append(gene_anno.GO_ID)

  if not gene_found:
    print(gene, 'not found in DB_Synonym')

  # De-duplicate and sort; plain Python strings are stored (not numpy scalars).
  gene_terms_exp[gene] = sorted(set(terms))

  if not gene_terms_exp[gene]:
#    gene_terms_exp[gene].append('GO:0008150') # add GO root for unannotated genes
    notann.append(gene)

# Note: this counts genes left without any accepted GO term, which includes genes that
# are present in the GAF file but only with other evidence codes.
print('Genes not found: {}/{}'.format(len(notann), len(gene_names)))

Genes not found: 890/1546


In [26]:
# Output file name encodes species, GAF release and sub-ontology.
terms_csv = path + species + '_terms_gaf' + str(gafnumb) + '_' + subont + '_with_expr_EXP.csv'
dict2csv(gene_terms_exp, terms_csv)