# Environment

In [1]:
import numpy as np
import pandas as pd

In [2]:
!pip install goatools > /dev/null

from goatools.obo_parser import GODag 
from goatools.gosubdag.gosubdag import GoSubDag

from goatools.anno.gaf_reader import GafReader
from goatools.godag.go_tasks import get_go2parents

In [3]:
!git clone https://github.com/sinc-lab/exp2GO.git

Cloning into 'exp2GO'...
remote: Enumerating objects: 155, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 155 (delta 52), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (155/155), 38.84 MiB | 14.77 MiB/s, done.
Resolving deltas: 100% (52/52), done.


In [4]:
# Base directory for the data files read and written by this notebook.
path = "exp2GO/data/"

In [5]:
species = 'ara'  # one of: ara, dicty, yeast

# Per-species settings: full name used in the GAF file names, plus the GAF
# release numbers for the T-1 and T0 snapshots.
#   arabidopsis: 131 -> 2016-06-06 (156 full uniprot), 138 -> 2017-02-12 (162 full uniprot)
#   any other species uses its short name and releases 59 / 66
species_full, gafT_1, gafT0 = (
    ('arabidopsis', '131', '138') if species == 'ara' else (species, '59', '66')
)

In [6]:
GAF = 'gafT-1/'; gafnumb = gafT_1
#GAF = 'gafT0/'; gafnumb = gafT0

filename = path + 'goa_' + species_full + '.gaf.' + gafnumb + '.gz'
!gunzip $filename

In [7]:
# Read the uncompressed GAF file and keep only the annotations of the 'BP' namespace.
subont = 'BP'
gafname = f'{path}goa_{species_full}.gaf.{gafnumb}'

gaf = GafReader(gafname)
annots = gaf.get_ns2ntsanno()[subont]

HMS:0:00:03.777067 141,051 annotations READ: exp2GO/data/goa_arabidopsis.gaf.131 


In [9]:
OBO = 'oboT-1/'; obo_date = '2016-06-01' # FIXED in T-1, best match with CAFA3 T-1 annotations
# All propagations must be made *with the same OBO*, otherwise changes are incorporated in the annotations due to differences in the parents.

filename = path + 'gene_ontology_edit_'+obo_date+'.obo.zip'
!unzip -o -d $path $filename

godag=GODag(path+'gene_ontology_edit_'+obo_date+'.obo', optional_attrs={'relationship'})

Archive:  exp2GO/data/gene_ontology_edit_2016-06-01.obo.zip
  inflating: exp2GO/data/gene_ontology_edit_2016-06-01.obo  
exp2GO/data/gene_ontology_edit_2016-06-01.obo: fmt(1.2) rel(2016-05-31) 44,636 GO Terms; optional_attrs(relationship)


# List of genes to process

In [10]:
# Only the first CSV column is loaded; it becomes the index holding the gene identifiers.
genes = pd.read_csv(
    f'{path}{species}_expression_dist_cosine.csv.zip',
    usecols=[0],
    index_col=0,
)
gene_names = list(genes.index)
display(len(gene_names))

if species == 'ara':
    # Keep the first character and lowercase the rest, e.g. AT2G26510 -> At2g26510.
    display(gene_names[:3])
    gene_names = [name[0] + name[1:].lower() for name in gene_names]
    display(gene_names[:3])

1546

['AT2G26510', 'AT2G39795', 'AT2G39780']

['At2g26510', 'At2g39795', 'At2g39780']

# Annotations

In [11]:
# Load the annotations generated with 02_annotations and filtered with 03_filter_terms (T-1).
# The file has no header row; the first column (gene) is used as the index.
annot = pd.read_csv(
    f'{path}{species}_terms_gaf{gafnumb}_{subont}_with_expr_EXP_FILTERED_terms.csv',
    index_col=0,
    header=None,
    dtype=str,
)

# gene -> list of its terms, with the NaN padding of shorter rows removed
gene_terms_exp = {gene: list(annot.loc[gene].dropna()) for gene in annot.index}

# Ancestors

In [12]:
def dict2csv(dt, file_name):
    """Write a ``{gene: [terms]}`` mapping to a comma-separated text file.

    Each row contains the gene name followed by its terms. Rows with fewer
    terms are padded with empty fields so that all rows have the same number
    of columns.

    Parameters
    ----------
    dt : dict
        Maps a gene name to a sequence (list, tuple, ...) of terms; the
        sequence may be empty.
    file_name : str or file handle
        Destination, handed unchanged to ``np.savetxt``.

    Notes
    -----
    An empty ``dt`` produces an empty file instead of raising ``ValueError``
    from ``max()`` on an empty sequence.
    """
    # gene name and terms in each row
    rows = [[gene, *terms] for gene, terms in dt.items()]

    if not rows:
        # Nothing to write: emit a zero-row table so the output file still exists.
        np.savetxt(file_name, np.empty((0, 1), dtype=str), delimiter=',', fmt='%s')
        return

    # append blanks so every row has as many columns as the longest one
    width = max(map(len, rows))
    table = np.array([row + [''] * (width - len(row)) for row in rows])

    np.savetxt(file_name, table, delimiter=',', fmt='%s')

In [13]:
# Term lists for which no sub-DAG is built: no terms at all, or only an ontology root.
skip_term_lists = (
    [],
    ['GO:0008150'],  # root BP
    ['GO:0005575'],  # root CC
    ['GO:0003674'],  # root MF
)

gene_ancestors = {}
for i, gene in enumerate(gene_terms_exp):
    if i % 100 == 0:
        print('--->', i,'/',len(gene_terms_exp))

    if gene_terms_exp[gene] in skip_term_lists:
        gene_ancestors[gene] = []
        continue

    # 'is_a' is included by default; 'part_of' is followed in addition
    gosubdag_r0 = GoSubDag(gene_terms_exp[gene], godag, prt=None, relationships={'part_of'})
    # unique GO ids of go2parents, sorted
    gene_ancestors[gene] = sorted(set(gosubdag_r0.rcntobj.go2parents))

---> 0 / 656
---> 100 / 656
---> 200 / 656
---> 300 / 656
---> 400 / 656
---> 500 / 656
---> 600 / 656


In [14]:
# Save the per-gene ancestor lists next to the input data; the file name records
# the GAF release, the OBO date and the sub-ontology used.
dict2csv(
    gene_ancestors,
    f'{path}{species}_terms_gaf{gafnumb}_obo{obo_date}_{subont}_with_expr_EXP_FILTERED_terms_anc.csv',
)