# Get gene semantic distance

### Imports, drive, directories

In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [3]:
!pip install goatools > /dev/null

from goatools.obo_parser import GODag
from goatools.associations import read_gaf
from goatools.semantic import TermCounts, deepest_common_ancestor

In [4]:
# Fetch the exp2GO repository; the notebook reads its inputs from the exp2GO/data/ folder.
!git clone https://github.com/sinc-lab/exp2GO.git

Cloning into 'exp2GO'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 98 (delta 22), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (98/98), done.


In [5]:
# Data folder inside the cloned repository: input files are read from it and the
# resulting distance CSV is written to it.
path = 'exp2GO/data/'

### Load data

In [11]:
# Species prefix; the version lookup below handles '/ara', '/dicty' and '/yeast'.
species = '/ara'
# Annotation (GAF) snapshot; the lookup below handles '/gafT-1', '/gafT0' and '/gafT1'.
gaf = '/gafT0'
# Ontology (OBO) snapshot; the lookup below handles '/oboT-1', '/oboT0' and '/oboT1'.
obo = '/oboT-1'

# Similarity measure handed to get_gene_distance: 'Res', 'Lin' or 'Rel'.
semantic_dist = 'Rel'
# How term-level values are reduced to one gene-pair distance: 'min', 'max', 'avg' or 'bma'.
dist_select = 'min'

In [12]:
# Lookup tables replacing the if/elif chains. An unsupported value now fails
# immediately with a clear message instead of leaving gaf_version / obo_version /
# spec_annot undefined (or silently reusing a value left over from a previous run).

# Gaf version per snapshot; dicty and yeast share the same release numbers
_GAF_VERSIONS_ARA = {'/gafT-1': '131', '/gafT0': '138', '/gafT1': '147'}
_GAF_VERSIONS_DICTY_YEAST = {'/gafT-1': '59', '/gafT0': '66', '/gafT1': '75'}
GAF_VERSIONS = {
    '/ara': _GAF_VERSIONS_ARA,
    '/dicty': _GAF_VERSIONS_DICTY_YEAST,
    '/yeast': _GAF_VERSIONS_DICTY_YEAST,
}

# Obo version per snapshot
OBO_VERSIONS = {
    '/oboT-1': '2016-06-01',
    '/oboT0': '2017-02-01',
    '/oboT1': '2017-11-01',
}

# Species annotation name
SPECIES_ANNOTATION_NAMES = {
    '/ara': 'arabidopsis',
    '/dicty': 'dicty',
    '/yeast': 'yeast',
}

if species not in GAF_VERSIONS:
  raise ValueError('Unsupported species %r; expected one of %s' % (species, sorted(GAF_VERSIONS)))
if gaf not in GAF_VERSIONS[species]:
  raise ValueError('Unsupported gaf %r; expected one of %s' % (gaf, sorted(GAF_VERSIONS[species])))
if obo not in OBO_VERSIONS:
  raise ValueError('Unsupported obo %r; expected one of %s' % (obo, sorted(OBO_VERSIONS)))

gaf_version = GAF_VERSIONS[species][gaf]
obo_version = OBO_VERSIONS[obo]
spec_annot = SPECIES_ANNOTATION_NAMES[species]

# GO subontology
subont = 'BP'

In [13]:
# Locations of the three inputs inside the data folder: the per-gene term table,
# the ontology (.obo) and the annotation file (.gaf).
terms = '{}{}_terms_gaf{}_{}_with_expr_EXP.csv'.format(path, species, gaf_version, subont)
obo_file = '{}gene_ontology_edit_{}.obo'.format(path, obo_version)
gaf_file = '{}goa_{}.gaf.{}'.format(path, spec_annot, gaf_version)

In [14]:
# Unpack the ontology archive into the data folder (-o overwrites without prompting).
filename = obo_file + '.zip'
!unzip -o -d $path $filename

# Decompress the annotation file in place.
# NOTE(review): gunzip normally removes the .gz once it is extracted, so re-running this
# cell without a fresh clone would presumably fail — confirm, and consider `gunzip -kf`.
filename = gaf_file + '.gz'
!gunzip $filename

Archive:  exp2GO/data/gene_ontology_edit_2016-06-01.obo.zip
  inflating: exp2GO/data/gene_ontology_edit_2016-06-01.obo  


### Semantic distance functions

In [15]:
# Get MICA dataframe and MICA IC dictionary for terms

def get_micas_and_ic(terms, gene_ontology, termcounts):
  """Build the pairwise MICA table for `terms` and the normalised IC of every MICA.

  Parameters
  ----------
  terms : sequence of GO ids; used as both the row and column labels of the table.
  gene_ontology : ontology object handed to `deepest_common_ancestor`.
  termcounts : object exposing `gosubdag.go2nt`, whose values carry a `tinfo` attribute.

  Returns
  -------
  (micas, micas_ic)
    micas : square, symmetric pandas DataFrame (object dtype) indexed by `terms` on
      both axes; each cell holds the result of `deepest_common_ancestor` for that pair.
    micas_ic : dict mapping each MICA found to `get_info_content(mica, ...)`, i.e. its
      `tinfo` divided by the largest `tinfo` in `termcounts`.
  """
  # get maximum ic term to normalize the ic values
  max_goterm_tinfo = max(nt.tinfo for nt in termcounts.gosubdag.go2nt.values())

  # micas dataframe and ic dictionary
  n_terms = len(terms)
  micas = pd.DataFrame(np.empty((n_terms, n_terms), dtype=object), columns=terms, index=terms)
  micas_ic = {}
  for i in range(n_terms):
    for j in range(i, n_terms):
      mica = deepest_common_ancestor([terms[i], terms[j]], gene_ontology)
      # .iat writes straight into the frame. The previous chained form
      # (micas.iloc[i][j] = ...) assigns through an intermediate Series, which is not
      # guaranteed to update the frame and never does under pandas Copy-on-Write.
      micas.iat[i, j] = mica
      micas.iat[j, i] = mica
      # the same MICA shows up for many pairs; compute its IC only once
      if mica not in micas_ic:
        micas_ic[mica] = get_info_content(mica, termcounts, max_goterm_tinfo)

  return (micas, micas_ic)

In [16]:
def get_gene_distance(gene_terms,
                      gene_ontology,
                      termcounts,
                      micas,
                      micas_ic,
                      similarity="Res",
                      sim_selection="max"):
  """Compute the pairwise semantic distance matrix between genes.

  Parameters
  ----------
  gene_terms : dict mapping gene name -> list of GO ids; the key order fixes the
    row/column order of the result.
  gene_ontology : mapping of GO id -> term object; terms missing from it are skipped.
  termcounts : object exposing `gosubdag.go2nt`, whose values carry `tinfo`.
  micas, micas_ic : accepted for interface compatibility; this function does not read
    them itself (resnik_sim looks up the notebook-level objects of the same names).
  similarity : "Res", "Lin" or "Rel" - term-level similarity measure.
  sim_selection : "max", "min", "avg" or "bma" - how term-level values are reduced to
    one gene-pair distance ("min" is 1 - highest similarity, "max" is
    1 - lowest similarity).

  Returns
  -------
  numpy.ndarray of shape (n_genes, n_genes), dtype object. Symmetric. A cell is the
  empty string '' when either gene has no terms.

  Raises
  ------
  ValueError if `similarity` or `sim_selection` is not one of the values above.
  """
  # The previous checks were written as `not (x != a or x != b ...)`, which is always
  # False, so bad arguments were never reported (and an unknown sim_selection silently
  # produced an all-zero matrix). Validate up front instead.
  if similarity not in ("Res", "Lin", "Rel"):
    raise ValueError("Not a valid similarity measure")

  if sim_selection not in ("max", "min", "avg", "bma"):
    raise ValueError("Not a valid sim selection method")

  gene_names = list(gene_terms.keys())
  n = len(gene_names)
  distance = np.zeros((n, n), dtype=object)

  # get maximum ic term to normalize the ic values
  max_goterm_tinfo = max(nt.tinfo for nt in termcounts.gosubdag.go2nt.values())

  # get similarities and distance
  sim_func = {"Res": resnik_sim, "Lin": lin_sim, "Rel": rel_sim}
  sim_measure = sim_func[similarity]

  # for each pair of genes get the term list
  for gene_row in range(n):
    set1 = gene_terms[gene_names[gene_row]]
    for gene_col in range(gene_row, n):
      set2 = gene_terms[gene_names[gene_col]]

      if len(set1) == 0 or len(set2) == 0:
        distance[gene_row, gene_col] = ''
        distance[gene_col, gene_row] = ''
        continue

      term_sim = np.zeros((len(set1), len(set2)))
      # for each pair of terms get the similarity.
      # The matrix is set1 x set2 and therefore NOT symmetric, so every cell has to be
      # computed; starting the inner loop at term_row left the lower triangle at 0.
      for term_row in range(len(set1)):
        for term_col in range(len(set2)):
          if set1[term_row] in gene_ontology and set2[term_col] in gene_ontology:
            # NOTE(review): sim_measure can return None (e.g. resnik_sim for terms in
            # different namespaces); confirm how such pairs should be scored.
            term_sim[term_row, term_col] = sim_measure(set1[term_row], set2[term_col], gene_ontology, termcounts, max_goterm_tinfo)

      if sim_selection == "min":
        # smallest distance == largest similarity
        pair_distance = 1 - np.max(term_sim)
      elif sim_selection == "max":
        # largest distance == smallest similarity
        pair_distance = 1 - np.min(term_sim)
      elif sim_selection == "avg":
        pair_distance = 1 - np.average(term_sim)
      else:  # "bma"
        # NOTE(review): this averages the LARGEST distance per row/column; a
        # best-match average would normally use the smallest distance - confirm intent.
        term_dist = np.ones((len(set1), len(set2))) - term_sim
        max_col = np.amax(term_dist, 0)
        max_row = np.amax(term_dist, 1)
        pair_distance = (np.average(max_col) + np.average(max_row))/2

      distance[gene_row, gene_col] = pair_distance
      distance[gene_col, gene_row] = pair_distance

  return distance

def get_info_content(go_id, termcounts, max_goterm_tinfo):
    """Return the `tinfo` of `go_id` divided by `max_goterm_tinfo`.

    The entry is looked up in `termcounts.gosubdag.go2nt`; when there is no (truthy)
    entry for `go_id` the result is 0.0.
    """
    record = termcounts.gosubdag.go2nt.get(go_id)
    if not record:
        return 0.0
    return record.tinfo/max_goterm_tinfo

def resnik_sim(go_id1, go_id2, godag, termcounts, max_goterm_tinfo):
    """Return the precomputed IC of the MICA of two GO terms, or None across namespaces.

    The value is read from the notebook-level `micas` table and `micas_ic` dictionary,
    so both must exist before this is called. `termcounts` and `max_goterm_tinfo` are
    not used here; they keep the signature parallel to the other similarity functions.
    """
    # terms from different namespaces are not compared
    if godag[go_id1].namespace != godag[go_id2].namespace:
        return None
    mica_id = micas.loc[go_id1][go_id2]
    return micas_ic[mica_id]

def lin_sim(goid1, goid2, godag, termcnts, max_goterm_tinfo, dfltval=None):
    """Lin similarity of two GO terms.

    Looks up the Resnik value with `resnik_sim` and passes it, together with `dfltval`,
    to `lin_sim_calc`, whose result is returned unchanged.
    """
    return lin_sim_calc(
        goid1,
        goid2,
        resnik_sim(goid1, goid2, godag, termcnts, max_goterm_tinfo),
        termcnts,
        max_goterm_tinfo,
        dfltval,
    )

def lin_sim_calc(goid1, goid2, sim_r, termcnts, max_goterm_tinfo, dfltval=None):
    """Compute Lin similarity from an already computed Resnik value.

    Parameters
    ----------
    goid1, goid2 : GO ids of the two terms.
    sim_r : Resnik value for the pair, or None (resnik_sim returns None for terms in
        different namespaces).
    termcnts : object exposing `gosubdag.go2nt` (for the IC lookup) and `go2obj`.
    max_goterm_tinfo : normalisation constant forwarded to `get_info_content`.
    dfltval : value returned when no similarity can be determined.

    Returns
    -------
    `2 * sim_r / (IC(goid1) + IC(goid2))` when that sum is non-zero; otherwise 1.0 if
    both ids resolve to the same term, 0.0 if `sim_r` is 0.0, and `dfltval` in every
    remaining case (including `sim_r is None`).
    """
    # If goid1 and goid2 are in the same namespace
    if sim_r is not None:
        tinfo1 = get_info_content(goid1, termcnts, max_goterm_tinfo)
        tinfo2 = get_info_content(goid2, termcnts, max_goterm_tinfo)
        info = tinfo1 + tinfo2
        # Both GO IDs must be annotated
        if info != 0:
            return (2*sim_r)/info
        if termcnts.go2obj[goid1].item_id == termcnts.go2obj[goid2].item_id:
            return 1.0
        # The GOs are separated by the root term, so are not similar
        if sim_r == 0.0:
            return 0.0
    # Previously the function fell off the end here and always returned None, silently
    # ignoring dfltval. With the default dfltval=None the result is unchanged.
    return dfltval

def rel_sim(goid1, goid2, godag, termcnts, max_goterm_tinfo):
  """Relevance similarity: the Lin similarity scaled by (1 - frequency of the MICA).

  Parameters
  ----------
  goid1, goid2 : GO ids of the two terms.
  godag : ontology used for the namespace check and the common-ancestor search.
  termcnts : object exposing `gosubdag.go2nt`, whose values carry `tfreq`.
  max_goterm_tinfo : normalisation constant forwarded to the IC lookups.

  Returns
  -------
  `sim_lin * (1 - p_mica)`, where `p_mica` is the `tfreq` of the deepest common
  ancestor (0 when it has no entry); None when either the Resnik or the Lin value is
  unavailable.
  """
  sim_res = resnik_sim(goid1, goid2, godag, termcnts, max_goterm_tinfo)
  if sim_res is None:
    return None
  # Reuse the Resnik value rather than calling lin_sim, which would look it up again.
  sim_lin = lin_sim_calc(goid1, goid2, sim_res, termcnts, max_goterm_tinfo)
  if sim_lin is None:
    return None

  mica_goid = deepest_common_ancestor([goid1, goid2], godag)
  # Use the termcnts argument; the previous code read the notebook-level `termcounts`
  # here, ignoring whatever the caller passed in.
  nt = termcnts.gosubdag.go2nt.get(mica_goid)
  p_mica = nt.tfreq if nt else 0
  return sim_lin * (1 - p_mica)

### Terms to dictionary

In [17]:
# Parse the ontology file, also loading the optional 'relationship' attribute of the terms.
gene_ontology = GODag(obo_file, optional_attrs={'relationship'})

# Read the annotations, restricted to the selected subontology.
ns = subont
assc_orig = read_gaf(gaf_file, namespace=ns, godag=gene_ontology)
# Keep, for every gene, only the annotated GO ids that exist in the loaded ontology.
assc = {}
goids_dag = set(gene_ontology.keys())
for gene, goids_cur in assc_orig.items():
  assc[gene] = goids_cur.intersection(goids_dag)

# Term counts; the similarity functions read the tinfo / tfreq values from this object.
# NOTE(review): relationships is passed as the bare string 'part_of' — confirm goatools
# accepts a string here rather than a set such as {'part_of'}.
termcounts = TermCounts(gene_ontology, assc, relationships='part_of')

exp2GO/data/gene_ontology_edit_2016-06-01.obo: fmt(1.2) rel(2016-05-31) 44,636 GO Terms; optional_attrs(relationship)
HMS:0:00:04.844250 150,495 annotations READ: exp2GO/data/goa_arabidopsis.gaf.138 
17838 IDs in loaded association branch, BP


In [18]:
# Get dictionary with gene terms and set of terms.
# Each CSV row is: gene name, followed by its GO ids. Ids that are not present in the
# loaded ontology are dropped.
gene_terms = dict()
all_terms = set()
# newline='' is the mode the csv module documents for files handed to csv.reader
with open(terms, 'r', newline='') as file:
  reader = csv.reader(file)
  for row in reader:
    # csv.reader yields [] for blank lines; row[0] would raise IndexError on them
    if not row:
      continue
    term_list = [term for term in row[1:] if term in gene_ontology]
    gene_terms[row[0]] = term_list
    # grow the set in place instead of rebuilding it with union() on every row
    all_terms.update(term_list)
all_terms = sorted(all_terms)

### Distance

In [19]:
# Precompute the MICA table and MICA information contents for every term in use,
# then derive the gene-by-gene distance matrix with the configured measure.
micas, micas_ic = get_micas_and_ic(all_terms, gene_ontology, termcounts)
dist_array = get_gene_distance(
    gene_terms,
    gene_ontology,
    termcounts,
    micas,
    micas_ic,
    similarity=semantic_dist,
    sim_selection=dist_select,
)

In [20]:
# Add the gene names as a first column and a first row ('' in the top-left corner).
# The labelled matrix gets its own name: the previous version rebound dist_array to its
# own output, so running the cell a second time no longer operated on the raw matrix.
gene_names = list(gene_terms.keys())
dist_labeled = np.hstack((np.asarray([gene_names]).T, dist_array))
dist_labeled = np.vstack(([''] + gene_names, dist_labeled))
df_dist_array = pd.DataFrame(dist_labeled)

In [22]:
# Write the labelled matrix to the data folder. The frame already carries the gene names
# in its first row and column, so neither the pandas index nor header is written.
file_name = '{}_semantic_dist_{}_{}_{}_exp.csv'.format(
    species, subont, semantic_dist.lower(), dist_select)
output_csv = path + file_name
df_dist_array.to_csv(output_csv, index=None, header=None)