# Get ontology data from OBO files

In [1]:
!which python

/Users/kanak/elucidata/envs/kg_env/bin/python


In [None]:
!pip3 install pronto --user

In [None]:
!pip3 install beautifulsoup4

In [3]:
import pronto
import pandas as pd

In [None]:
#Run on terminal
!polly files sync --workspace-id 9475 --source polly:// --destination ./

### BTO (Brenda Tissue Ontology) - Tissue

In [4]:
#Read obo file
bto = pronto.Ontology("/Users/kanak/Downloads/tissue.obo")

### Fetch node properties for tissue

In [5]:
bto_id = []
bto_name = []
bto_def = []
bto_syn = []

for terms in bto.terms():
    temp = []
    if('tissue' in list(terms.subsets)[0]):
        bto_id.append(str(terms.id))
        bto_name.append(str(terms.name))
        bto_def.append(str(terms.definition))
        syn = list(terms.synonyms)
        for s in syn:
            temp.append(str(str(s).split("'")[1]))
        bto_syn.append(','.join(temp))
    else:
        continue
        
bto_df = pd.DataFrame(list(zip(bto_id, bto_name, bto_def, bto_syn)), columns = ['tissue_id', 'name', 'definition', 'synonyms'])
bto_df.shape

(1789, 4)

### Fetch relationships for tissue

In [6]:
bto_sub = []
bto_id = []

for terms in bto.terms():
    temp = []
    if('tissue' in list(terms.subsets)[0]):
        sub = terms.subclasses(with_self = False, distance = 1)
        for s in sub:
            temp.append(str(s.id))
        bto_id.append(terms.id)
        bto_sub.append(temp)
    else:
        continue
        
sub_df = pd.DataFrame(list(zip(bto_id, bto_sub)), columns = ['tissue', 'subclass'])
sub_df = sub_df.explode('subclass')
sub_df = sub_df.dropna(subset= ['subclass'])
sub_df.shape

(903, 2)

In [8]:
rels = []
for r in bto.relationships():
    rels.append(r.id)
    
print(rels)

['bearer_of', 'causually_influences', 'contained_in', 'develops_from', 'disease_arises_from_structure', 'disease_causes_dysfunction_of', 'part_of', 'produced_by', 'realized_in']


In [9]:
bto_part = []
bto_rel = []
bto_id = []

for terms in bto.terms():
    if('tissue' in list(terms.subsets)[0]):
        for rel in rels:
            if rel in str(list(terms.relationships)):
                try:
                    ts = terms.relationships[bto.get_relationship(rel)]
                    bto_id.append(str(terms.id))
                    bto_rel.append(rel)
                    bto_part.append(str(list(ts)[0].id))
                except:
                    print(terms.id)
            else:
                continue
        else:
            continue
        
rel_df = pd.DataFrame(list(zip(bto_id, bto_rel ,bto_part)), columns = ['tissue_id', 'relation', 'target'])
rel_df = rel_df.explode('target')
rel_df = rel_df.dropna(subset= ['target'])
rel_df.shape

(1225, 3)

In [None]:
for rel in rels:
    df = rel_df[rel_df['relation'] == rel]
    df.to_csv(f"graph_data/tissue__{rel}__relation.csv", index = False)

In [None]:
bto_df.to_csv('graph_data/tissue__nodes.csv', index=False)
sub_df.to_csv('graph_data/tissue__subclass.csv', index=False)

### Cell Type Ontology - BTO

In [11]:
#Read obo file
cto = pronto.Ontology( "/Users/kanak/Downloads/tissue.obo")

In [12]:
cto_id = []
cto_name = []
cto_def = []
cto_syn = []

for terms in cto.terms():
    temp = []
    if('cell_type' in list(terms.subsets)[0]):
        cto_id.append(str(terms.id))
        cto_name.append(str(terms.name))
        cto_def.append(str(terms.definition))
        syn = list(terms.synonyms)
        for s in syn:
            temp.append(str(str(s).split("'")[1]))
        cto_syn.append(','.join(temp))
    else:
        continue
        
cto_df = pd.DataFrame(list(zip(cto_id, cto_name, cto_def, cto_syn)), columns = ['cell_type_id', 'name', 'definition', 'synonyms'])
cto_df.shape

(1067, 4)

In [13]:
cto_sub = []
cto_id = []

for terms in cto.terms():
    temp = []
    if('cell_type' in list(terms.subsets)[0]):
        sub = terms.subclasses(with_self = False, distance = 1)
        for s in sub:
            temp.append(str(s.id))
        cto_id.append(terms.id)
        cto_sub.append(temp)
    else:
        continue
        
sub_df = pd.DataFrame(list(zip(cto_id, cto_sub)), columns = ['cell_type_id', 'subclass'])
sub_df = sub_df.explode('subclass')
sub_df = sub_df.dropna(subset= ['subclass'])
sub_df.shape

(712, 2)

In [14]:
cell_type_rels = set()
for terms in cto.terms():
    if('cell_type' in list(terms.subsets)[0]):
        for rel in cto.relationships():
            if rel.id in str(list(terms.relationships)):
                cell_type_rels.add(rel.id)

cell_type_rels

{'develops_from',
 'disease_arises_from_structure',
 'disease_causes_dysfunction_of',
 'part_of',
 'realized_in'}

In [15]:
cto_part = []
cto_rel = []
cto_id = []

for terms in cto.terms():
    if('cell_type' in list(terms.subsets)[0]):
        for rel in cell_type_rels:
            if rel in str(list(terms.relationships)):
                try:
                    ts = terms.relationships[cto.get_relationship(rel)]
                    cto_id.append(str(terms.id))
                    cto_rel.append(rel)
                    cto_part.append(str(list(ts)[0].id))
                except:
                    print(terms.id)
            else:
                continue
        else:
            continue
        
rel_df = pd.DataFrame(list(zip(cto_id, cto_rel ,cto_part)), columns = ['cell_type_id', 'relation', 'target'])
rel_df = rel_df.explode('target')
rel_df = rel_df.dropna(subset= ['target'])
rel_df.shape

BTO:0001413
BTO:0003801


(498, 3)

In [None]:
for rel in cell_type_rels:
    df = rel_df[rel_df['relation'] == rel]
    df.to_csv(f"graph_data/cell_type__{rel}__relation.csv", index = False)

In [None]:
cto_df.to_csv('graph_data/cell_type__nodes.csv', index=False)
sub_df.to_csv('graph_data/cell_type__subclass.csv', index=False)

## Cell Line ontology

In [None]:
clo_v3 = pronto.Ontology('ontologies/obo/cellosaurus-edited-v3.obo')

In [None]:
clo_id = []
clo_name = []
clo_category = []
clo_syn = []
clo_gender = []

gender_terms = ['Male','Female','Mixed_sex','Sex_ambiguous','Sex_unspecified']

for terms in clo_v3.terms():
    temp = []
    clo_id.append(str(terms.id))
    clo_name.append(str(terms.name))
    if len(list(terms.subsets)) == 2:
        if str(list(terms.subsets)[0]) in gender_terms:
            clo_gender.append(str(list(terms.subsets)[0]).replace('_',' '))
            clo_category.append(str(list(terms.subsets)[1]).replace('_',' '))
        elif str(list(terms.subsets)[1]) in gender_terms:
            clo_gender.append(str(list(terms.subsets)[1]).replace('_',' '))
            clo_category.append(str(list(terms.subsets)[0]).replace('_',' '))
    else:
        if str(list(terms.subsets)[0]) in gender_terms:
            clo_gender.append(str(list(terms.subsets)[0]).replace('_',' '))
            clo_category.append(" ")
        else:
            clo_gender.append(" ")
            clo_category.append(str(list(terms.subsets)[0]).replace('_',' '))

    syn = list(terms.synonyms)
    for s in syn:
        temp.append(str(str(s).split("'")[1]))
    clo_syn.append(','.join(temp))
    
# print('id:',len(clo_id))
# print('name:',len(clo_name))
# print('gender:',len(clo_gender))
# print('category:',len(clo_category))
# print('synonyms:',len(clo_syn))
        
class_df = pd.DataFrame(list(zip(clo_id, clo_name, clo_gender,clo_category, clo_syn)), columns = ['cell_line_id', 'name', 'gender','category', 'synonyms'])
class_df

In [None]:
set(class_df['category'])
# set(class_df['gender'])

In [None]:
#extracting cell_line-disease from another obo file
clo_2 = pronto.Ontology('ontologies/obo/cell_line.obo')

clo_dis = []
clo_id = []        

for term in clo_2.terms():
    for xref in term.xrefs:
        if xref.id.startswith("MESH"):
            clo_id.append(term.id)
            clo_dis.append(xref.id)
        else:
            continue
            
dis = pd.DataFrame(list(zip(clo_id, clo_dis)), columns = ['cell_line_id', 'disease'])   
dis

In [None]:
dis.to_csv('graph_data/cell_line__disease.csv', index=False)

### Linking Tissue to Cell-Line Ontology

In [None]:
cell_df = pd.read_csv('tissue_cell_line.csv')
cell_df

In [None]:
#Clean tissue column
#Create a secondary tissue column
cell_df['secondary_tissue'] = cell_df['Tissue'].str.split(';').str[1]
cell_df['secondary_tissue'] = cell_df['secondary_tissue'].str.strip(' ')
#Tissue column only includes the primary tissue terms
cell_df['Tissue'] = cell_df['Tissue'].str.split(';').str[0]

In [None]:
#Clean tissue column
for index, row in cell_df.iterrows():
    if '=' in str(row['Tissue']):
        row['Tissue'] = ''

In [None]:
#Subset using CLO
cell_df = cell_df[cell_df['ACC'].isin(class_df['cell_line_id'])]
#drop Na in Tissue
cell_df = cell_df.dropna(subset=['Tissue'])
cell_df['Tissue'] = cell_df['Tissue'].apply(lambda x: x.lower())

In [None]:
cell_df

In [None]:
#Map tissue terms to BTO

bto_clo = bto_df.merge(cell_df, how='inner', left_on='name', right_on='Tissue')
bto_clo = bto_clo[['tissue_id', 'ACC']]
bto_clo.columns = ['tissue', 'cell_line']
bto_clo

In [None]:
bto_clo.to_csv('graph_data/tissue__cell_line.csv', index=False)

In [None]:
rels_clo = []
for r in clo_v3.relationships():
    rels_clo.append(r.id)
    
rels_clo

In [None]:
clo_tar = []
clo_rel = []
clo_id = []

for terms in clo_v3.terms():
    for rel in rels_clo:
        if rel in str(list(terms.relationships)):
            ts = terms.relationships[clo_v3.get_relationship(rel)]
            clo_id.append(str(terms.id))
            clo_rel.append(rel)
            clo_tar.append(str(list(ts)[0].id))
        else:
            continue
    else:
        continue
        
rel_df = pd.DataFrame(list(zip(clo_id, clo_rel ,clo_tar)), columns = ['cell_line_id', 'relation', 'target'])
rel_df = rel_df.explode('target')
rel_df = rel_df.dropna(subset= ['target'])
rel_df

In [None]:
for rel in rels_clo:
    df = rel_df[rel_df['relation'] == rel]
    df.to_csv(f"graph_data/cell_line__{rel}__relation.csv", index = False)

In [None]:
class_df.to_csv('graph_data/cell_line__nodes.csv', index=False)

## MeSH Ontology

In [None]:
mesh = pronto.Ontology('ontologies/obo/disease.obo')

In [None]:
mesh_id = []
mesh_name = []
mesh_syn = []

for terms in mesh.terms():
    temp = []
    mesh_id.append(str(terms.id))
    mesh_name.append(str(terms.name))
    syn = list(terms.synonyms)
    for s in syn:
        temp.append(str(str(s).split("'")[1]))
    mesh_syn.append(','.join(temp))
        
class_df = pd.DataFrame(list(zip(mesh_id, mesh_name, mesh_syn)), columns = ['id', 'name', 'synonyms'])
class_df

In [None]:
mesh_sub = []
mesh_id = []

for terms in mesh.terms():
    temp = []
    mesh = terms.subclasses(with_self = False, distance = 1)
    for m in mesh:
        temp.append(str(m.id))
    mesh_id.append(terms.id)
    mesh_sub.append(temp)
    
        
sub_df = pd.DataFrame(list(zip(mesh_id, mesh_sub)), columns = ['disease', 'subclass'])
sub_df = sub_df.explode('subclass')
sub_df = sub_df.dropna(subset= ['subclass'])
sub_df

In [None]:
class_df.to_csv('graph_data/disease__nodes.csv', index=False)
sub_df.to_csv('graph_data/disease__subclass.csv', index=False)

### Disease Gene Associations

In [None]:
import math
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from multiprocessing import Process,Manager
from harmonizomeapi import Harmonizome, Entity

In [None]:
def return_json(addr):
	url = base_url + addr
	response = requests.get(url)

	if response.status_code == 200:
		try:
			data = json.loads(response.text)
			return data
		except:
			return " "
	
	else:
		print("Error")
		print(addr)



def get_mesh_id(disease):
    try:
        mesh_request = requests.get("http://id.nlm.nih.gov/mesh/lookup/descriptor",params={'label':disease})
        mesh_response = mesh_request.json()[0]['resource'].split('/')
        mesh_id = mesh_response[len(mesh_response) - 1]
        return mesh_id
    except:
        print('Error while finding in disease - ', disease)
    return None


def convert_name_to_url(name):
	
	def Convert(string):
		list1=[]
		list1[:0]=string
		return list1

	name_lst1 = Convert(name)
	name_lst2 = name_lst1[:]

	for i in range(len(name_lst1)):
		if name_lst1[i] == ',':
			name_lst1[i] = '%'
			name_lst2[i] = '%2C'

		elif name_lst1[i] == " ":
			name_lst1[i] = "+"
			name_lst2[i] = "+"

	url_name1 = ""
	for i in name_lst1:
		url_name1 += i

	url_name2 = ""
	for i in name_lst2:
		url_name2 += i


	url1 = "https://maayanlab.cloud/Harmonizome/gene_set/" + url_name1 + "/CTD+Gene-Disease+Associations"
	url2 = "https://maayanlab.cloud/Harmonizome/gene_set/" + url_name2 + "/CTD+Gene-Disease+Associations"

	html2 = requests.get(url2)

	if html2.status_code == 200:
		return url2
	else:
		return url1

def find_mesh_id(url):
	html_text = requests.get(url).text
	soup = BeautifulSoup(html_text,'html.parser')

	for link in soup.find_all('a'):
		link_url = link.get('href')
		if "ctdbase.org" in link_url:
			mesh_id = link_url.split('=')[-1]
			# mesh, ID = mesh_id.split(':')
			return mesh_id


In [None]:
def disease_gene_associations(genesets,num,return_dict):
    dis_gene_dict = {'Disease_Name':[],
                    'ID':[],
                    'Associated_Gene_Symbols':[]}
    
    for geneset in genesets:
        disease_name = geneset['name'].split('/')[0]
        dis_gene_dict['Disease_Name'].append(disease_name)
        
        gene_info = return_json(geneset['href'])
        genes = ""
        for j in range(len(gene_info['associations'])):
            genes += gene_info['associations'][j]['gene']['symbol']
            if j != len(gene_info['associations'])-1:
                genes += ','
        dis_gene_dict['Associated_Gene_Symbols'].append(genes)
        
        url = convert_name_to_url(disease_name)
        mesh_id = find_mesh_id(url)
        dis_gene_dict['ID'].append(mesh_id)
        
#     return dis_gene_dict
    return_dict[num]=dis_gene_dict

In [None]:
dataset_lst = Harmonizome.get(Entity.DATASET)

base_url = "https://maayanlab.cloud/Harmonizome"

# The dataset CTD - Disease Gene Associations
dataset = return_json(dataset_lst['entities'][24]['href'])['geneSets']

In [None]:
num_processes = 15

return_dict = Manager().dict()
jobs = []

for i in range(num_processes):
    lb = i*math.ceil(len(dataset)/num_processes)
    up = min((i+1)*math.ceil(len(dataset)/num_processes),len(dataset))
    
    p = Process(target=disease_gene_associations,args=(dataset[lb:up],i,return_dict))
    jobs.append(p)
    p.start()
    
for process in jobs:
    print()
    process.join()


In [None]:
for process in jobs:
    print(process)

In [None]:
df_lst = []

for i in range(len(return_dict)):
    df_lst.append(pd.DataFrame(return_dict[i]))
    
df = pd.concat(df_lst)

In [None]:
def to_list(row):
	# print(row)
	if str(row['Associated_Gene_Symbols']) != 'nan':
		return row['Associated_Gene_Symbols'].split(',')
	else:
		return []

df['Associated_Genes'] = df.apply(to_list,axis=1)
df = df[['ID','Associated_Genes']]


In [None]:
mesh_dis = pd.read_csv("mesh_diseases.csv")
dis = mesh_dis[['ID']]

final_df = pd.merge(df,dis,on='ID',how='inner')
final_df = final_df.explode('Associated_Genes')
final_df = final_df.dropna(subset=['Associated_Genes'])
final_df.columns = ['disease_id','gene']
final_df.to_csv("graph_data/disease__gene.csv",sep='\t',index=False)

### Gene Properties

In [None]:
genes = Harmonizome.get(Entity.GENE)

In [None]:
gene_lst = []

for i in range(568):
    for j in range(len(genes['entities'])):
        gene_lst.append(genes['entities'][j]['symbol'])
    if i != 567:
        genes = Harmonizome.next(genes)

In [None]:
def return_json(url):
	response = requests.get(url)

	if response.status_code == 200:
		data = json.loads(response.text)
		return data
    
	else:
		return {"symbol":"",
                "synonyms":[],
                "name":"",
                "description":"",
                "ncbiEntrezGeneId":-1,
                "ncbiEntrezGeneUrl":"",
                "proteins":[],
                "hgncRootFamilies":[]}

In [None]:
def gene_properties(gene_lst,num,return_dict):
    base_url = "https://maayanlab.cloud/Harmonizome/api/1.0/gene/"
    
    gene_prop_dict = {"Gene_Symbol":[],
                     "Synonyms":[],
                     "Gene_Name":[],
                     "Description":[],
                     "NcbiEntrezGeneId":[],
                     "NcbiEntrezGeneUrl":[],
                     "Proteins":[],
                     "HgncRootFamilies":[]}
    
    for gene in gene_lst:
        url = base_url + gene
        gene_props = return_json(url)
        gene_prop_dict['Gene_Symbol'].append(gene)
        
        synonyms = ""
        if gene_props['synonyms'] != []:
            for i in range(len(gene_props['synonyms'])):
                synonyms += gene_props['synonyms'][i]
                if i != len(gene_props['synonyms'])-1:
                    synonyms += ','
        gene_prop_dict['Synonyms'].append(synonyms)
        
        gene_prop_dict['Gene_Name'].append(gene_props['name'])
        gene_prop_dict['Description'].append(gene_props['description'])
        gene_prop_dict['NcbiEntrezGeneId'].append(gene_props['ncbiEntrezGeneId'])
        gene_prop_dict['NcbiEntrezGeneUrl'].append(gene_props['ncbiEntrezGeneUrl'])
        
        proteins = ""
        if gene_props['proteins'] != []:
            for i in range(len(gene_props['proteins'])):
                proteins += gene_props['proteins'][i]['symbol']
                if i != len(gene_props['proteins'])-1:
                    proteins += ','
        gene_prop_dict['Proteins'].append(proteins)
                

        root_family = ""
        if gene_props['hgncRootFamilies'] != []:
            for i in range(len(gene_props['hgncRootFamilies'])):
                root_family += gene_props['hgncRootFamilies'][i]['name']
                if i != len(gene_props['hgncRootFamilies'])-1:
                    root_family += ','
        gene_prop_dict['HgncRootFamilies'].append(root_family)
        
        
    return_dict[num] = gene_prop_dict
        
        

In [None]:
dis_gene = pd.read_csv("graph_data/disease__gene.csv",sep='\t')

gene_list = list(dis_gene['gene'])

gene_lst = list(set(gene_list))

In [None]:
num_processes = 40

return_dict = Manager().dict()
jobs = []

for i in range(num_processes):
    lb = i*math.ceil(len(gene_lst)/num_processes)
    up = min((i+1)*math.ceil(len(gene_lst)/num_processes),len(gene_lst))
    
    p = Process(target=gene_properties,args=(gene_lst[lb:up],i,return_dict))
    jobs.append(p)
    p.start()
    
for process in jobs:
    process.join()


In [None]:
df_lst = []

for i in range(len(return_dict)):
    df_lst.append(pd.DataFrame(return_dict[i]))
    
df = pd.concat(df_lst)

In [None]:
df.to_csv("graph_data/gene__nodes.csv",sep='\t',index=None)

In [None]:
#Run on terminal
!polly files sync --workspace-id 9475 --source graph_data/ --destination polly://graph_data/