1. Flatten the primary annotations and their closures into a single list for each disease.
2. Do a termset comparison of the query disease against each of the other diseases and calculate a simple Jaccard index.
3. Sort the results by Jaccard index and return the top 10 results.

In [157]:
import pandas as pd
import time
from oaklib import get_adapter

## Get D2P Associations from Monarch Edges

In [91]:
# Read the Monarch KG edge table; every column is kept as a string so that
# CURIE-valued fields are not type-inferred.
edges = pd.read_csv(
    'monarch-kg (9)/monarch-kg_edges.tsv',
    dtype=str,
    sep='\t',
)

In [92]:
# Keep only disease-to-phenotype association edges whose subject is a MONDO CURIE.
# A literal prefix test is used (instead of a regex substring search) so that only
# identifiers in the MONDO namespace match, mirroring the 'HP:' prefix tests used
# for the Phenio relations; na=False makes a missing subject count as "no match".
disease_phenotypes = edges[
    (edges['category'] == 'biolink:DiseaseToPhenotypicFeatureAssociation')
    & edges['subject'].str.startswith('MONDO:', na=False)
]

In [97]:
# Only the disease (subject) and phenotype (object) columns are used downstream.
d2p_annotations = disease_phenotypes.loc[:, ['subject', 'object']]
d2p_annotations.head()

Unnamed: 0,subject,object
2869978,MONDO:0023659,HP:0011097
2869979,MONDO:0023659,HP:0002187
2869980,MONDO:0023659,HP:0001518
2869981,MONDO:0023659,HP:0032792
2869982,MONDO:0023659,HP:0011451


## Get Inferred subClassOf relations from Phenio for all HP terms

In [3]:
# Load the Phenio relation graph as string triples.
# NOTE(review): the file is assumed to have no header row (the column names are
# supplied by hand in this notebook) — confirm against the file. header=None
# prevents the first triple from being consumed as column labels; if a header
# row does exist it simply becomes an ordinary row that the later
# predicate/prefix filter discards.
phen_rel = pd.read_csv(
    'phenio-relation-graph.tsv',
    sep='\t',
    dtype=str,
    header=None,
    names=['subject', 'predicate', 'object'],
)

In [5]:
# Name the three positional columns as a subject / predicate / object triple.
phen_rel.columns = pd.Index(['subject', 'predicate', 'object'])

In [9]:
# Restrict the relation graph to subClassOf edges where both ends are HP terms.
pr_isa = phen_rel.loc[
    phen_rel['predicate'].eq('rdfs:subClassOf')
    & phen_rel['subject'].str.startswith('HP:')
    & phen_rel['object'].str.startswith('HP:')
]

## Join the D2P annotations to the inferred subClassOf relations

In [99]:
# Attach to every disease→phenotype annotation the subClassOf rows whose subject
# is the annotated phenotype; the left join keeps annotations with no match.
merged_d2p = d2p_annotations.merge(
    pr_isa,
    how='left',
    left_on='object',
    right_on='subject',
)
merged_d2p.head()

Unnamed: 0,subject_x,object_x,subject_y,predicate,object_y
0,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0000001
1,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0011097
2,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0000118
3,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0020219
4,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0000707


## Flatten the annotations and their closures into a single list for each disease

In [100]:
# Build one term set per disease from the primary annotations (object_x) and
# their subClassOf closure (object_y).
# The left join leaves object_y missing for phenotypes with no closure rows;
# those NaN values are dropped so they do not end up as a bogus shared element
# in every affected profile, and the primary annotation is added explicitly so
# such phenotypes (and diseases annotated only with them) are still represented.
# The result keeps the original columns: 'subject_x' and 'object_y' (a set).
expanded_diseases = (
    pd.concat(
        [
            merged_d2p[['subject_x', 'object_x']].rename(columns={'object_x': 'object_y'}),
            merged_d2p[['subject_x', 'object_y']],
        ],
        ignore_index=True,
    )
    .dropna(subset=['object_y'])
    .groupby('subject_x')['object_y']
    .apply(set)
    .reset_index()
)

## Add labels to the diseases

In [162]:
# Open the MONDO adapter and look up a label for each disease identifier.
handle = get_adapter("sqlite:obo:mondo")
expanded_diseases['subject_label'] = expanded_diseases['subject_x'].map(handle.label)

## Jaccard search

In [163]:
def jaccard_index(row, query_set):
    """Compute the Jaccard index between a row's term profile and a query profile.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'object_y'`` (e.g. a DataFrame row); the value
        must be an iterable of hashable terms.
    query_set : iterable
        The query profile to compare against.

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|``, or ``0.0`` when both profiles are empty.
    """
    # Normalise both sides to sets so lists/tuples are accepted as well.
    profile = set(row['object_y'])
    query = set(query_set)

    union_size = len(profile | query)
    if union_size == 0:
        # Two empty profiles share nothing; avoid ZeroDivisionError.
        return 0.0
    return len(profile & query) / union_size

def get_label(id):
    """Return the label for the term identified by ``id``.

    The lookup goes through the notebook-level adapter ``handle`` (created with
    ``get_adapter``), using the same ``label`` call that populates the
    ``subject_label`` column.
    """
    # The previous body referenced an undefined name (`adapter`); the adapter
    # in this notebook is bound to `handle`.
    return handle.label(id)

## Create a random disease profile

In [164]:
# Draw one disease at random and pull out its term set to use as the query profile.
rand_disease = expanded_diseases.sample(n=1)
rand_disease_profile = rand_disease.iloc[0]['object_y']
rand_disease_profile

{'HP:0000001',
 'HP:0000118',
 'HP:0000152',
 'HP:0000153',
 'HP:0000157',
 'HP:0000160',
 'HP:0000163',
 'HP:0000168',
 'HP:0000212',
 'HP:0000234',
 'HP:0000271',
 'HP:0000280',
 'HP:0000315',
 'HP:0000478',
 'HP:0000479',
 'HP:0000587',
 'HP:0000648',
 'HP:0000707',
 'HP:0000708',
 'HP:0000765',
 'HP:0000818',
 'HP:0000924',
 'HP:0000925',
 'HP:0000926',
 'HP:0000943',
 'HP:0000944',
 'HP:0001098',
 'HP:0001197',
 'HP:0001250',
 'HP:0001251',
 'HP:0001252',
 'HP:0001257',
 'HP:0001268',
 'HP:0001276',
 'HP:0001288',
 'HP:0001290',
 'HP:0001376',
 'HP:0001387',
 'HP:0001392',
 'HP:0001438',
 'HP:0001507',
 'HP:0001508',
 'HP:0001622',
 'HP:0001626',
 'HP:0001627',
 'HP:0001643',
 'HP:0001743',
 'HP:0001744',
 'HP:0001787',
 'HP:0001871',
 'HP:0001881',
 'HP:0001939',
 'HP:0001982',
 'HP:0001999',
 'HP:0002011',
 'HP:0002012',
 'HP:0002015',
 'HP:0002059',
 'HP:0002060',
 'HP:0002118',
 'HP:0002119',
 'HP:0002123',
 'HP:0002197',
 'HP:0002240',
 'HP:0002344',
 'HP:0002376',
 'HP:00024

In [167]:
# Show which disease was picked as the query.
rand_disease.loc[:, ['subject_label', 'subject_x']]

Unnamed: 0,subject_label,subject_x
1993,GM1 gangliosidosis type 2,MONDO:0009261


## Search for similar diseases using pandas apply

In [165]:
# Score every disease against the query profile with a row-wise apply and time it.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()
expanded_diseases['jaccard'] = expanded_diseases.apply(jaccard_index, query_set=rand_disease_profile, axis=1)
end = time.perf_counter()
print(end-start)
# Ten best matches; the query disease itself is expected at the top with a score of 1.0.
expanded_diseases.sort_values('jaccard', ascending=False).head(10)[['subject_label', 'subject_x', 'jaccard']]

0.2281661033630371


Unnamed: 0,subject_label,subject_x,jaccard
1993,GM1 gangliosidosis type 2,MONDO:0009261,1.0
1994,GM1 gangliosidosis type 3,MONDO:0009262,0.383929
8175,juvenile sialidosis type 2,MONDO:0019681,0.356
1986,fucosidosis,MONDO:0009254,0.352381
7550,GM1 gangliosidosis,MONDO:0018149,0.348901
2423,"Niemann-Pick disease, type C1",MONDO:0009757,0.336406
1998,Gaucher disease type II,MONDO:0009266,0.334677
1992,GM1 gangliosidosis type 1,MONDO:0009260,0.332394
2704,mucosulfatidosis,MONDO:0010088,0.327869
2654,"free sialic acid storage disease, infantile form",MONDO:0010027,0.318008


## Search for similar diseases using a for loop

In [168]:
# Same scoring done with an explicit Python loop, timed for comparison with the
# apply-based version. perf_counter() is used because it is a monotonic clock
# intended for interval timing, unlike the wall-clock time.time().
start = time.perf_counter()
scores = []
for index, row in expanded_diseases.iterrows():
    scores.append(jaccard_index(row, query_set=rand_disease_profile))
end = time.perf_counter()
print(end-start)

0.7511029243469238
