1. Flatten the primary annotations and their closures in to a single list for each disease
2. Do termset comparison of the query disease to each of the other diseases and calculates simple jaccard index.
3. Sort the results by jaccard index and return the top 10 results.

In [137]:
import pandas as pd
import time

## Get D2P Associations from Monarch Edges

In [91]:
edges = pd.read_csv('monarch-kg (9)/monarch-kg_edges.tsv', sep='\t', dtype=str)

In [92]:
disease_phenotypes = edges[(edges['category'] == 'biolink:DiseaseToPhenotypicFeatureAssociation') & (edges['subject'].str.contains("MONDO"))]

In [97]:
d2p_annotations = disease_phenotypes[['subject', 'object']]
d2p_annotations.head()

Unnamed: 0,subject,object
2869978,MONDO:0023659,HP:0011097
2869979,MONDO:0023659,HP:0002187
2869980,MONDO:0023659,HP:0001518
2869981,MONDO:0023659,HP:0032792
2869982,MONDO:0023659,HP:0011451


## Get Inferred subClassOf relations from Phenio for all HP terms

In [3]:
phen_rel = pd.read_csv('phenio-relation-graph.tsv', sep='\t', dtype=str)

In [5]:
phen_rel.columns = ['subject', 'predicate', 'object']

In [9]:
pr_isa = phen_rel[(phen_rel['predicate'] == 'rdfs:subClassOf') & (phen_rel['subject'].str.startswith('HP:')) & (phen_rel['object'].str.startswith('HP:'))]

## Join the D2P annotations to the inferred subClassOf relations

In [99]:
merged_d2p = pd.merge(left=d2p_annotations, right=pr_isa, how='left', left_on='object', right_on='subject')
merged_d2p.head()

Unnamed: 0,subject_x,object_x,subject_y,predicate,object_y
0,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0000001
1,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0011097
2,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0000118
3,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0020219
4,MONDO:0023659,HP:0011097,HP:0011097,rdfs:subClassOf,HP:0000707


In [100]:
expanded_diseases = merged_d2p.groupby('subject_x')['object_y'].apply(set).reset_index()

In [136]:
def jaccard_index(row, query_set):
    set1 = row['object_y']
    set2 = query_set
    intersection = len(set(set1).intersection(set2))
    union = len(set1.union(set2))
    return intersection / union



In [139]:
rand_disease = expanded_diseases.sample(1)
rand_disease_profile = rand_disease['object_y'].iloc[0]
rand_disease_profile

{'HP:0000001',
 'HP:0000008',
 'HP:0000032',
 'HP:0000035',
 'HP:0000037',
 'HP:0000044',
 'HP:0000078',
 'HP:0000080',
 'HP:0000118',
 'HP:0000119',
 'HP:0000133',
 'HP:0000135',
 'HP:0000137',
 'HP:0000138',
 'HP:0000147',
 'HP:0000811',
 'HP:0000812',
 'HP:0000818',
 'HP:0008373',
 'HP:0008715',
 'HP:0010460',
 'HP:0010461',
 'HP:0012243',
 'HP:0031065'}

In [151]:
start = time.time()
expanded_diseases['jaccard'] = expanded_diseases.apply(jaccard_index, query_set=rand_disease_profile, axis=1)

end = time.time()
print(end-start)
expanded_diseases.sort_values('jaccard', ascending=False).head(10)

0.1592550277709961


Unnamed: 0,subject_x,object_y,jaccard
3273,MONDO:0010765,"{HP:0000135, HP:0000037, HP:0000137, HP:000011...",1.0
9856,MONDO:0100249,"{HP:0000135, HP:0000137, HP:0000022, HP:000011...",0.588235
2721,MONDO:0010106,"{HP:0000135, HP:0000119, HP:0003241, HP:000873...",0.433333
5610,MONDO:0013913,"{HP:0000135, HP:0000119, HP:0000036, HP:000324...",0.384615
9735,MONDO:0054850,"{HP:0000135, HP:0000119, HP:0003241, HP:003001...",0.371429
7017,MONDO:0016281,"{HP:0000047, HP:0000137, HP:0000119, HP:000002...",0.369565
4501,MONDO:0012494,"{HP:0000032, HP:0000001, HP:0012215, HP:000011...",0.36
9608,MONDO:0044626,"{HP:0008222, HP:0000137, HP:0000119, HP:000014...",0.358974
5401,MONDO:0013664,"{HP:0000032, HP:0000001, HP:0000028, HP:001224...",0.357143
5623,MONDO:0013926,"{HP:0000135, HP:0000119, HP:0012638, HP:000324...",0.357143


In [150]:
start = time.time()
scores = []
for index, row in expanded_diseases.iterrows():
    scores.append(jaccard_index(row, query_set=rand_disease_profile))
end = time.time()
print(end-start)

0.9199008941650391
