In [9]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io

In [4]:
# Root of the API; the request cells below append an endpoint path to it.
url = "https://rare.cohd.io/api"

##### View Dataset

In [24]:
# Fetch the datasets listed by the /metadata/datasets endpoint and show the first rows.
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/metadata/datasets', verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
dataset_df = pd.DataFrame(response_json['results'])
dataset_df.head()

<Response [200]>




Unnamed: 0,clinical_site,dataset_id,source,subclass_category,subpopulation
0,cuimc,1,ohdsi,all\r,all
1,cuimc,2,notes,all\r,all
2,chop,3,notes,all\r,all
3,cuimc,10,ohdsi,neonates (0-2)\r,age
4,cuimc,11,ohdsi,kid (3-11)\r,age


##### View domain counts
- View the number of concepts with a patient count greater than 10, per domain, in a given dataset.

In [28]:
# Request the per-domain concept counts for dataset 2 and show the first rows.
params = {"dataset_id": "2"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/metadata/domainCounts', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()

<Response [200]>




Unnamed: 0,count,dataset_id,domain_id
0,2390,2,diseases
1,5968,2,phenotypes


##### View domain pair counts
- View the number of concept pairs with a patient count greater than 10, per domain pair, in a given dataset.
- Note that each concept pair is recorded in a fixed order (i.e. there are no phenotype-disease pairs), so the same disease-phenotype pair is only counted once.

In [29]:
# Request the concept-pair counts per domain pair for dataset 2 and show the first rows.
params = {"dataset_id": "2"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/metadata/domainPairCounts', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds domain *pair* counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()

<Response [200]>




Unnamed: 0,count,dataset_id,domain_id_1,domain_id_2
0,23329,2,diseases,diseases
1,404993,2,diseases,phenotypes
2,1320156,2,phenotypes,phenotypes


##### View patient counts

In [30]:
# Request the patient count for dataset 2 and show the result.
params = {"dataset_id": "2"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/metadata/patientCounts', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds patient counts, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()

<Response [200]>




Unnamed: 0,count,dataset_id
0,1349866,2


##### View the most frequently occurring concepts in a dataset

In [32]:
# Request the most frequent concepts in the "diseases" domain of dataset 2.
params = {"dataset_id": "2", "domain_id": "diseases"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/frequencies/mostFrequency', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds concept frequencies, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()



<Response [200]>


Unnamed: 0,concept_code,concept_count,concept_frequency,concept_id,concept_name,domain_id,vocabulary_id
0,MONDO:0008315,46354,0.03434,80008315,prostate cancer,diseases,mondo
1,MONDO:0007452,31192,0.023107,80007452,maturity-onset diabetes of the young type 1,diseases,mondo
2,MONDO:0009061,31061,0.02301,80009061,cystic fibrosis,diseases,mondo
3,MONDO:0007256,28812,0.021344,80007256,hepatocellular carcinoma,diseases,mondo
4,MONDO:0004994,24388,0.018067,80004994,cardiomyopathy,diseases,mondo


##### View the concepts that most frequently co-occur with a given concept in a dataset

In [33]:
# Request the concepts that most frequently co-occur with concept 90012461 in dataset 2.
params = {"dataset_id": "2", "concept_id": 90012461}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/frequencies/mostFrequency', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds concept-pair frequencies, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()



<Response [200]>


Unnamed: 0,concept_code_1,concept_code_2,concept_count_1,concept_count_2,concept_frequency,concept_id_1,concept_id_2,concept_name_1,concept_name_2,concept_pair_count,domain_id_1,domain_id_2,vocabulary_id_1,vocabulary_id_2
0,HP:0012461,HP:0012531,1874,476026,0.001133,90012461,90012531,Bacteriuria,Pain,1530,phenotypes,phenotypes,hpo,hpo
1,HP:0012461,HP:0002664,1874,320653,0.000842,90012461,90002664,Bacteriuria,Neoplasm,1137,phenotypes,phenotypes,hpo,hpo
2,HP:0012461,HP:0000822,1874,234731,0.00073,90012461,90000822,Bacteriuria,Hypertension,986,phenotypes,phenotypes,hpo,hpo
3,HP:0012461,HP:0100518,1874,65685,0.000646,90012461,90100518,Bacteriuria,Dysuria,872,phenotypes,phenotypes,hpo,hpo
4,HP:0012461,HP:0000790,1874,59769,0.000622,90012461,90000790,Bacteriuria,Hematuria,839,phenotypes,phenotypes,hpo,hpo


##### View concept pair frequency
- To save time, multiple concepts can be supplied at once (separated by `;`).

In [36]:
# Request pair frequencies between each concept in concept_id_1 (';'-separated)
# and concept_id_2, in dataset 1.
params = {"dataset_id": "1", "concept_id_1": "90012461;90003641", "concept_id_2": "90500111"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/frequencies/pairedConceptFreq', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds concept-pair frequencies, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()



<Response [200]>


Unnamed: 0,concept_code_1,concept_code_2,concept_count_1,concept_count_2,concept_frequency,concept_id_1,concept_id_2,concept_name_1,concept_name_2,concept_pair_count,domain_id_1,domain_id_2,vocabulary_id_1,vocabulary_id_2
0,HP:0012461,HP:0500111,50915,6478,0.00039,90012461,90500111,Bacteriuria,Positive urine benzodiazepines test,953,phenotypes,phenotypes,hpo,hpo


##### View single concept frequency
- To save time, multiple concepts can be supplied at once (separated by `;`).

In [38]:
# Request the single-concept frequency of each ';'-separated concept id in dataset 1.
params = {"dataset_id": "1", "concept_id": "90012461;90003641"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/frequencies/singleConceptFreq', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds concept frequencies, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()

<Response [200]>




Unnamed: 0,concept_code,concept_count,concept_frequency,concept_id,concept_name,domain_id,vocabulary_id
0,HP:0012461,50915,0.020852,90012461,Bacteriuria,phenotypes,hpo
1,HP:0003641,331,0.000136,90003641,Hemoglobinuria,phenotypes,hpo


##### View chi-square of concept pairs
- To save time, multiple concepts can be supplied at once (separated by `;`).
- If `concept_id_2` is not provided, the concept pairs with the largest chi-square values are returned.
- The other statistics work similarly.

In [42]:
# Request chi-square results between the ';'-separated concepts in concept_id_1
# and concept_id_2, in dataset 1.
params = {"dataset_id": "1", "concept_id_1": "90012461;90003641", "concept_id_2": "90500111"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/association/chiSquare', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds chi-square results, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()

<Response [200]>




Unnamed: 0,concept_code_2,concept_id_2,concept_name_2,domain_id_2,vocabulary_id_2,ws_cs,z_details
0,HP:0500111,90500111,Positive urine benzodiazepines test,phenotypes,hpo,5071.550474,"[{'adj_p': 0.0, 'concept_code_1': 'HP:0012461'..."


In [44]:
# Request chi-square results for concept 90012461 in dataset 1 without a
# concept_id_2, passing top_n=3 and domain_id="phenotypes".
params = {"dataset_id": "1", "concept_id_1": "90012461", "top_n": 3, "domain_id": "phenotypes"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/association/chiSquare', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds chi-square results, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()



<Response [200]>


Unnamed: 0,concept_code_2,concept_id_2,concept_name_2,domain_id_2,vocabulary_id_2,ws_cs,z_details
0,HP:0031812,90031812,Nitrituria,phenotypes,hpo,138293.218337,"[{'adj_p': 0.0, 'concept_code_1': 'HP:0012461'..."
1,HP:0100518,90100518,Dysuria,phenotypes,hpo,47659.197756,"[{'adj_p': 0.0, 'concept_code_1': 'HP:0012461'..."
2,HP:0001649,90001649,Tachycardia,phenotypes,hpo,40400.379524,"[{'adj_p': 0.0, 'concept_code_1': 'HP:0012461'..."


##### Find concept by any string
- Separate multiple query strings with `;`.
- The system automatically detects how to match each query string.

In [46]:
# Look up concepts from a ';'-separated list of query strings (here a concept code,
# a name fragment and a numeric id), with domain_id set to "phenotypes".
params = {"q": "MONDO:0000171;muscular dystrophy; 90003498", "domain_id": "phenotypes"}
# NOTE(review): verify=False turns off TLS certificate verification; it is kept so the
# request behaves as before -- confirm whether it is still required for this host.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url + '/vocabulary/findConceptByAny', params=params, verify=False, timeout=30)
print(response)
# Fail here with a clear HTTPError on a 4xx/5xx reply instead of failing later
# while decoding the body or looking up 'results'.
response.raise_for_status()
response_json = response.json()
# NOTE: despite its name, this frame holds matched concepts, not domain counts.
domain_counts_df = pd.DataFrame(response_json['results'])
domain_counts_df.head()



<Response [200]>


Unnamed: 0,concept_code,concept_id,concept_name,domain_id,vocabulary_id
0,HP:0003741,90003741,Congenital muscular dystrophy,phenotypes,hpo
1,HP:0007081,90007081,Late-onset muscular dystrophy,phenotypes,hpo
2,HP:0008970,90008970,Scapulohumeral muscular dystrophy,phenotypes,hpo
3,HP:0003560,90003560,Muscular dystrophy,phenotypes,hpo
4,HP:0006785,90006785,Limb-girdle muscular dystrophy,phenotypes,hpo
