# Annotations by Risk Scores Sourcing Data from FHIR Server

## Define functions to read data from the FHIR server

In [1]:
import json
import os

import requests


def get(url, timeout=(10, 300)):
    """Send a GET request to the FHIR server and return the raw response.

    Args:
        url: Absolute URL to request.
        timeout: Value forwarded to ``requests.get`` as ``timeout``; a
            ``(connect, read)`` tuple in seconds. Defaults to ``(10, 300)``.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked here.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout elapses.
    """
    # Prefer a token supplied through the environment so the secret can be
    # rotated without editing the notebook.
    # NOTE(review): the literal fallback keeps existing runs working, but it is
    # a credential committed to source -- rotate it and remove the fallback.
    token = os.environ.get("FHIR_AUTH_TOKEN", "mjRmoNGW6klxaClkKhEkqi7HVYwx6NTH")
    response = requests.get(
        url=url,
        headers={"Content-Type": "application/fhir+json", "authentication": token},
        # requests waits indefinitely when no timeout is given.
        timeout=timeout,
    )
    return response


def readData(url):
    """Fetch a search result from the FHIR server together with all later pages.

    Starting at ``url``, each response body is parsed as JSON and collected.
    The loop then continues with the URL of the link whose ``relation`` is
    ``'next'`` and stops once a page carries no such link.

    Args:
        url: URL of the first page.

    Returns:
        A list with one parsed JSON document per page, in the order fetched.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status.
        json.JSONDecodeError: if a response body is not valid JSON.
    """
    nextUrl = url
    data = []
    while nextUrl:
        print('Reading URL: ', nextUrl)
        response = get(nextUrl)
        # Fail here with the HTTP status rather than later with a KeyError
        # when an error body is handled as if it were a result page.
        response.raise_for_status()
        page = json.loads(response.text)
        data.append(page)
        nextUrl = None
        if 'link' in page:
            for link in page['link']:
                if link.get('relation') == 'next':
                    nextUrl = link['url']
    return data

## Extract Data

### Obtain Patient IDs

In [2]:
# Bounds of the risk-score window; they are sent with the 'ge' and 'le'
# search prefixes below.
lowerRiskScore, higherRiskScore = 0.8, 1.0
# Patient search filtered through the RiskAssessment resources that reference
# each patient as subject.
query = (
    'http://10.172.235.4:8080/fhir/Patient'
    '?_has:RiskAssessment:subject:probability=ge{}'
    '&_has:RiskAssessment:subject:probability=le{}'
).format(lowerRiskScore, higherRiskScore)
response = readData(query)

Reading URL:  http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge0.8&_has:RiskAssessment:subject:probability=le1.0


In [3]:
import itertools

# Collect the id of every resource listed in the fetched pages, in page order.
# A page without matches may carry no 'entry' key at all, so fall back to an
# empty list instead of raising KeyError.
patientIds = [
    entry['resource']['id']
    for page in response
    for entry in page.get('entry', [])
]
patientIds

['P2198313',
 'P745962',
 'P631550',
 'P2107492',
 'P2156000',
 'P677694',
 'P2161817',
 'P2141593',
 'P2092580',
 'P2199705']

### Obtain Annotations

In [4]:
import xml.etree.ElementTree as ET
import pandas as pd


def getRowFromEntry(entry):
    """Turn one bundle entry into a flat row for the annotations table.

    Args:
        entry: Dict with a ``'resource'`` holding ``'id'``, a narrative under
            ``['text']['div']`` (an XML string) and, optionally, a list of
            ``'component'`` items.

    Returns:
        ``[id, narrative text, genomic source type, annotation type]``. The
        last two are ``None`` when no component carries the matching code, and
        the narrative text is ``''`` when the div has no leading text.

    Raises:
        KeyError: if ``'id'`` or the narrative div is missing.
        xml.etree.ElementTree.ParseError: if the div is not well-formed XML.
    """
    resource = entry['resource']
    genomicSourceType = None
    annotationType = None
    # 'component' may be absent; treat that the same as an empty list.
    for component in resource.get('component', []):
        code = component['code']['coding'][0]['code']
        if code == '48019-4':
            annotationType = component['valueString']
        elif code == '48002-0':
            genomicSourceType = component['valueString']
    # .text is None when the div starts with a child element, so guard the
    # strip() call instead of raising AttributeError.
    divText = ET.fromstring(resource['text']['div']).text or ''
    return [
        resource['id'],
        divText.strip(),
        genomicSourceType,
        annotationType,
    ]

# For every selected patient, fetch all pages of matching observations and
# convert each entry into a row. `rows` keeps the patient -> page -> row
# nesting and is flattened when the DataFrame is built.
rows = []
for patientId in patientIds:
    query = 'http://10.172.235.4:8080/fhir/Observation?code=annotation&code=staphylococcus_aureus&subject=' + patientId
    response = readData(query)
    # A page without matches may carry no 'entry' key; treat it as empty
    # rather than raising KeyError and aborting the whole loop.
    rows.append([
        [getRowFromEntry(entry) for entry in page.get('entry', [])]
        for page in response
    ])
annotationsDf = pd.DataFrame([
    row
    for patientPages in rows
    for pageRows in patientPages
    for row in pageRows
])
annotationsDf

Reading URL:  http://10.172.235.4:8080/fhir/Observation?code=annotation&code=staphylococcus_aureus&subject=P2198313
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=443330d0-50b1-47c0-875d-4f4e3b35f629&_getpagesoffset=20&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=443330d0-50b1-47c0-875d-4f4e3b35f629&_getpagesoffset=40&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=443330d0-50b1-47c0-875d-4f4e3b35f629&_getpagesoffset=60&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=443330d0-50b1-47c0-875d-4f4e3b35f629&_getpagesoffset=80&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=443330d0-50b1-47c0-875d-4f4e3b35f629&_getpagesoffset=100&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=443330d0-50b1-47c0-875d-4f4e3b35f629&_getpagesoffset=1

Unnamed: 0,0,1,2,3
0,2198313-602338-ODONDA-07945,ODONDA_07945,Prodigal,CDS
1,2198313-602338-ODONDA-04545,ODONDA_04545,,
2,2198313-602338-ODONDA-13265,ODONDA_13265,Prodigal,CDS
3,2198313-602338-ODONDA-08750,ODONDA_08750,,
4,2198313-602338-ODONDA-00565,ODONDA_00565,,
...,...,...,...,...
27100,2199705-12537311-feoB,feoB,Prodigal,CDS
27101,2199705-12537311-crtQ,crtQ,Prodigal,CDS
27102,2199705-12537311-rpsN,rpsN,Prodigal,CDS
27103,2199705-12537311-padR,padR,Prodigal,CDS


In [5]:
# Replace the default integer column labels with descriptive names; the order
# follows the list returned by getRowFromEntry (id, narrative text, genomic
# source type, annotation type).
annotationsDf.columns = ['id', 'name', 'source', 'type']

In [8]:
import os


# Derive the file name from the risk-score bounds used for the query so the
# export cannot be mislabeled when those bounds are changed. With the bounds
# 0.8 and 1.0 this yields 'annotations_riskscore_0.8_1.0.csv'.
outputFileName = 'annotations_riskscore_{}_{}.csv'.format(lowerRiskScore, higherRiskScore)
# The base directory comes from the EHR_INT_ANALYSIS_BASE environment
# variable; a KeyError is raised if it is not set.
annotationsDf.to_csv(
    os.path.join(
        os.environ['EHR_INT_ANALYSIS_BASE'],
        'data',
        'visuals_for_publication',
        outputFileName,
    ),
    index=False,
)

In [6]:
# Number of annotation rows per value of the 'type' column.
annotationsDf['type'].value_counts()

CDS                  20364
ncRNA                  467
region                 334
tRNA                   318
regulatory_region      172
rRNA                    28
oriC                    21
tmRNA                    8
oriT                     8
CRISPR                   1
Name: type, dtype: int64

In [9]:
# Keep the CDS rows only, count the non-null ids per annotation name, and
# order the names from the highest count to the lowest.
# NOTE: this rebinds annotationsDf to the aggregated frame, so cells that need
# the raw annotation rows have to run before this one.
annotationsDf = (
    annotationsDf.loc[annotationsDf['type'] == 'CDS', ['id', 'name']]
    .groupby(by=['name'])
    .count()
    .reset_index()
    .sort_values(by=['id'], ascending=False)
)
annotationsDf

Unnamed: 0,name,id
12588,ybjT,10
11187,cof,10
12558,xRE,10
12551,wbbJ,10
11300,dppC,10
...,...,...
4563,HHOEGC_01725,1
4564,HHOEGC_01740,1
4565,HHOEGC_01790,1
4566,HHOEGC_01810,1


In [14]:
# Show the rows whose 'id' value equals the largest 'id' value in the frame.
annotationsDf.loc[annotationsDf['id'] == annotationsDf['id'].max()]

Unnamed: 0,name,id
12588,ybjT,10
11187,cof,10
12558,xRE,10
12551,wbbJ,10
11300,dppC,10
11355,fabG,10
11362,fadM,10
12407,tagH,10
11385,fepD,10
11442,gcvH,10
