# Perform Statistical Analysis for AMRFinder gene annotations (AMR, virulence and stress genes)

This analysis is based on: https://www.kaggle.com/code/hamelg/python-for-data-25-chi-squared-tests

## Get high-risk patients

### Define a function to read data from the FHIR server

In [1]:
import json
import requests


def get(url, timeout=60):
    """Send an authenticated GET request to the FHIR server.

    Parameters
    ----------
    url : str
        Absolute URL to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout `requests` may block indefinitely.

    Returns
    -------
    requests.Response
        The raw response; status checking is left to the caller.

    Raises
    ------
    KeyError
        If the FHIR_AUTH_TOKEN environment variable is not set.
    """
    # Local import so this cell does not depend on a later cell importing `os`.
    import os

    # The token is read from the environment rather than embedded in the
    # notebook. Any token that was previously committed in this file should be
    # treated as leaked and rotated.
    authToken = os.environ['FHIR_AUTH_TOKEN']

    # NOTE(review): "Content-Type" describes a request body, which a GET does
    # not have; servers usually negotiate the format via "Accept" - confirm
    # whether this server relies on the header before changing it.
    response = requests.get(
        url=url,
        headers={"Content-Type": "application/fhir+json", "authentication": authToken},
        timeout=timeout,
    )
    return response


def readData(url):
    """Fetch every page of a paginated FHIR response, following 'next' links.

    Parameters
    ----------
    url : str
        URL of the first page.

    Returns
    -------
    list of dict
        The parsed JSON body of each page, in the order the pages were read.

    Raises
    ------
    requests.HTTPError
        If a page is answered with an HTTP error status. Without this check an
        error body would be parsed and returned as if it were a result page.
    """
    nextUrl = url
    data = []
    while nextUrl:
        print('Reading URL: ', nextUrl)
        response = get(nextUrl)
        # Fail loudly on 4xx/5xx instead of appending the error payload.
        response.raise_for_status()
        responseText = json.loads(response.text)
        data.append(responseText)
        # Continue only if the page advertises a link with relation "next";
        # a page without one ends the loop.
        nextUrl = next(
            (
                link['url']
                for link in responseText.get('link', [])
                if link.get('relation') == 'next'
            ),
            None,
        )
    return data

### Obtain high-risk patient ids from FHIR

In [2]:
import itertools

# Bounds of the risk-probability window used in the search below
# (sent to the server as `ge` lower bound and `le` upper bound).
[lowerRiskScore, higherRiskScore] = [0.5, 1.0]

# Search for Patient resources that are the subject of a RiskAssessment whose
# probability satisfies the two bounds.
query = (
    'http://10.172.235.4:8080/fhir/Patient'
    + '?_has:RiskAssessment:subject:probability=ge' + str(lowerRiskScore)
    + '&_has:RiskAssessment:subject:probability=le' + str(higherRiskScore)
)
response = readData(query)

# Flatten the pages into one list of ids. A page may come back without an
# 'entry' key (no matches on that page), so default to an empty list instead
# of raising KeyError.
# NOTE(review): `[1:]` drops the first character of every resource id -
# presumably a one-letter prefix in front of the numeric patient id; confirm.
highriskPatientIds = [
    entry['resource']['id'][1:]
    for data in response
    for entry in data.get('entry', [])
]
highriskPatientIds

Reading URL:  http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge0.5&_has:RiskAssessment:subject:probability=le1.0
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=860232a4-85eb-4fe9-acb6-980eadc65557&_getpagesoffset=20&_count=20&_pretty=true&_bundletype=searchset


['2202086',
 '2172200',
 '546011',
 '1013210',
 '2103171',
 '2013664',
 '2120861',
 '2142899',
 '677694',
 '2448944',
 '2150228',
 '745962',
 '2153643',
 '1581023',
 '2140940',
 '631550',
 '2198313',
 '2161817',
 '2107492',
 '2141593',
 '549608',
 '2160210',
 '637422',
 '2092580',
 '2156000',
 '2199705']

## Map tube codes for high-risk patients

In [3]:
import os

import pandas as pd


# Tab-separated table linking each tube code to a patient, located under the
# directory named by the GENOMICS_DATA_BASE environment variable.
mappingPath = os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv'
mappingDf = pd.read_csv(mappingPath, sep='\t')

# NOTE(review): the rendered frame includes patient-level columns (ids, age,
# gender, admission dates); consider clearing this output before sharing.
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [5]:
# The ids obtained from FHIR are strings while PATIENT_ID is numeric in the
# mapping table, so they are cast to int before matching.
highriskPatientNumbers = [int(patientId) for patientId in highriskPatientIds]
isHighriskRow = mappingDf.PATIENT_ID.isin(highriskPatientNumbers)

# Every tube code that belongs to one of the high-risk patients.
highriskTubecodes = list(mappingDf[isHighriskRow].tube_code)
highriskTubecodes

['AH20I057',
 'AH21C022',
 'AH18J081',
 'AH18J080',
 'AH19L077',
 'AH20F012',
 'AH19J028',
 'AH21A034',
 'AH20H023',
 'AH19K013',
 'AH20L008',
 'AH19E069',
 'AH20L085',
 'AH20E051',
 'AH20C005',
 'AH18K024',
 'AH20K063',
 'AH19C012',
 'AH19B014',
 'AH19A043',
 'AH19B026',
 'AH19C070',
 'AH20I042',
 'AH18J055',
 'AH19F055',
 'AH20G044',
 'AH21A036']

## Read annotations

In [7]:
import os

import pandas as pd


# Per-file annotation tables, split by cohort; concatenated once after the loop.
highriskAnnotationsDfList = []
controlAnnotationsDfList = []

gffDir = os.environ['GENOMICS_DATA_BASE'] + '/amrfinder'

# Set lookup instead of scanning the list once per file.
highriskTubecodeSet = set(highriskTubecodes)

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the concatenated frames reproducible between runs.
for fileName in sorted(os.listdir(gffDir)):

    filePath = os.path.join(gffDir, fileName)

    # Ignore sub-directories and hidden files (e.g. editor/OS artefacts); they
    # are not result tables and would otherwise crash or pollute the read.
    if fileName.startswith('.') or not os.path.isfile(filePath):
        continue

    # The tube code is the part of the file name before the first '.' and '_'.
    tubeCode = fileName.split('.')[0].split('_')[0]

    amrResultsDf = pd.read_csv(
        filePath,
        sep='\t',
    )
    amrResultsDf['tube_code'] = tubeCode

    # Every file whose tube code is not in the high-risk list is treated as control.
    if tubeCode in highriskTubecodeSet:
        highriskAnnotationsDfList.append(amrResultsDf)
    else:
        controlAnnotationsDfList.append(amrResultsDf)

highriskAnnotationsDf = pd.concat(highriskAnnotationsDfList, ignore_index=True)
controlAnnotationsDf = pd.concat(controlAnnotationsDfList, ignore_index=True)

highriskAnnotationsDf.shape, controlAnnotationsDf.shape

((597, 23), (57018, 23))

In [8]:
# Number of distinct tube codes contributing annotations to each cohort.
(
    highriskAnnotationsDf.tube_code.nunique(dropna=False),
    controlAnnotationsDf.tube_code.nunique(dropna=False),
)

(27, 2953)

## Perform chi-square tests

In [20]:
# Distinct (type, subtype) combinations present in the control annotations.
elementTypeColumns = ['Element type', 'Element subtype']
controlAnnotationsDf[elementTypeColumns].drop_duplicates()

Unnamed: 0,Element type,Element subtype
0,VIRULENCE,VIRULENCE
3,AMR,AMR
7,STRESS,BIOCIDE
18,STRESS,METAL
29,STRESS,ACID
220,STRESS,HEAT


In [24]:
import scipy.stats as stats


def _countGenesByType(annotationsDf, annotationType, countColumn):
    """Count annotations per gene symbol for one element type.

    Parameters
    ----------
    annotationsDf : pandas.DataFrame
        Annotation table with 'Element type', 'Gene symbol' and 'Contig id' columns.
    annotationType : str
        Value of 'Element type' to keep.
    countColumn : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        One row per gene symbol with columns 'Gene symbol' and `countColumn`
        (the number of non-null 'Contig id' values for that gene).
    """
    ofType = annotationsDf[annotationsDf['Element type'] == annotationType]
    return (
        ofType.groupby('Gene symbol')['Contig id']
        .count()
        .rename(countColumn)
        .reset_index()
    )


for annotationType in ['VIRULENCE', 'AMR', 'STRESS']:

    print('annotationType: ', annotationType)

    highriskGenecountsDf = _countGenesByType(
        highriskAnnotationsDf, annotationType, 'high_risk_genes_count'
    )
    controlGenecountsDf = _countGenesByType(
        controlAnnotationsDf, annotationType, 'control_genes_count'
    )

    # Left merge on the control genes: genes missing from the high-risk cohort
    # get an observed count of 0.
    # NOTE(review): genes that occur only in the high-risk cohort are dropped
    # here and therefore never enter the test - confirm this is intended.
    mergedGenecountsDf = controlGenecountsDf.merge(
        highriskGenecountsDf,
        how='left',
        on=['Gene symbol']
    ).fillna(0)

    # Share of each gene among all control annotations of this type.
    mergedGenecountsDf['control_genes_proportion'] = mergedGenecountsDf.control_genes_count/mergedGenecountsDf.control_genes_count.sum()

    # Expected high-risk counts if the high-risk cohort followed the control
    # distribution; scaled by the observed total of the merged frame so that
    # observed and expected sums agree.
    mergedGenecountsDf['expected_genes_count'] = mergedGenecountsDf.control_genes_proportion * mergedGenecountsDf.high_risk_genes_count.sum()

    # NOTE(review): the chi-square approximation is generally considered
    # unreliable when many expected counts are small - worth checking the
    # expected_genes_count distribution before interpreting the p-values.
    print(stats.chisquare(f_obs= mergedGenecountsDf.high_risk_genes_count, f_exp= mergedGenecountsDf.expected_genes_count))

annotationType:  VIRULENCE
Power_divergenceResult(statistic=925.1579718262378, pvalue=3.939647678546047e-117)
annotationType:  AMR
Power_divergenceResult(statistic=735.394986353418, pvalue=2.4268692591733216e-15)
annotationType:  STRESS
Power_divergenceResult(statistic=513.8663093332209, pvalue=8.865157499001861e-62)
