# Perform Statistical Analysis for AMR genes

This analysis is based on: https://www.kaggle.com/code/hamelg/python-for-data-25-chi-squared-tests

## Get high-risk patients

### Define a function to read data from the FHIR server

In [2]:
import json
import requests


def get(url, timeout=60):
    """Send a GET request to the FHIR server and return the raw response.

    The API token is read from the ``FHIR_AUTH_TOKEN`` environment variable
    rather than being embedded in the notebook, so the notebook can be shared
    or committed without leaking the credential.
    NOTE(review): the token that used to be hard-coded here should be treated
    as compromised and rotated on the server.

    Args:
        url: Fully-qualified URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object. The HTTP status is NOT checked here;
        callers decide how to handle error responses.

    Raises:
        KeyError: if ``FHIR_AUTH_TOKEN`` is not set in the environment.
    """
    # Local import: keeps this cell runnable on a fresh kernel without relying
    # on an `import os` that only appears in a later cell.
    import os

    response = requests.get(
        url=url,
        headers={
            "Content-Type": "application/fhir+json",
            # NOTE(review): the header is named "authentication", not the
            # standard "Authorization" -- confirm this is what the server expects.
            "authentication": os.environ["FHIR_AUTH_TOKEN"],
        },
        timeout=timeout,
    )
    return response


def readData(url):
    """Fetch every page of a paged FHIR search and return the parsed bundles.

    Starting at ``url``, each response body is parsed as JSON and appended to
    the result. If the parsed body has a ``link`` list containing an item whose
    ``relation`` is ``'next'``, that item's ``url`` is fetched next; otherwise
    the loop ends.

    Args:
        url: URL of the first page.

    Returns:
        A list with one parsed JSON document per page, in fetch order.

    Raises:
        requests.HTTPError: if any page comes back with a 4xx/5xx status.
    """
    data = []
    nextUrl = url
    while nextUrl:
        print('Reading URL: ', nextUrl)
        response = get(nextUrl)
        # Fail fast on HTTP errors: otherwise an error body would be stored as
        # if it were a result page and only blow up later, far from the cause.
        response.raise_for_status()
        responseText = json.loads(response.text)
        data.append(responseText)
        # Follow the first 'next' link if the server supplied one; None ends the loop.
        nextUrl = next(
            (
                link['url']
                for link in responseText.get('link', [])
                if link.get('relation') == 'next'
            ),
            None,
        )
    return data

### Obtain high-risk patient IDs from FHIR

In [3]:
import itertools

# Bounds of the risk-probability window sent to the server as `ge` / `le`
# filters on RiskAssessment.probability.
[lowerRiskScore, higherRiskScore] = [0.5, 1.0]
query = 'http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge' + str(lowerRiskScore) + '&_has:RiskAssessment:subject:probability=le' + str(higherRiskScore)
response = readData(query)

# Flatten the pages into a single list of patient ids.
# * `.get('entry', [])`: a search bundle with no matches carries no `entry`
#   key, so indexing it directly would raise KeyError.
# * `[1:]` drops the first character of each resource id -- presumably a
#   one-character prefix in front of the numeric id; TODO confirm.
highriskPatientIds = [
    entry['resource']['id'][1:]
    for data in response
    for entry in data.get('entry', [])
]
highriskPatientIds

Reading URL:  http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge0.5&_has:RiskAssessment:subject:probability=le1.0
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=0e7398ab-e927-4f59-bae7-cd4f3f07579b&_getpagesoffset=20&_count=20&_pretty=true&_bundletype=searchset


['745962',
 '2150228',
 '2142899',
 '677694',
 '2202086',
 '2172200',
 '546011',
 '1013210',
 '2103171',
 '2013664',
 '2120861',
 '2448944',
 '2153643',
 '1581023',
 '2140940',
 '631550',
 '2198313',
 '2161817',
 '2107492',
 '2141593',
 '549608',
 '2160210',
 '2092580',
 '2156000',
 '637422',
 '2199705']

## Map tube codes for high-risk patients

In [4]:
import os

import pandas as pd


# Load the tab-separated tube-code / patient mapping table from the directory
# named by the GENOMICS_DATA_BASE environment variable.
mappingFilePath = os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv'
mappingDf = pd.read_csv(mappingFilePath, sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [5]:
# The ids collected from FHIR are strings; convert them to int before matching
# them against the PATIENT_ID column of the mapping table.
highriskPatientIdNumbers = [int(patientId) for patientId in highriskPatientIds]
isHighriskRow = mappingDf.PATIENT_ID.isin(highriskPatientIdNumbers)

# Tube codes of every mapping row that belongs to a high-risk patient.
highriskTubecodes = mappingDf.loc[isHighriskRow, 'tube_code'].tolist()
highriskTubecodes

['AH20I057',
 'AH21C022',
 'AH18J081',
 'AH18J080',
 'AH19L077',
 'AH20F012',
 'AH19J028',
 'AH21A034',
 'AH20H023',
 'AH19K013',
 'AH20L008',
 'AH19E069',
 'AH20L085',
 'AH20E051',
 'AH20C005',
 'AH18K024',
 'AH20K063',
 'AH19C012',
 'AH19B014',
 'AH19A043',
 'AH19B026',
 'AH19C070',
 'AH20I042',
 'AH18J055',
 'AH19F055',
 'AH20G044',
 'AH21A036']

## Read annotations

In [6]:
import os

import pandas as pd


# Per-file result tables, split into the high-risk cohort and everything else.
highriskAnnotationsDfList = []
controlAnnotationsDfList = []

gffDir = os.environ['GENOMICS_DATA_BASE'] + '/amrfinder'

# Set lookup instead of scanning the list once per file.
highriskTubecodeSet = set(highriskTubecodes)

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated frames reproducible between runs and machines.
for fileName in sorted(os.listdir(gffDir)):

    filePath = os.path.join(gffDir, fileName)

    # Skip sub-directories and hidden entries (e.g. editor/OS artefacts); they
    # are not result tables and would make read_csv fail or add junk rows.
    if fileName.startswith('.') or not os.path.isfile(filePath):
        continue

    # The tube code is the part of the file name before the first '.' and,
    # within that, before the first '_'.
    tubeCode = fileName.split('.')[0].split('_')[0]

    amrResultsDf = pd.read_csv(
        filePath,
        sep='\t',
    )
    # Tag every row with its tube code so rows stay traceable after concat.
    amrResultsDf['tube_code'] = tubeCode

    # Every file whose tube code is not in the high-risk list is a control.
    if tubeCode in highriskTubecodeSet:
        highriskAnnotationsDfList.append(amrResultsDf)
    else:
        controlAnnotationsDfList.append(amrResultsDf)

# pd.concat raises ValueError if either list is empty (no files in that cohort).
highriskAnnotationsDf = pd.concat(highriskAnnotationsDfList, ignore_index=True)
controlAnnotationsDf = pd.concat(controlAnnotationsDfList, ignore_index=True)

highriskAnnotationsDf.shape, controlAnnotationsDf.shape

((597, 23), (57018, 23))

In [7]:
# Number of distinct tube codes in each cohort (high-risk, control).
highriskAnnotationsDf.tube_code.nunique(), controlAnnotationsDf.tube_code.nunique()

(27, 2953)

## Perform chi-square tests

In [8]:
# Distinct (Element type, Element subtype) combinations found in the control annotations.
controlAnnotationsDf.loc[:, ['Element type', 'Element subtype']].drop_duplicates()

Unnamed: 0,Element type,Element subtype
0,VIRULENCE,VIRULENCE
3,AMR,AMR
7,STRESS,BIOCIDE
18,STRESS,METAL
29,STRESS,ACID
220,STRESS,HEAT


In [13]:
import scipy.stats as stats


def _countGenesBySymbol(annotationsDf, annotationType, countColumn):
    """Count annotation rows per gene symbol for a single element type.

    Rows of ``annotationsDf`` whose 'Element type' equals ``annotationType``
    are grouped by 'Gene symbol'; the number of non-null 'Contig id' values in
    each group is returned in a column named ``countColumn``.
    """
    rowsOfType = annotationsDf[annotationsDf['Element type'] == annotationType]
    countsDf = rowsOfType[['Contig id', 'Gene symbol']].groupby(by=['Gene symbol']).count()
    return countsDf.reset_index().rename(columns={'Contig id': countColumn})


# Only 'AMR' is analysed here; the loop can be widened to
# ['VIRULENCE', 'AMR', 'STRESS'] to cover the other element types.
for annotationType in ['AMR']:

    print('annotationType: ', annotationType)

    highriskGenecountsDf = _countGenesBySymbol(
        highriskAnnotationsDf, annotationType, 'high_risk_genes_count'
    )
    controlGenecountsDf = _countGenesBySymbol(
        controlAnnotationsDf, annotationType, 'control_genes_count'
    )

    # Left merge with the control table on the left: every control gene is
    # kept (missing high-risk counts become 0), while genes that occur only in
    # the high-risk cohort are not part of the comparison.
    mergedGenecountsDf = controlGenecountsDf.merge(
        highriskGenecountsDf,
        how='left',
        on=['Gene symbol'],
    ).fillna(0)

    # Share of each gene among all control gene hits.
    controlTotal = mergedGenecountsDf.control_genes_count.sum()
    mergedGenecountsDf['control_genes_proportion'] = mergedGenecountsDf.control_genes_count/controlTotal

    # Expected high-risk count = control share scaled to the high-risk total,
    # so observed and expected sum to the same value.
    highriskTotal = mergedGenecountsDf.high_risk_genes_count.sum()
    mergedGenecountsDf['expected_genes_count'] = mergedGenecountsDf.control_genes_proportion * highriskTotal

    # NOTE(review): many expected counts are far below 1; the chi-square
    # approximation is usually considered unreliable for such small expected
    # frequencies -- confirm the p-value is meaningful before relying on it.
    chisquareResult = stats.chisquare(
        f_obs=mergedGenecountsDf.high_risk_genes_count,
        f_exp=mergedGenecountsDf.expected_genes_count,
    )
    print(chisquareResult)

annotationType:  AMR
Power_divergenceResult(statistic=735.394986353418, pvalue=2.4268692591733216e-15)


In [14]:
# (expected - observed) / expected, as a percentage.
# Positive: the gene is seen less often in the high-risk cohort than expected;
# negative: it is seen more often than expected.
mergedGenecountsDf['percentage_difference'] = (
    mergedGenecountsDf.expected_genes_count
    .sub(mergedGenecountsDf.high_risk_genes_count)
    .div(mergedGenecountsDf.expected_genes_count)
    .mul(100)
)
mergedGenecountsDf

Unnamed: 0,Gene symbol,control_genes_count,high_risk_genes_count,control_genes_proportion,expected_genes_count,percentage_difference
0,aac(3)-IId,76,0.0,0.004225,0.612665,100.0
1,aac(3)-IIe,19,0.0,0.001056,0.153166,100.0
2,aac(3)-IIg,1,0.0,0.000056,0.008061,100.0
3,aac(3)-IVa,1,0.0,0.000056,0.008061,100.0
4,aac(6'),36,0.0,0.002001,0.290210,100.0
...,...,...,...,...,...,...
453,vanX-B,135,0.0,0.007505,1.088286,100.0
454,vanXY-C,8,0.0,0.000445,0.064491,100.0
455,vanY-A,20,0.0,0.001112,0.161228,100.0
456,vanY-B,134,0.0,0.007450,1.080225,100.0


In [15]:
# Genes that never occur in the high-risk cohort.
mergedGenecountsDf.loc[mergedGenecountsDf.high_risk_genes_count.eq(0)]

Unnamed: 0,Gene symbol,control_genes_count,high_risk_genes_count,control_genes_proportion,expected_genes_count,percentage_difference
0,aac(3)-IId,76,0.0,0.004225,0.612665,100.0
1,aac(3)-IIe,19,0.0,0.001056,0.153166,100.0
2,aac(3)-IIg,1,0.0,0.000056,0.008061,100.0
3,aac(3)-IVa,1,0.0,0.000056,0.008061,100.0
4,aac(6'),36,0.0,0.002001,0.290210,100.0
...,...,...,...,...,...,...
453,vanX-B,135,0.0,0.007505,1.088286,100.0
454,vanXY-C,8,0.0,0.000445,0.064491,100.0
455,vanY-A,20,0.0,0.001112,0.161228,100.0
456,vanY-B,134,0.0,0.007450,1.080225,100.0


In [16]:
# Genes seen at least once in the high-risk cohort, most over-represented
# (most negative percentage_difference) first.
mergedGenecountsDf.loc[mergedGenecountsDf.high_risk_genes_count.ne(0)].sort_values(by='percentage_difference')

Unnamed: 0,Gene symbol,control_genes_count,high_risk_genes_count,control_genes_proportion,expected_genes_count,percentage_difference
341,fusC,12,2.0,0.000667,0.096737,-1967.471264
428,tet(K),19,2.0,0.001056,0.153166,-1205.771325
357,mecA,85,6.0,0.004726,0.685217,-775.634888
207,blaPC1,65,4.0,0.003614,0.52399,-663.374005
363,mepA,535,26.0,0.029744,4.312837,-502.851434
420,tet(38),535,26.0,0.029744,4.312837,-502.851434
359,mecR1,64,3.0,0.003558,0.515928,-481.476293
109,blaI,451,21.0,0.025074,3.635681,-477.60838
288,blaZ,379,17.0,0.021071,3.055262,-456.417069
244,blaR1,313,14.0,0.017401,2.523211,-454.848518
