# Perform Statistical Analysis for AMR genes

This analysis is based on: https://www.kaggle.com/code/hamelg/python-for-data-25-chi-squared-tests

## Get low risk patients

### Define a function to read data from the FHIR server

In [1]:
import json
import requests


def get(url):
    """Send a GET request to the FHIR server and return the raw response.

    The authentication token is read from the ``FHIR_AUTH_TOKEN`` environment
    variable so that the secret is never stored in the notebook itself.

    Args:
        url: Absolute URL of the FHIR resource or search to fetch.

    Returns:
        The ``requests.Response`` object returned by ``requests.get``.

    Raises:
        RuntimeError: If ``FHIR_AUTH_TOKEN`` is unset or empty.
    """
    import os  # local import keeps this definition self-contained

    token = os.environ.get("FHIR_AUTH_TOKEN")
    if not token:
        raise RuntimeError(
            "FHIR_AUTH_TOKEN environment variable is not set; "
            "it must hold the FHIR server authentication token"
        )

    response = requests.get(
        url=url,
        headers={"Content-Type": "application/fhir+json", "authentication": token},
    )
    return response


def readData(url):
    """Fetch a FHIR search result and every following page.

    Starting at ``url``, each response body is parsed as JSON and collected.
    If the parsed body has a ``link`` list containing an item whose
    ``relation`` is ``'next'``, that item's ``url`` is fetched next; when
    several such items exist the last one is used. Reading stops when no
    ``next`` link is present.

    Args:
        url: URL of the first page to read.

    Returns:
        A list with one parsed JSON document per page, in the order read.
    """
    pages = []
    pendingUrl = url
    while pendingUrl:
        print('Reading URL: ', pendingUrl)
        page = json.loads(get(pendingUrl).text)
        pages.append(page)
        pageLinks = page['link'] if 'link' in page else []
        followUps = [item['url'] for item in pageLinks if item['relation'] == 'next']
        # The last matching link wins, mirroring a plain overwrite-in-loop.
        pendingUrl = followUps[-1] if followUps else None
    return pages

### Obtain low risk patient ids from FHIR

In [2]:
import itertools

# Inclusive risk-probability window used to select the low-risk cohort.
[lowerRiskScore, higherRiskScore] = [0.0, 0.1]
# Select patients referenced by a RiskAssessment whose probability is
# >= lowerRiskScore ("ge") and <= higherRiskScore ("le").
query = 'http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge' + str(lowerRiskScore) + '&_has:RiskAssessment:subject:probability=le' + str(higherRiskScore)
response = readData(query)

# Flatten the ids from every page. A page without matches carries no 'entry'
# key, so default to an empty list instead of raising KeyError.
# The first character of each id is dropped (presumably a one-letter prefix
# on the server-side ids -- TODO confirm).
lowriskPatientIds = [
    entry['resource']['id'][1:]
    for data in response
    for entry in data.get('entry', [])
]
lowriskPatientIds

Reading URL:  http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge0.0&_has:RiskAssessment:subject:probability=le0.1
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=eccfe6ab-d57d-4b9b-a614-dfe0e9767be8&_getpagesoffset=20&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=eccfe6ab-d57d-4b9b-a614-dfe0e9767be8&_getpagesoffset=40&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=eccfe6ab-d57d-4b9b-a614-dfe0e9767be8&_getpagesoffset=60&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=eccfe6ab-d57d-4b9b-a614-dfe0e9767be8&_getpagesoffset=80&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=eccfe6ab-d57d-4b9b-a614-dfe0e9767be8&_getpagesoffset=100&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=eccfe6ab-d57d-4b9b-a614-df

['2092159',
 '1710435',
 '2301151',
 '745962',
 '2125063',
 '1855696',
 '2150228',
 '2207219',
 '1023735',
 '1012033',
 '2085279',
 '2263467',
 '640993',
 '2198232',
 '1922539',
 '2228090',
 '2185151',
 '2271689',
 '2190548',
 '2218413',
 '2239222',
 '2208540',
 '2233335',
 '2454570',
 '677694',
 '2297160',
 '2115221',
 '2199146',
 '2183249',
 '1584203',
 '2208200',
 '2098703',
 '2202499',
 '2130908',
 '110792',
 '2144610',
 '2217919',
 '2142899',
 '2126898',
 '2109854',
 '2105752',
 '2080376',
 '2130121',
 '360621',
 '1356357',
 '2161817',
 '2191641',
 '2440791',
 '2133327',
 '2105813',
 '2248990',
 '2384522',
 '2374110',
 '2164122',
 '2526936',
 '1646600',
 '2501276',
 '2141952',
 '2134257',
 '2421109',
 '2221447',
 '2144001',
 '2236567',
 '2010485',
 '2252017',
 '1804919',
 '1536492',
 '2164891',
 '2526402',
 '2170155',
 '2213577',
 '2013664',
 '2106126',
 '2120861',
 '2126985',
 '2155539',
 '2480818',
 '1025623',
 '2044946',
 '2205992',
 '2188910',
 '2082838',
 '2116081',
 '2206239

## Map tube codes for low risk patients

In [3]:
import os

import pandas as pd


# Tab-separated table relating each tube_code to a PATIENT_ID; it lives under
# the directory named by the GENOMICS_DATA_BASE environment variable.
mappingFilePath = os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv'
mappingDf = pd.read_csv(
    mappingFilePath,
    sep='\t',
)
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [4]:
# The ids fetched from FHIR are strings; convert them to integers before
# matching them against the PATIENT_ID column of the mapping table.
lowriskPatientIdNumbers = [int(patientId) for patientId in lowriskPatientIds]
isLowriskRow = mappingDf['PATIENT_ID'].isin(lowriskPatientIdNumbers)

# Tube codes of every mapping row that belongs to a low-risk patient.
lowriskTubecodes = list(mappingDf.loc[isLowriskRow, 'tube_code'])
lowriskTubecodes

['AH20L041',
 'AH20C048',
 'AH18K050',
 'AH20J071',
 'AH19G021',
 'AH20A031',
 'AH21E085',
 'AH21G070',
 'AH20I016',
 'AH21A081',
 'AH21E065',
 'AH20I005',
 'AH20J009',
 'KPN2214',
 'AH21B002',
 'AH19K005',
 'AH21H055',
 'AH20G075',
 'AH20A054',
 'AH21A083',
 'AH19J022',
 'AH20J052',
 'AH21G019',
 'AH20I057',
 'AH21C022',
 'AH20F085',
 'AH18J081',
 'AH18J080',
 'AH18J065',
 'AH20F067',
 'AH21A011',
 'AH18K044',
 'AH19L077',
 'AH21A043',
 'AH20F012',
 'AH18K062',
 'AH21C014',
 'AH21H007',
 'AH21H060',
 'AH19J028',
 'AH20C034',
 'AH20H038',
 'AH20C060',
 'AH19L044',
 'AH21E069',
 'AH21B039',
 'AH21A010',
 'AH21E001',
 'AH21A075',
 'AH20I050',
 'AH20A021',
 'AH20A022',
 'AH21A034',
 'AH19I035',
 'AH21D030',
 'AH19I040',
 'AH20L029',
 'AH20D059',
 'AH20D058',
 'AH20J031',
 'AH20L010',
 'AH21E083',
 'AH21H015',
 'AH19H053',
 'AH20E048',
 'AH20H023',
 'AH20G045',
 'AH20H053',
 'AH21G032',
 'AH20G069',
 'AH20G070',
 'AH21B028',
 'AH19J034',
 'AH20I032',
 'AH21D047',
 'AH21D049',
 'AH20A048',


## Read annotations

In [5]:
import os

import pandas as pd


# Directory holding one tab-separated result file per sample.
gffDir = os.environ['GENOMICS_DATA_BASE'] + '/amrfinder'

lowriskAnnotationsDfList = []
controlAnnotationsDfList = []

for fileName in os.listdir(gffDir):
    # The tube code is the file name up to its first '.', then up to its first '_'.
    tubeCode = fileName.partition('.')[0].partition('_')[0]

    amrResultsDf = pd.read_csv(gffDir + '/' + fileName, sep='\t')
    amrResultsDf['tube_code'] = tubeCode

    # Every file whose tube code is not in the low-risk list counts as control.
    targetDfList = lowriskAnnotationsDfList if tubeCode in lowriskTubecodes else controlAnnotationsDfList
    targetDfList.append(amrResultsDf)

# Stack the per-file frames into one frame per group with a fresh row index.
lowriskAnnotationsDf = pd.concat(lowriskAnnotationsDfList, ignore_index=True)
controlAnnotationsDf = pd.concat(controlAnnotationsDfList, ignore_index=True)

lowriskAnnotationsDf.shape, controlAnnotationsDf.shape

((8644, 23), (48971, 23))

In [6]:
# Number of distinct tube codes in each group (low risk, control).
lowriskAnnotationsDf.tube_code.nunique(dropna=False), controlAnnotationsDf.tube_code.nunique(dropna=False)

(383, 2597)

## Perform chi-square tests

In [7]:
# Distinct (Element type, Element subtype) pairs present in the control group.
controlAnnotationsDf.loc[:, ['Element type', 'Element subtype']].drop_duplicates()

Unnamed: 0,Element type,Element subtype
0,VIRULENCE,VIRULENCE
3,AMR,AMR
7,STRESS,BIOCIDE
18,STRESS,METAL
29,STRESS,ACID
220,STRESS,HEAT


In [None]:
import scipy.stats as stats


# For each annotation type, run a chi-square goodness-of-fit test comparing
# the per-gene counts seen in the low-risk group with the counts expected if
# the low-risk group followed the control group's gene proportions.
for annotationType in ['VIRULENCE', 'AMR', 'STRESS']:

    print('annotationType: ', annotationType)

    # Per-gene number of rows with a non-null 'Contig id' in the low-risk group.
    lowriskGenecountsDf = (
        lowriskAnnotationsDf[lowriskAnnotationsDf['Element type'] == annotationType]
        .groupby('Gene symbol')['Contig id']
        .count()
        .reset_index(name='low_risk_genes_count')
    )

    # Same per-gene count for the control group.
    controlGenecountsDf = (
        controlAnnotationsDf[controlAnnotationsDf['Element type'] == annotationType]
        .groupby('Gene symbol')['Contig id']
        .count()
        .reset_index(name='control_genes_count')
    )

    # Left merge keeps exactly the genes seen in the control group: control
    # genes absent from the low-risk group get a count of 0, while genes seen
    # only in the low-risk group are dropped from the comparison.
    mergedGenecountsDf = controlGenecountsDf.merge(
        lowriskGenecountsDf,
        how='left',
        on=['Gene symbol']
    ).fillna(0)

    lowriskGenesTotal = mergedGenecountsDf.low_risk_genes_count.sum()
    if mergedGenecountsDf.empty or lowriskGenesTotal == 0:
        # Expected counts would be empty or all zero, so the statistic is undefined.
        print('Skipping chi-square test: no gene counts to compare for this annotation type')
        continue

    mergedGenecountsDf['control_genes_proportion'] = mergedGenecountsDf.control_genes_count/mergedGenecountsDf.control_genes_count.sum()

    # Scale the control proportions to the low-risk total so that observed and
    # expected counts sum to the same value.
    mergedGenecountsDf['expected_genes_count'] = mergedGenecountsDf.control_genes_proportion * lowriskGenesTotal

    # NOTE(review): the chi-square approximation is usually considered
    # unreliable when many expected counts are small -- confirm it holds here.
    print(stats.chisquare(f_obs= mergedGenecountsDf.low_risk_genes_count, f_exp= mergedGenecountsDf.expected_genes_count))

annotationType:  VIRULENCE
Power_divergenceResult(statistic=21519.06579818226, pvalue=0.0)
annotationType:  AMR
Power_divergenceResult(statistic=14401.686572626739, pvalue=0.0)
annotationType:  STRESS
Power_divergenceResult(statistic=9503.730140594736, pvalue=0.0)
