# Perform Statistical Analysis for AMR genes

This analysis is based on: https://www.kaggle.com/code/hamelg/python-for-data-25-chi-squared-tests

## Get low risk patients

### Define a function to read data from the FHIR server

In [1]:
import json
import requests


def get(url):
    response = requests.get(
        url=url,
        headers={"Content-Type": "application/fhir+json", "authentication": "mjRmoNGW6klxaClkKhEkqi7HVYwx6NTH"},
    )
    return response


def readData(url):
    nextUrl = url
    data = []
    while(nextUrl):
        print('Reading URL: ', nextUrl)
        response = get(nextUrl)
        responseText = json.loads(response.text)
        data.append(responseText)
        nextUrl = None
        if 'link' in responseText:
            for link in responseText['link']:
                if link['relation'] == 'next':
                    nextUrl = link['url']
    return data

### Obtain high risk patient ids from FHIR

In [2]:
import itertools

[lowerRiskScore, higherRiskScore] = [0.0, 0.5]
query = 'http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge' + str(lowerRiskScore) + '&_has:RiskAssessment:subject:probability=le' + str(higherRiskScore)
response = readData(query)

lowriskPatientIds = list(itertools.chain.from_iterable(list(map(lambda data: list(map(lambda entry: entry['resource']['id'][1:], data['entry'])), response))))
lowriskPatientIds

Reading URL:  http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge0.0&_has:RiskAssessment:subject:probability=le0.5
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=84971c3b-b81d-42dd-9876-1f8a720675cf&_getpagesoffset=20&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=84971c3b-b81d-42dd-9876-1f8a720675cf&_getpagesoffset=40&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=84971c3b-b81d-42dd-9876-1f8a720675cf&_getpagesoffset=60&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=84971c3b-b81d-42dd-9876-1f8a720675cf&_getpagesoffset=80&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=84971c3b-b81d-42dd-9876-1f8a720675cf&_getpagesoffset=100&_count=20&_pretty=true&_bundletype=searchset
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=84971c3b-b81d-42dd-9876-1f

['2092159',
 '1710435',
 '1023735',
 '745962',
 '2207219',
 '2144610',
 '2301151',
 '2125063',
 '1855696',
 '2190548',
 '2150228',
 '2142899',
 '2263467',
 '640993',
 '2198232',
 '1922539',
 '2228090',
 '2185151',
 '677694',
 '2271689',
 '2297160',
 '2115221',
 '2199146',
 '2217919',
 '2218413',
 '1584203',
 '2239222',
 '2208540',
 '2233335',
 '2126898',
 '2161817',
 '2454570',
 '1012033',
 '2109854',
 '2105752',
 '2080376',
 '2130121',
 '360621',
 '1356357',
 '2085279',
 '2191641',
 '2440791',
 '2133327',
 '2105813',
 '2248990',
 '2384522',
 '2374110',
 '2164122',
 '2526936',
 '1646600',
 '2501276',
 '2141952',
 '2134257',
 '1025623',
 '2166951',
 '2421109',
 '2202499',
 '2130908',
 '2221447',
 '110792',
 '2144001',
 '2236567',
 '2010485',
 '2252017',
 '1804919',
 '1536492',
 '2164891',
 '2526402',
 '2170155',
 '2044946',
 '2205992',
 '2188910',
 '2082838',
 '2117184',
 '2183249',
 '2116081',
 '2206239',
 '2110697',
 '1881022',
 '2084529',
 '1889861',
 '2223520',
 '637422',
 '2042123'

## Map tube codes for low risk patients

In [3]:
import os

import pandas as pd


mappingDf = pd.read_csv(os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv', sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [4]:
lowriskTubecodes = list(mappingDf[mappingDf.PATIENT_ID.isin([int(patientId) for patientId in lowriskPatientIds])].tube_code)
lowriskTubecodes

['AH20L041',
 'AH20C048',
 'AH18K050',
 'AH20J071',
 'AH19G021',
 'AH20A031',
 'AH21E085',
 'AH21G070',
 'AH20I016',
 'AH21A081',
 'AH21E065',
 'AH20I005',
 'AH20J009',
 'KPN2214',
 'AH21B002',
 'AH19K005',
 'AH21H055',
 'AH20G075',
 'AH20A054',
 'AH21A083',
 'AH19J022',
 'AH20J052',
 'AH21G019',
 'AH20I057',
 'AH21C022',
 'AH20F085',
 'AH18J081',
 'AH18J080',
 'AH18J065',
 'AH20F067',
 'AH21A011',
 'AH18K044',
 'AH19L077',
 'AH21A043',
 'AH20F012',
 'AH18K062',
 'AH21C014',
 'AH21H007',
 'AH21H060',
 'AH19J028',
 'AH20C034',
 'AH20H038',
 'AH20C060',
 'AH19L044',
 'AH21E069',
 'AH21B039',
 'AH21A010',
 'AH21E001',
 'AH21A075',
 'AH20I050',
 'AH20A021',
 'AH20A022',
 'AH21A034',
 'AH19I035',
 'AH21D030',
 'AH19I040',
 'AH20L029',
 'AH20D059',
 'AH20D058',
 'AH20J031',
 'AH20L010',
 'AH21E083',
 'AH21H015',
 'AH19H053',
 'AH20E048',
 'AH20H023',
 'AH20G045',
 'AH20H053',
 'AH21G032',
 'AH20G069',
 'AH20G070',
 'AH21B028',
 'AH19J034',
 'AH20I032',
 'AH21D047',
 'AH21D049',
 'AH20A048',


## Read annotations

In [5]:
import os

import pandas as pd


lowriskAnnotationsDfList = []
controlAnnotationsDfList = []

gffDir = os.environ['GENOMICS_DATA_BASE'] + '/amrfinder'

for fileName in os.listdir(gffDir):

    tubeCode = fileName.split('.')[0].split('_')[0]

    amrResultsDf = pd.read_csv(
        gffDir + '/' + fileName,
        sep='\t',
    )
    amrResultsDf['tube_code'] = tubeCode

    if tubeCode in lowriskTubecodes:
        lowriskAnnotationsDfList.append(amrResultsDf)
    else:
        controlAnnotationsDfList.append(amrResultsDf)

lowriskAnnotationsDf = pd.concat(lowriskAnnotationsDfList, ignore_index=True)
controlAnnotationsDf = pd.concat(controlAnnotationsDfList, ignore_index=True)

lowriskAnnotationsDf.shape, controlAnnotationsDf.shape

((8644, 23), (48971, 23))

In [6]:
len(lowriskAnnotationsDf.tube_code.unique()), len(controlAnnotationsDf.tube_code.unique())

(383, 2597)

## Perform chi-square tests

In [7]:
controlAnnotationsDf[['Element type', 'Element subtype']].drop_duplicates()

Unnamed: 0,Element type,Element subtype
0,VIRULENCE,VIRULENCE
3,AMR,AMR
7,STRESS,BIOCIDE
18,STRESS,METAL
29,STRESS,ACID
220,STRESS,HEAT


In [8]:
import scipy.stats as stats


dfDict = {}

for annotationType in ['VIRULENCE', 'AMR', 'STRESS']:

    print('annotationType: ', annotationType)

    highriskGenecountsDf = lowriskAnnotationsDf[lowriskAnnotationsDf['Element type'] == annotationType][['Contig id', 'Gene symbol']].groupby(
            by=['Gene symbol']
        ).agg(
            'count'
        ).reset_index().rename(columns={'Contig id': 'low_risk_genes_count'})

    controlGenecountsDf = controlAnnotationsDf[controlAnnotationsDf['Element type'] == annotationType][['Contig id', 'Gene symbol']].groupby(
            by=['Gene symbol']
        ).agg(
            'count'
        ).reset_index().rename(columns={'Contig id': 'control_genes_count'})

    mergedGenecountsDf = controlGenecountsDf.merge(
        highriskGenecountsDf,
        how='left',
        on=['Gene symbol']
    ).fillna(0)

    filteredGenecountsDf = mergedGenecountsDf[(mergedGenecountsDf.low_risk_genes_count >= 5) & (mergedGenecountsDf.low_risk_genes_count > 0)]

    filteredGenecountsDf['control_genes_proportion'] = filteredGenecountsDf.control_genes_count/filteredGenecountsDf.control_genes_count.sum()

    filteredGenecountsDf['expected_genes_count'] = filteredGenecountsDf.control_genes_proportion * filteredGenecountsDf.low_risk_genes_count.sum()

    dfDict[annotationType] = filteredGenecountsDf

    if(filteredGenecountsDf.shape[0] < 2):
        print('Not sufficient data for the test')
        continue

    chi2, p = stats.chisquare(f_obs=filteredGenecountsDf.low_risk_genes_count, f_exp=filteredGenecountsDf.expected_genes_count)
    significant = p < 0.05  # 5% significance level
    print(chi2, p, significant)

annotationType:  VIRULENCE
13423.385443372055 0.0 True
annotationType:  AMR
10118.152242650896 0.0 True
annotationType:  STRESS
9003.92236237017 0.0 True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredGenecountsDf['control_genes_proportion'] = filteredGenecountsDf.control_genes_count/filteredGenecountsDf.control_genes_count.sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredGenecountsDf['expected_genes_count'] = filteredGenecountsDf.control_genes_proportion * filteredGenecountsDf.low_risk_genes_count.sum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

In [9]:
dfDict['VIRULENCE']

Unnamed: 0,Gene symbol,control_genes_count,low_risk_genes_count,control_genes_proportion,expected_genes_count
16,aur,231,330.0,0.018871,97.732947
22,cna,22,43.0,0.001797,9.3079
28,ednB,6,7.0,0.00049,2.538518
37,eta,6,7.0,0.00049,2.538518
41,fdeC,847,10.0,0.069194,358.354138
44,hld,231,330.0,0.018871,97.732947
45,hlgA,227,330.0,0.018544,96.040601
46,hlgB,227,330.0,0.018544,96.040601
47,hlgC,227,329.0,0.018544,96.040601
51,icaC,232,331.0,0.018953,98.156033


In [10]:
dfDict['AMR']

Unnamed: 0,Gene symbol,control_genes_count,low_risk_genes_count,control_genes_proportion,expected_genes_count
5,aac(6')-I,216,6.0,0.022568,45.74569
15,aac(6')-Ie/aph(2'')-Ia,113,15.0,0.011806,23.931773
30,acrF,869,10.0,0.090795,184.041688
40,ant(9)-Ia,68,15.0,0.007105,14.401421
41,aph(3'')-Ib,265,8.0,0.027688,56.123185
42,aph(3')-IIIa,53,5.0,0.005538,11.224637
49,aph(6)-Id,267,8.0,0.027897,56.546756
106,blaEC,665,8.0,0.069481,140.837426
108,blaI,192,280.0,0.020061,40.662836
206,blaPC1,26,43.0,0.002717,5.506426


In [11]:
dfDict['STRESS']

Unnamed: 0,Gene symbol,control_genes_count,low_risk_genes_count,control_genes_proportion,expected_genes_count
0,arsA,162,5.0,0.01295,14.503597
1,arsB,223,67.0,0.017826,19.964828
2,arsC,1162,77.0,0.092886,104.031974
3,arsD,236,8.0,0.018865,21.128697
6,arsR,379,65.0,0.030296,33.931255
7,asr,864,10.0,0.069065,77.352518
12,cadD,145,205.0,0.011591,12.981615
16,copB,233,7.0,0.018625,20.860112
20,emrE,842,13.0,0.067306,75.382894
21,fieF,1422,29.0,0.113669,127.309353


## Old code

In [12]:
mergedGenecountsDf['percentage_difference'] = (mergedGenecountsDf.expected_genes_count - mergedGenecountsDf.low_risk_genes_count)/mergedGenecountsDf.expected_genes_count * 100
mergedGenecountsDf

AttributeError: 'DataFrame' object has no attribute 'expected_genes_count'

In [None]:
mergedGenecountsDf.sort_values(by=['percentage_difference'])[:20]

Unnamed: 0,Gene symbol,control_genes_count,low_risk_genes_count,control_genes_proportion,expected_genes_count,percentage_difference
26,mco,21,39.0,0.001513,1.75479,-2122.487685
52,qacA,10,16.0,0.00072,0.835614,-1814.758621
25,lmrS,212,304.0,0.015272,17.715027,-1616.057254
12,cadD,145,205.0,0.010445,12.11641,-1591.920333
62,qacR,13,17.0,0.000936,1.086299,-1464.94695
60,qacJ,1,1.0,7.2e-05,0.083561,-1096.724138
54,qacC,27,17.0,0.001945,2.256159,-653.492976
28,merB,8,3.0,0.000576,0.668492,-348.771552
11,cadC,6,2.0,0.000432,0.501369,-298.908046
53,qacB,3,1.0,0.000216,0.250684,-298.908046


In [None]:
mergedGenecountsDf[mergedGenecountsDf.low_risk_genes_count > 0].sort_values(by=['percentage_difference'], ascending=False)[:20]

Unnamed: 0,Gene symbol,control_genes_count,low_risk_genes_count,control_genes_proportion,expected_genes_count,percentage_difference
82,terZ,99,1.0,0.007132,8.272583,87.911877
7,asr,864,10.0,0.062239,72.19709,86.149026
86,ymgB,838,10.0,0.060366,70.024492,85.719282
79,terD,158,2.0,0.011382,13.202709,84.851593
14,clpK,66,1.0,0.004754,5.515055,81.867816
20,emrE,842,13.0,0.060654,70.358738,81.523262
21,fieF,1422,29.0,0.102435,118.824377,75.594233
48,pcoE,143,3.0,0.010301,11.949287,74.893899
75,ssmE,39,1.0,0.002809,3.258896,69.314766
63,sdeA,39,1.0,0.002809,3.258896,69.314766


In [None]:
mergedGenecountsDf[mergedGenecountsDf.low_risk_genes_count == 0]

Unnamed: 0,Gene symbol,control_genes_count,low_risk_genes_count,control_genes_proportion,expected_genes_count,percentage_difference
4,arsD2,1,0.0,7.2e-05,0.083561,100.0
5,arsH,1,0.0,7.2e-05,0.083561,100.0
8,bcrB,2,0.0,0.000144,0.167123,100.0
9,bcrC,2,0.0,0.000144,0.167123,100.0
10,cadA,6,0.0,0.000432,0.501369,100.0
13,chrA,1,0.0,7.2e-05,0.083561,100.0
19,dpsA,2,0.0,0.000144,0.167123,100.0
40,nirA,2,0.0,0.000144,0.167123,100.0
41,nirB,3,0.0,0.000216,0.250684,100.0
42,nirD,3,0.0,0.000216,0.250684,100.0
