# Perform Statistical Analysis

This analysis is based on: https://www.kaggle.com/code/hamelg/python-for-data-25-chi-squared-tests

## Get high risk patients

### Define a function to read data from the FHIR server

In [2]:
import json
import requests


def get(url):
    response = requests.get(
        url=url,
        headers={"Content-Type": "application/fhir+json", "authentication": "mjRmoNGW6klxaClkKhEkqi7HVYwx6NTH"},
    )
    return response


def readData(url):
    nextUrl = url
    data = []
    while(nextUrl):
        print('Reading URL: ', nextUrl)
        response = get(nextUrl)
        responseText = json.loads(response.text)
        data.append(responseText)
        nextUrl = None
        if 'link' in responseText:
            for link in responseText['link']:
                if link['relation'] == 'next':
                    nextUrl = link['url']
    return data

### Obtain high risk patient ids from FHIR

In [3]:
import itertools

[lowerRiskScore, higherRiskScore] = [0.5, 1.0]
query = 'http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge' + str(lowerRiskScore) + '&_has:RiskAssessment:subject:probability=le' + str(higherRiskScore)
response = readData(query)

highriskPatientIds = list(itertools.chain.from_iterable(list(map(lambda data: list(map(lambda entry: entry['resource']['id'][1:], data['entry'])), response))))
highriskPatientIds

Reading URL:  http://10.172.235.4:8080/fhir/Patient?_has:RiskAssessment:subject:probability=ge0.5&_has:RiskAssessment:subject:probability=le1.0
Reading URL:  http://10.172.235.4:8080/fhir?_getpages=18c36b32-e923-4f39-9f23-c1e2b503330a&_getpagesoffset=20&_count=20&_pretty=true&_bundletype=searchset


['745962',
 '2150228',
 '2142899',
 '677694',
 '2202086',
 '2172200',
 '546011',
 '1013210',
 '2103171',
 '2013664',
 '2120861',
 '2448944',
 '2153643',
 '1581023',
 '2140940',
 '631550',
 '2198313',
 '2161817',
 '2107492',
 '2141593',
 '549608',
 '2160210',
 '2092580',
 '2156000',
 '637422',
 '2199705']

## Map tube codes for high risk patients

In [4]:
import os

import pandas as pd


mappingDf = pd.read_csv(os.environ['GENOMICS_DATA_BASE'] + '/patient_tube_id_mapping_full.tsv', sep='\t')
mappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


In [5]:
highriskTubecodes = list(mappingDf[mappingDf.PATIENT_ID.isin([int(patientId) for patientId in highriskPatientIds])].tube_code)
highriskTubecodes

['AH20I057',
 'AH21C022',
 'AH18J081',
 'AH18J080',
 'AH19L077',
 'AH20F012',
 'AH19J028',
 'AH21A034',
 'AH20H023',
 'AH19K013',
 'AH20L008',
 'AH19E069',
 'AH20L085',
 'AH20E051',
 'AH20C005',
 'AH18K024',
 'AH20K063',
 'AH19C012',
 'AH19B014',
 'AH19A043',
 'AH19B026',
 'AH19C070',
 'AH20I042',
 'AH18J055',
 'AH19F055',
 'AH20G044',
 'AH21A036']

## Read annotations

In [6]:
import os

import pandas as pd


highriskAnnotationsDfList = []
controlAnnotationsDfList = []

gffDir = os.environ['GENOMICS_DATA_BASE'] + '/annotations/s_aureus_gff3'

for fileName in os.listdir(gffDir):

    tubeCode = fileName.split('.')[0]

    gffDf = pd.read_csv(
        gffDir + '/' + fileName,
        sep='\t',
        comment='#',
        names=['sequence_id', 'source', 'feature_type', 'feature_start', 'feature_end', 'score', 'strand', 'phase', 'atributes']
    )
    gffDf['id'] = gffDf.atributes.apply(lambda x: [att.split('=')[1] for att in x.split(';') if att.split('=')[0] == 'ID']).apply(lambda x: None if (len(x) == 0) else x[0])
    gffDf['name'] = gffDf.atributes.apply(lambda x: [att.split('=')[1] for att in x.split(';') if att.split('=')[0] == 'Name']).apply(lambda x: None if (len(x) == 0) else x[0])
    gffDf['gene'] = gffDf.atributes.apply(lambda x: [att.split('=')[1] for att in x.split(';') if att.split('=')[0] == 'gene']).apply(lambda x: None if (len(x) == 0) else x[0])
    gffDf['tube_code'] = tubeCode

    if tubeCode in highriskTubecodes:
        highriskAnnotationsDfList.append(gffDf)
    else:
        controlAnnotationsDfList.append(gffDf)

highriskAnnotationsDf = pd.concat(highriskAnnotationsDfList, ignore_index=True)
controlAnnotationsDf = pd.concat(controlAnnotationsDfList, ignore_index=True)

highriskAnnotationsDf.shape, controlAnnotationsDf.shape

((72545, 13), (1477006, 13))

In [7]:
len(highriskAnnotationsDf.tube_code.unique()), len(controlAnnotationsDf.tube_code.unique())

(26, 531)

## Perform chi-square tests

In [8]:
import scipy.stats as stats


dfDict = {}

for annotationType in ['CDS', 'ncRNA', 'rRNA', 'tRNA', 'tmRNA', 'regulatory_region']:

    print('annotationType: ', annotationType)

    highriskGenecountsDf = highriskAnnotationsDf[highriskAnnotationsDf.feature_type == annotationType][['id', 'gene']].groupby(
            by=['gene']
        ).agg(
            'count'
        ).reset_index().rename(columns={'id': 'high_risk_genes_count'})

    controlGenecountsDf = controlAnnotationsDf[controlAnnotationsDf.feature_type == annotationType][['id', 'gene']].groupby(
            by=['gene']
        ).agg(
            'count'
        ).reset_index().rename(columns={'id': 'control_genes_count'})

    mergedGenecountsDf = controlGenecountsDf.merge(
        highriskGenecountsDf,
        how='left',
        on=['gene']
    ).fillna(0)

    mergedGenecountsDf['control_genes_proportion'] = mergedGenecountsDf.control_genes_count/mergedGenecountsDf.control_genes_count.sum()

    mergedGenecountsDf['expected_genes_count'] = mergedGenecountsDf.control_genes_proportion * mergedGenecountsDf.high_risk_genes_count.sum()

    filteredGenecountsDf = mergedGenecountsDf[(mergedGenecountsDf.high_risk_genes_count >= 5) & (mergedGenecountsDf.high_risk_genes_count > 0)]

    dfDict[annotationType] = filteredGenecountsDf

    if(filteredGenecountsDf.shape[0] < 2):
        print('Not sufficient data for the test')
        continue

    chi2, p, dof, expected = stats.chi2_contingency(pd.crosstab(filteredGenecountsDf.high_risk_genes_count, filteredGenecountsDf.expected_genes_count), correction=True)
    significant = p < 0.05  # 5% significance level
    print(chi2, p, significant)

annotationType:  CDS
121063.34318681656 0.0 True
annotationType:  ncRNA
624.0000000000001 1.6221851272696822e-34 True
annotationType:  rRNA
3.0 0.22313016014842982 False
annotationType:  tRNA
168.00000000000003 0.0835969531563307 False
annotationType:  tmRNA
Not sufficient data for the test
annotationType:  regulatory_region
Not sufficient data for the test


In [9]:
dfDict['CDS']

Unnamed: 0,gene,control_genes_count,high_risk_genes_count,control_genes_proportion,expected_genes_count
0,aF0104,529,26.0,0.000502,25.852528
1,aRA1,1587,78.0,0.001507,77.557584
2,aRO8,508,26.0,0.000482,24.826246
3,aaa,529,26.0,0.000502,25.852528
6,abc-f,531,26.0,0.000504,25.950269
...,...,...,...,...,...
2073,zapA,531,26.0,0.000504,25.950269
2074,znuA,531,26.0,0.000504,25.950269
2075,znuB,1062,52.0,0.001008,51.900539
2076,znuC,1062,52.0,0.001008,51.900539


In [10]:
dfDict['ncRNA']

Unnamed: 0,gene,control_genes_count,high_risk_genes_count,control_genes_proportion,expected_genes_count
0,5_ureB_sRNA,531,26.0,0.012641,25.799771
1,6S,531,26.0,0.012641,25.799771
2,Bacteria_large_SRP,531,26.0,0.012641,25.799771
3,Bacteria_small_SRP,531,26.0,0.012641,25.799771
5,RNAIII,532,26.0,0.012665,25.848359
6,RNaseP_bact_b,531,26.0,0.012641,25.799771
7,RsaOG,531,26.0,0.012641,25.799771
8,S35,13791,671.0,0.328302,670.065251
9,S414,531,26.0,0.012641,25.799771
10,S774,528,26.0,0.012569,25.65401
