# Create data to perform Multiple Sequence Alignment for the E. coli cohort

## Function to obtain gene locations

In [1]:
def getGeneLocations(gff3Lines, fasta_start_index, geneName):
    """Find the first annotated feature whose ``gene`` attribute equals ``geneName``.

    Parameters
    ----------
    gff3Lines : list of str
        All lines of a GFF3 file (annotation rows, optionally followed by FASTA).
    fasta_start_index : int or None
        Index of the first line that is NOT annotation; only
        ``gff3Lines[:fasta_start_index]`` is parsed. ``None`` parses every line.
    geneName : str
        Value to match against the ``gene=`` key of the attributes column.

    Returns
    -------
    tuple
        ``(seq_id, strand, start, end)`` of the first matching row, or
        ``(None, None, None, None)`` when no row matches.
    """
    import pandas as pd
    from io import StringIO

    gff3Info = StringIO("".join(gff3Lines[: fasta_start_index]))

    df = pd.read_csv(
        gff3Info,
        sep='\t',
        comment='#',
        header=None,
        names=['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'],
        # Keep identifiers and attributes as text: numeric-looking contig IDs such as
        # "007" must not be coerced to 7, or they stop matching the FASTA headers.
        dtype={'seq_id': str, 'attributes': str},
    )

    # Pull the value of the first `gene=` key out of the ';'-separated attributes.
    # The vectorised extract yields NaN (instead of raising) for rows that have no
    # attributes at all or no `gene` key.
    df['gene'] = df['attributes'].str.extract(r'(?:^|;)gene=([^;=]*)', expand=False)

    geneDf = df[df.gene == geneName]
    if geneDf.empty:
        return None, None, None, None

    row = geneDf.iloc[0]
    return row.seq_id, row.strand, row.start, row.end

## Function to obtain sequence of a gene

In [2]:
def getSequenceData(gff3Lines, fasta_start_index, seqId, strand, startLocation, endLocation):
    """Extract the nucleotide sequence of one feature from the FASTA part of a GFF3 file.

    Parameters
    ----------
    gff3Lines : list of str
        All lines of the GFF3 file.
    fasta_start_index : int
        Index of the first FASTA line; ``gff3Lines[fasta_start_index:]`` is parsed as FASTA.
    seqId : str
        Identifier of the FASTA record that carries the feature.
    strand : str
        ``'+'`` returns the region as stored; ``'-'`` returns its reverse complement.
    startLocation, endLocation : int
        Feature bounds, used as the slice ``[startLocation - 1:endLocation]``.

    Returns
    -------
    Bio.Seq.Seq
        The extracted sequence. An empty ``Seq`` is returned for any other strand value.

    Raises
    ------
    KeyError
        If ``seqId`` is not among the parsed FASTA records (for ``'+'``/``'-'`` strands).
    """
    from io import StringIO
    from Bio import SeqIO
    from Bio.Seq import Seq

    # Anything other than an explicit strand yields an empty sequence, as before,
    # but now as a Seq so every code path returns the same type.
    if strand not in ('+', '-'):
        return Seq('')

    fasta_data = StringIO("".join(gff3Lines[fasta_start_index:]))
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_data, "fasta"))

    # start - 1 converts the 1-based inclusive start into a 0-based slice start.
    region = seq_dict[seqId].seq[startLocation - 1:endLocation]

    if strand == '+':
        return region

    # Biopython's reverse_complement() handles N and the other IUPAC ambiguity codes,
    # which the previous four-letter lookup table rejected with a KeyError.
    # upper() keeps the upper-cased output the minus-strand branch has always produced.
    return region.reverse_complement().upper()

## Read data

### Tube ID mapping

In [3]:
import os
from pathlib import Path

import pandas as pd


# Tube-code to patient mapping, read from a TSV under $GENOMICS_DATA_BASE.
patientTubeidMappingDf = pd.read_csv(
    Path(os.environ['GENOMICS_DATA_BASE']) / 'patient_tube_id_mapping_full.tsv',
    sep='\t',
)
patientTubeidMappingDf

Unnamed: 0,tube_code,PATIENT_ID,db_ID,pt_age,pt_gender,EPISODE_ID,hospital_admission,hospital_discharge,hospital_in_last_year,genome_species,species_reported,contig_number,length,tube_code_duplicate_or_old,date_of_collection,ID_number,location_additional_02,collected_from_original
0,AH19J072,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Escherichia coli,Escherichia coli,92,5131021,,2019-10-30,19-303-0997,A-7EA;HAEM,Blood
1,AH19J074,213972.0,3JNQS,64,Male,12931474.0,2019-10-17,2019-11-29,yes,Streptococcus sp. D19,Streptococcus mitis group,84,2022991,,2019-10-30,19-303-1174,A-7EA;HAEM,Blood Peripheral
2,AH19B003,526238.0,567AE,85,Male,640261.0,2019-02-04,2019-02-07,no,Escherichia coli,Escherichia coli,93,4942915,,2019-02-03,19-034-1596,A-4WB;GMC,Blood
3,AH20A024,788941.0,3YBDS,58,Female,13194824.0,2020-01-16,2020-01-19,no,Escherichia coli,Escherichia coli,110,5149157,,2020-01-16,20-016-2112,S-ED;Emergency,Blood Venous
4,AH20L041,1023735.0,2VJ78,42,Male,14210147.0,2020-12-18,2021-01-05,yes,Staphylococcus aureus,Staphylococcus aureus,46,2735216,,2020-12-16,20-351-2856,Emergency Dept;nan,Blood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3112,ALF22L076,639020.0,3S6HL,84,Male,17219899.0,2022-12-15,2022-12-20,yes,Streptococcus vestibularis,Streptococcus salivarius group,107,1938627,,2022-12-15,22-349-2150,Ward 4WB,Blood Peripheral
3113,ALF22L089,655404.0,6EWBM,67,Female,,,,,Proteus columbae,Proteus vulgaris,152,3972219,,2022-12-17,22-351-1465,Comm/Amb Clinic,Blood
3114,ALF22L138,2572048.0,YM5PJ,61,Male,,,,,Enterobacter roggenkampii,Enterobacter cloacae complex,66,4876179,,2022-12-27,22-361-0202,Ward 7 East,Hick White Lum
3115,ALF23A102,2054964.0,5DYAH,35,Female,17294287.0,2023-01-12,2023-02-01,yes,Streptococcus constellatus,Streptococcus anginosus group,54,1869828,,2023-01-11,23-011-3305,ICU,Blood Venous


### Admission information

In [4]:
import os
from pathlib import Path

import pandas as pd


# Directory with the source extracts, resolved under $DATA_DIR.
sourceDirName = Path(os.environ['DATA_DIR']) / 'sepsis_prediction' / 'lstm_initial_trials' / '00_source_files'

# Admission records, read from the dated CSV export.
admissionsDf = pd.read_csv(sourceDirName / '2024-04-03-admissions.csv')
admissionsDf

Unnamed: 0,PATIENT_ID,EPISODE_ID,PARENT_EPISODE_ID,start_date,end_date,PRIMARY_VISIT_REASON,E_LOS,H_LOS,DATEOFDEATH_DATETIME,ADMIT_TYPE,ADMITTING_WARD,EPISODE_ORGANISM_IDENTIFIED,ORGANISM,Organism_FIRST_NOTED,Organism_LAST_NOTED
0,2141606,8800,8800,2013-08-31 10:14:00.000,2013-08-31 15:04:00.000,DIALYSIS,5.00,5.00,2020-07-10 14:00:00.000,Planned Admission,A-CGD - Alfred/Caulfield Haemodialysis,85662,Staphylococcus aureus (MRSA),2014-08-31 08:45:41.000,2014-09-12 20:20:38.000
1,2141606,8800,8800,2013-08-31 10:14:00.000,2013-08-31 15:04:00.000,DIALYSIS,5.00,5.00,2020-07-10 14:00:00.000,Planned Admission,A-CGD - Alfred/Caulfield Haemodialysis,9984732,Staphylococcus aureus (MRSA),2014-08-31 08:17:49.000,2014-08-31 08:17:49.000
2,2141606,9467,9467,2013-09-03 13:30:00.000,2013-09-03 17:36:00.000,DIALYSIS,4.00,4.00,2020-07-10 14:00:00.000,Planned Admission,A-CGD - Alfred/Caulfield Haemodialysis,85662,Staphylococcus aureus (MRSA),2014-08-31 08:45:41.000,2014-09-12 20:20:38.000
3,2141606,9467,9467,2013-09-03 13:30:00.000,2013-09-03 17:36:00.000,DIALYSIS,4.00,4.00,2020-07-10 14:00:00.000,Planned Admission,A-CGD - Alfred/Caulfield Haemodialysis,9984732,Staphylococcus aureus (MRSA),2014-08-31 08:17:49.000,2014-08-31 08:17:49.000
4,2141606,9871,9871,2013-09-05 11:00:00.000,2013-09-06 18:16:00.000,CONSTIPATION & BLOATING - PHX UMBILICAL HERNIA,31.00,31.00,2020-07-10 14:00:00.000,Admission from ED,A-CC - Emergency Dept Cubicles,85662,Staphylococcus aureus (MRSA),2014-08-31 08:45:41.000,2014-09-12 20:20:38.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216684,642636,18390389,18390389,2024-01-17 11:45:00.000,2024-01-24 23:59:59.000,,,,,,,18696838,Burkholderia cepacia complex,2024-02-28 15:30:06.000,2024-02-28 15:30:06.000
216685,642636,18406829,18406829,2024-02-27 15:45:00.000,2024-03-05 23:59:59.000,4 Month Review,,,,,,18696838,Burkholderia cepacia complex,2024-02-28 15:30:06.000,2024-02-28 15:30:06.000
216686,642636,18690618,18690618,2024-02-27 12:30:00.000,2024-03-05 23:59:59.000,,,,,,,18696838,Burkholderia cepacia complex,2024-02-28 15:30:06.000,2024-02-28 15:30:06.000
216687,642636,18696838,18696838,2024-02-28 08:28:23.000,2024-02-28 15:08:00.000,Chest pain NEC,6.67,6.67,,,,18696838,Burkholderia cepacia complex,2024-02-28 15:30:06.000,2024-02-28 15:30:06.000


### Merge data

In [9]:
# Attach each patient's recorded date of death to every tube collected from them.
# The inner join keeps only tubes whose PATIENT_ID also appears in the admissions data.
mergedDf = pd.merge(
    patientTubeidMappingDf[['tube_code', 'PATIENT_ID', 'date_of_collection']].drop_duplicates(),
    admissionsDf[['PATIENT_ID', 'DATEOFDEATH_DATETIME']].drop_duplicates(),
    how='inner',
    on=['PATIENT_ID'],
)

# Parse both date columns so they can be subtracted.
mergedDf = mergedDf.assign(
    DATEOFDEATH_DATETIME=pd.to_datetime(mergedDf['DATEOFDEATH_DATETIME'], format='%Y-%m-%d %H:%M:%S.%f'),
    date_of_collection=pd.to_datetime(mergedDf['date_of_collection'], format='%Y-%m-%d'),
)

# Flag = 1 when death falls 0-29 whole days after collection, else 0.
# A missing date of death gives a NaN day count, which is outside the range and so maps to 0.
mergedDf['death_30_days'] = (
    (mergedDf['DATEOFDEATH_DATETIME'] - mergedDf['date_of_collection'])
    .dt.days
    .between(0, 29)
    .astype(int)
)

mergedDf = mergedDf.drop(columns=['date_of_collection', 'DATEOFDEATH_DATETIME'])
mergedDf

Unnamed: 0,tube_code,PATIENT_ID,death_30_days
0,AH19J072,213972.0,0
1,AH19J074,213972.0,0
2,AH19B003,526238.0,0
3,AH20A024,788941.0,0
4,AH20L041,1023735.0,0
...,...,...,...
3076,ALF22K143,2638484.0,0
3077,ALF22L076,639020.0,1
3078,ALF22L089,655404.0,0
3079,ALF23A102,2054964.0,0


In [10]:
# Class balance of the 30-day mortality flag.
mergedDf['death_30_days'].value_counts()

death_30_days
0    2663
1     418
Name: count, dtype: int64

### Gene sequences

In [24]:
import os
from pathlib import Path

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


fastaData = {}
gff3Path = Path(os.environ['GENOMICS_DATA_BASE'], 'annotations', 'e_coli', 'gff3_original')

# Tube codes flagged death_30_days == 0, built once as a set so the per-file
# membership test does not re-filter the frame on every iteration.
survivedTubeCodes = set(mergedDf[mergedDf.death_30_days == 0].tube_code.values)

# sorted() makes the record order (and therefore the written FASTA) reproducible;
# os.listdir returns entries in arbitrary order.
for gff3FileName in sorted(os.listdir(gff3Path)):

    # The part of the file name before the first '.' is matched against tube_code.
    tubeCode = gff3FileName.split('.')[0]
    if tubeCode not in survivedTubeCodes:
        continue

    gff3File = Path(gff3Path, gff3FileName)
    with open(gff3File, 'r') as in_handle:
        gff3Lines = in_handle.readlines()

    # Locate the line after the "##FASTA" marker that separates annotations from sequence.
    fasta_start_index = None
    for i, line in enumerate(gff3Lines):
        if line.strip() == "##FASTA":
            fasta_start_index = i + 1
            break

    # Without the marker there is no embedded sequence to slice; skip the file
    # instead of trying to parse the annotation rows as FASTA.
    if fasta_start_index is None:
        print(f'Skipping {gff3FileName}: no ##FASTA section found')
        continue

    seq_id, strand, start, end = getGeneLocations(gff3Lines=gff3Lines, fasta_start_index=fasta_start_index, geneName='fhuB')
    if seq_id is None or not start or not end:
        continue

    seq = getSequenceData(gff3Lines=gff3Lines, fasta_start_index=fasta_start_index, seqId=str(seq_id), strand=strand, startLocation=start, endLocation=end)
    # SeqRecord expects a Seq object; normalise in case a plain string was returned.
    updatedRecord = SeqRecord(Seq(str(seq)), id=tubeCode, name=tubeCode, description=tubeCode)
    fastaData[tubeCode] = updatedRecord



## Write FASTA file for the `fhuB` gene

In [25]:
import os
from pathlib import Path

from Bio import SeqIO


# Output folder for the extracted gene sequences; created on first run.
saveDirPath = Path(os.environ['GENOMICS_DATA_BASE']) / 'annotations' / 'e_coli' / 'gene_sequences' / 'fhub_gene_survived'
saveDirPath.mkdir(parents=True, exist_ok=True)

# SeqIO.write returns the number of records written, shown as the cell output.
SeqIO.write(fastaData.values(), saveDirPath / 'sequence.fna', 'fasta')

749