# Create data to perform Multiple Sequence Alignment for the Staphylococcus aureus cohort (sampled)

## Function to obtain gene locations

In [1]:
def _extractGeneName(attributes):
    """Return the value of the ``gene`` tag in a GFF3 attributes string, or None.

    Non-string input (e.g. a missing/NaN attributes cell) and tags without
    an ``=`` are ignored instead of raising.
    """
    if not isinstance(attributes, str):
        return None
    for item in attributes.split(';'):
        parts = item.split('=')
        if len(parts) > 1 and parts[0] == 'gene':
            return parts[1]
    return None


def getGeneLocations(gff3Lines, fasta_start_index, geneName):
    """Locate the first annotated feature whose ``gene`` attribute equals geneName.

    Args:
        gff3Lines: Lines of a GFF3 file, as returned by ``readlines()``.
        fasta_start_index: Index of the first line after the annotation
            section; only ``gff3Lines[:fasta_start_index]`` is parsed.
        geneName: Exact value of the ``gene`` attribute to look for.

    Returns:
        ``(seq_id, start, end)`` taken from the first matching row, or
        ``(None, None, None)`` when there is no matching row or no feature
        rows at all.
    """

    import pandas as pd
    from io import StringIO

    # Drop directive/comment lines by hand rather than with read_csv(comment='#'):
    # pandas treats '#' anywhere in a line as a comment start, which would cut
    # off attributes (and a later gene= tag) whenever a value contains '#'.
    featureLines = [
        line for line in gff3Lines[: fasta_start_index]
        if line.strip() and not line.startswith('#')
    ]
    if not featureLines:
        # read_csv raises on empty input; report "not found" instead
        return None, None, None

    gff3Info = StringIO("".join(featureLines))

    df = pd.read_csv(gff3Info, sep='\t', header=None, names=['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])
    df['gene'] = df.attributes.apply(_extractGeneName)

    geneDf = df[df.gene == geneName]

    if geneDf.empty:
        return None, None, None

    row = geneDf.iloc[0]
    return row.seq_id, row.start, row.end


## Function to obtain sequence of a gene

In [2]:
def getSequenceData(gff3Lines, fasta_start_index, seqId, startLocation, endLocation):
    """Return the sub-record of one FASTA entry between two coordinates.

    Args:
        gff3Lines: Lines of a GFF3 file, as returned by ``readlines()``.
        fasta_start_index: Index of the first FASTA line;
            ``gff3Lines[fasta_start_index:]`` is parsed as FASTA.
        seqId: Id of the FASTA record to slice.
        startLocation: First position to include (converted to a 0-based
            slice start by subtracting 1).
        endLocation: Last position to include.

    Returns:
        The slice ``[startLocation - 1:endLocation]`` of the record whose id
        is seqId.

    Raises:
        KeyError: If no parsed record has the id seqId.

    NOTE(review): the strand of the feature is not taken into account here;
    confirm whether minus-strand genes need reverse-complementing upstream.
    """

    from io import StringIO
    from Bio import SeqIO

    fastaText = "".join(gff3Lines[fasta_start_index:])
    parsedRecords = SeqIO.parse(StringIO(fastaText), "fasta")
    recordsById = SeqIO.to_dict(parsedRecords)

    contigRecord = recordsById[seqId]
    sliceStart = startLocation - 1
    return contigRecord[sliceStart:endLocation]

## Read all tube codes

In [28]:
import os
from pathlib import Path

from Bio.SeqRecord import SeqRecord


# Directory holding one GFF3 annotation file per isolate
gff3Path = Path(os.environ['GENOMICS_DATA_BASE'], 'annotations', 's_aureus', 'gff3_original_v1')

# A tube code is the file name up to its first '.'
tubeCodes = [gff3FileName.split('.')[0] for gff3FileName in os.listdir(gff3Path)]

len(tubeCodes)

557

## Read mortality data

In [25]:
import os
from pathlib import Path
import pandas as pd


# Load the mortality labels table (one row per tube_code with death_*_day flags)
mortalityDf = pd.read_csv(
    Path(os.environ['EHR_DATA_BASE']) / 'data' / 'full_cohort' / 'tube_id_mortality_labels.csv'
)
mortalityDf

Unnamed: 0,PATIENT_ID,tube_code,death_7_day,death_14_day,death_30_day
0,18849.0,AH19I003,0,0,0
1,41308.0,AH21E085,0,0,0
2,49412.0,ALF22B136,0,0,0
3,50056.0,AH20B011,0,0,0
4,51211.0,ALF23D002,0,0,0
...,...,...,...,...,...
3079,2680034.0,ALF23C129,1,1,1
3080,2680486.0,ALF23C135,0,0,0
3081,2681377.0,ALF23C174,0,0,0
3082,2682433.0,ALF23C184,0,0,0


In [29]:
# Keep only the mortality rows whose tube code was found among the annotation files
filteredMortalityDf = mortalityDf.loc[mortalityDf['tube_code'].isin(tubeCodes)]
filteredMortalityDf

Unnamed: 0,PATIENT_ID,tube_code,death_7_day,death_14_day,death_30_day
1,41308.0,AH21E085,0,0,0
9,56749.0,AH21G070,0,0,0
13,64921.0,AH20I016,0,0,0
15,77432.0,AH21A081,0,0,0
16,79270.0,ALF22L085,0,0,0
...,...,...,...,...,...
3065,2669870.0,ALF23A119,0,0,0
3072,2676010.0,ALF23C009,0,0,0
3073,2676371.0,ALF23C055,0,0,0
3078,2679272.0,ALF23C124,0,0,0


## Create data subset

In [30]:
# Build a class-balanced subset: every unique 30-day-mortality positive plus an
# equally sized random draw (fixed seed) of unique negatives.
positiveMortalityDf = filteredMortalityDf[(filteredMortalityDf.death_30_day == 1)][['tube_code', 'death_30_day']].drop_duplicates()
# De-duplicate the negatives before sampling, as is done for the positives, so
# the same tube code cannot be drawn twice (sequences are later keyed by tube
# code, so a repeated code would silently shrink the negative class).
negativeMortalityDf = (
    filteredMortalityDf[(filteredMortalityDf.death_30_day == 0)][['tube_code', 'death_30_day']]
    .drop_duplicates()
    .sample(n=positiveMortalityDf.shape[0], random_state=42)
)
sampledMortalityDf = pd.concat([positiveMortalityDf, negativeMortalityDf], ignore_index=True)
sampledMortalityDf

Unnamed: 0,tube_code,death_30_day
0,AH21B002,1
1,ALF22L090,1
2,AH21L073,1
3,AH21C022,1
4,AH18J081,1
...,...,...
119,ALF22E035,0
120,AH19J068,0
121,AH19L053,0
122,ALF23A119,0


In [39]:
# Check the class balance of the sampled subset
sampledMortalityDf['death_30_day'].value_counts()

death_30_day
1    62
0    62
Name: count, dtype: int64

## Read essC gene sequences

In [41]:
import os
from pathlib import Path

from Bio.SeqRecord import SeqRecord


# Extract the essC sequence of every sampled isolate into fastaData, keyed by tube code.
# NOTE(review): getGeneLocations does not return the strand, so sequences are
# taken as they appear in the contig — confirm whether minus-strand hits need
# reverse-complementing before alignment.
fastaData = {}
gff3Path = Path(os.environ['GENOMICS_DATA_BASE'], 'annotations', 's_aureus', 'gff3_original_v1')

for tubeCode in sampledMortalityDf['tube_code']:

    gff3FileName = tubeCode + '.gff3'
    gff3File = Path(gff3Path, gff3FileName)

    with open(gff3File, 'r') as in_handle:
        gff3Lines = in_handle.readlines()

    # The embedded FASTA section starts on the line after the "##FASTA" directive
    fasta_start_index = next(
        (lineNumber + 1 for lineNumber, line in enumerate(gff3Lines) if line.strip() == "##FASTA"),
        None,
    )
    if fasta_start_index is None:
        # Without a FASTA section there is no sequence to slice
        print('tubeCode failed:', tubeCode)
        continue

    seq_id, start, end = getGeneLocations(gff3Lines=gff3Lines, fasta_start_index=fasta_start_index, geneName='essC')
    # Compare against None explicitly: a numeric seq_id of 0 is a valid hit
    if seq_id is None or start is None or end is None:
        print('tubeCode failed:', tubeCode)
        continue

    record = getSequenceData(gff3Lines=gff3Lines, fasta_start_index=fasta_start_index, seqId=str(seq_id), startLocation=start, endLocation=end)
    # Re-label the record with the tube code so each isolate is identifiable in the output
    fastaData[tubeCode] = SeqRecord(record.seq, id=tubeCode, name=tubeCode, description=tubeCode)

tubeCode failed: AH19C070


## Write fasta file

In [38]:
import os
from pathlib import Path

from Bio import SeqIO


# Destination of the sampled essC sequences
essCFastaPath = Path(os.environ['GENOMICS_DATA_BASE'], 'annotations', 's_aureus', 'gene_sequences', 'essC', 'sample', 'sequences.fna')
# Create the target directory if needed so the write does not fail on a fresh data tree
essCFastaPath.parent.mkdir(parents=True, exist_ok=True)

SeqIO.write(fastaData.values(), essCFastaPath, 'fasta')

123