# Create data to perform Multiple Sequence Alignment for Staphylococcus aureus cohort

## Function to obtain gene locations

In [5]:
def getGeneLocations(gff3Lines, fasta_start_index, geneName):
    """Locate the first annotation row carrying a given ``gene`` attribute.

    Parameters
    ----------
    gff3Lines : list of str
        Lines of a GFF3 file, line terminators included.
    fasta_start_index : int or None
        Only ``gff3Lines[:fasta_start_index]`` is parsed as annotation rows;
        ``None`` parses every line.
    geneName : str
        Value of the ``gene`` attribute to search for (exact match).

    Returns
    -------
    tuple
        ``(seq_id, strand, start, end)`` taken from the first matching row, or
        ``(None, None, None, None)`` when no row has that gene name.
    """

    import pandas as pd
    from io import StringIO

    def geneAttribute(attributes):
        # A row without an attributes column is read by pandas as NaN (a float),
        # which cannot be split; treat it as "no gene name".
        if not isinstance(attributes, str):
            return None
        for item in attributes.split(';'):
            # Split on the first '=' only, so the value is kept whole.
            key, _, value = item.partition('=')
            if key == 'gene':
                return value
        return None

    gff3Info = StringIO("".join(gff3Lines[: fasta_start_index]))

    columnNames = ['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
    # comment='#' drops the '##' directive lines (including the '##FASTA' marker itself).
    df = pd.read_csv(gff3Info, sep='\t', comment='#', header=None, names=columnNames)
    df['gene'] = df['attributes'].map(geneAttribute)

    geneDf = df[df.gene == geneName]

    if geneDf.empty:
        return None, None, None, None

    # Several rows may share the gene name; the first one in file order is used.
    row = geneDf.iloc[0]
    return row.seq_id, row.strand, row.start, row.end

## Function to obtain sequence of a gene

In [13]:
def getSequenceData(gff3Lines, fasta_start_index, seqId, strand, startLocation, endLocation):
    """Extract a region from the FASTA section embedded in a GFF3 file.

    Parameters
    ----------
    gff3Lines : list of str
        Lines of a GFF3 file, line terminators included.
    fasta_start_index : int
        Index of the first FASTA line; ``gff3Lines[fasta_start_index:]`` is
        parsed as FASTA.
    seqId : str
        Identifier of the FASTA record to slice.
    strand : str
        ``'+'`` returns the region as stored; ``'-'`` returns its
        upper-cased reverse complement.
    startLocation, endLocation : int
        Region bounds, used as 1-based inclusive coordinates.

    Returns
    -------
    Bio.Seq.Seq or str
        The extracted sequence, or ``''`` when ``strand`` is neither ``'+'``
        nor ``'-'``.

    Raises
    ------
    KeyError
        If ``seqId`` is not among the parsed FASTA records.
    """

    from io import StringIO
    from Bio import SeqIO

    fasta_data = StringIO("".join(gff3Lines[fasta_start_index:]))
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_data, "fasta"))

    if strand not in ('+', '-'):
        return ''

    # Shift the 1-based inclusive start to Python's 0-based, end-exclusive slicing.
    region = seq_dict[seqId][startLocation - 1:endLocation].seq

    if strand == '+':
        return region

    # Biopython's reverse_complement also handles N and other IUPAC ambiguity
    # codes, which a plain A/C/G/T lookup table rejects with a KeyError.
    # The result is upper-cased to keep the previous output for this strand.
    return region.reverse_complement().upper()

## Read `acrF` gene sequences

In [14]:
import os
from pathlib import Path

from Bio.SeqRecord import SeqRecord


# acrF records collected so far, keyed by sample name
# (the GFF3 file name up to its first '.').
fastaData = {}
gff3Path = Path(os.environ['GENOMICS_DATA_BASE'], 'annotations', 'e_coli', 'gff3_original')

# os.listdir returns entries in arbitrary order; sorting keeps the record
# order of the resulting FASTA reproducible between runs.
for gff3FileName in sorted(os.listdir(gff3Path)):

    gff3File = Path(gff3Path, gff3FileName)
    if not gff3File.is_file():
        # Sub-directories and other non-file entries cannot be read as GFF3.
        continue

    with open(gff3File, 'r') as in_handle:
        gff3Lines = in_handle.readlines()

    # Index of the first line after the '##FASTA' marker, or None if absent.
    fasta_start_index = next(
        (i + 1 for i, line in enumerate(gff3Lines) if line.strip() == "##FASTA"),
        None,
    )
    if fasta_start_index is None:
        # Without the marker there is no embedded sequence to slice, and a
        # None index would hand the whole GFF3 text to the FASTA parser.
        continue

    seq_id, strand, start, end = getGeneLocations(gff3Lines=gff3Lines, fasta_start_index=fasta_start_index, geneName='acrF')
    if not (seq_id and start and end):
        # Gene not annotated in this sample.
        continue

    seq = getSequenceData(gff3Lines=gff3Lines, fasta_start_index=fasta_start_index, seqId=str(seq_id), strand=strand, startLocation=start, endLocation=end)
    sampleId = gff3FileName.split('.')[0]
    updatedRecord = SeqRecord(seq, id=sampleId, name=sampleId, description=sampleId)
    fastaData[sampleId] = updatedRecord



## Write fasta file for `acrF` gene

In [15]:
import os
from pathlib import Path

from Bio import SeqIO


# Output directory: $GENOMICS_DATA_BASE/annotations/e_coli/gene_sequences
# (created together with any missing parents).
saveDirPath = Path(os.environ['GENOMICS_DATA_BASE']) / 'annotations' / 'e_coli' / 'gene_sequences'
saveDirPath.mkdir(parents=True, exist_ok=True)

# Left as a bare expression so the notebook displays SeqIO.write's return value.
SeqIO.write(fastaData.values(), saveDirPath / 'acrF.fna', 'fasta')

857