In [27]:
import pandas as pd
from Bio import SeqIO

In [18]:
# Sequential (chimeric) numbering runs from the first ectodomain codon (site 20)
# through the last codon before the stop codon (site 569).
FIRST_SEQUENTIAL_SITE = 20
LAST_SEQUENTIAL_SITE = 569

# h3 numbering is chimeric sequential numbering minus this offset
SEQUENTIAL_TO_H3_OFFSET = 19

# HA1 is numbered 1-329; HA2 numbering restarts at 1 and is labelled (HA2)1-(HA2)221
HA1_LAST_SITE = 329
HA2_LAST_SITE = 221

# sites in sequential chimeric numbering
seq_chimera = list(range(FIRST_SEQUENTIAL_SITE, LAST_SEQUENTIAL_SITE + 1))

# sites in h3 numbering: plain ints for HA1, then '(HA2)'-prefixed strings for HA2
h3 = list(range(FIRST_SEQUENTIAL_SITE - SEQUENTIAL_TO_H3_OFFSET, HA1_LAST_SITE + 1))
h3.extend(f'(HA2){ha2_site}' for ha2_site in range(1, HA2_LAST_SITE + 1))

# both numbering schemes must have one entry per site so the columns line up
assert len(seq_chimera) == len(h3), (
    f'numbering schemes differ in length: '
    f'{len(seq_chimera)} sequential sites vs {len(h3)} h3 sites'
)

In [22]:
# pair each chimeric sequential site with its h3 site, one row per site
site_numbering_map = pd.DataFrame(
    dict(chimeric_sequential_site=seq_chimera, h3_site=h3)
)

Now we need to add the wild-type (WT) amino acid identity at each site, pulled from the `hk-45-2019.fasta` file.

In [30]:
# preview the chimeric-sequential -> h3 numbering map built so far
site_numbering_map

Unnamed: 0,chimeric_sequential_site,h3_site
0,20,1
1,21,2
2,22,3
3,23,4
4,24,5
...,...,...
545,565,(HA2)217
546,566,(HA2)218
547,567,(HA2)219
548,568,(HA2)220


In [39]:
# number of translated residues that precede site 1 in h3 numbering
N_RESIDUES_BEFORE_H3_SITE_1 = 16

# read in just the first entry of the sequence file
wt_nts = next(SeqIO.parse('hk-45-2019.fasta', 'fasta'))

# translate from nucleotide sequence
wt_seq = wt_nts.seq.translate()

# cut to start at site 1 in h3 numbering, and cut off the final translated
# position (the stop codon)
wt_seq_h3 = wt_seq[N_RESIDUES_BEFORE_H3_SITE_1:-1]

# one single-letter amino acid per site, in h3 order
wt_h3_aa_list = list(wt_seq_h3)

# make sure it's the correct length for df
assert len(wt_h3_aa_list) == len(h3), (
    f'WT sequence has {len(wt_h3_aa_list)} residues after trimming, '
    f'but the numbering map has {len(h3)} sites'
)

In [45]:
# final site map: sequential site, its reference (h3) site, and the WT amino acid there
site_map_columns = {
    'sequential_site': seq_chimera,
    'reference_site': h3,
    'reference_wt': wt_h3_aa_list,
}
site_numbering_map = pd.DataFrame(site_map_columns)

In [46]:
# preview the finished map, now including the WT amino acid at each site
site_numbering_map

Unnamed: 0,sequential_site,reference_site,reference_wt
0,20,1,Q
1,21,2,K
2,22,3,I
3,23,4,P
4,24,5,G
...,...,...,...
545,565,(HA2)217,C
546,566,(HA2)218,N
547,567,(HA2)219,I
548,568,(HA2)220,C


In [47]:
# save the site map; the positional row index is left out of the file
site_numbering_map.to_csv(path_or_buf='site_map.csv', index=False)