In [11]:
import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os

In [12]:
# Sanity check on the Malaysian strain genome: parse the FASTA file and print
# the length of each record it contains (one printed line per record).
for seq_record in SeqIO.parse("sequences/genomes/AY029768.1_MYS.fasta", "fasta"):
    
    # len() of the record is reported as the genome size
    print(f"Malaysian strain genome size: {len(seq_record)}")

Malaysian strain genome size: 18246


In [13]:
# Sanity check on the Bangladeshi strain genome: parse the FASTA file and print
# the length of each record it contains (one printed line per record).
for seq_record in SeqIO.parse("sequences/genomes/AY988601.1_BGD.fasta", "fasta"):

    # len() of the record is reported as the genome size
    # (message spelling corrected: "Bangaldeshi" -> "Bangladeshi")
    print(f"Bangladeshi strain genome size: {len(seq_record)}")

Bangaldeshi strain genome size: 18252


```bash
# Build a Bowtie 2 index with basename MYS_cds from the AY029768 (Malaysian strain)
# .fna file -- presumably its coding sequences, going by the file name.
bowtie2-build sequences/genomes/AY029768_1.MYS_CDS.fna MYS_cds

# Same for AY988601 (Bangladeshi strain); index basename is BGD_cds.
bowtie2-build sequences/genomes/AY988601_1.BGD_CDS.fna BGD_cds
```

# BLAST the glycoprotein sequence

83 aligned sequences stored at `glyco_aln.fasta`

Descriptions stored at `glyco_blast_descriptions.csv`. Need this to get the isolate locations and other metadata.

Full sequences stored at `glyco_blast_83.fasta`


# BLAST the phosphoprotein sequence

83 aligned sequences stored at `phospho_aln.fasta`

Descriptions stored at `phospho_blast_descriptions.csv`. Need this to get the isolate locations and other metadata.

Full sequences stored at `phospho_blast_80.fasta`

# Check lengths of sequences in the alignment files

In [86]:
def read_fasta(fName):
    """Read a FASTA file into a name-sorted dict of sequences plus their lengths.

    Each record is keyed by the part of its id before the first underscore
    (the whole id when it contains no underscore).

    Parameters
    ----------
    fName : str
        Path of the FASTA file to parse.

    Returns
    -------
    tuple
        ``(seq_dict, lengths)`` where

        * ``seq_dict`` maps the truncated record name to the record's sequence,
          with keys in ascending (lexicographic) order;
        * ``lengths`` is a NumPy array holding ``len(record)`` for every record
          in *file order*. It is NOT re-ordered to match ``seq_dict`` and it
          keeps one entry per record even when names collide.

    Warns
    -----
    UserWarning
        When two or more records share the same truncated name. Only the last
        such record is kept in ``seq_dict`` (this matches the previous
        behaviour, which dropped the earlier ones silently).

    Side effects
    ------------
    Prints the number of records found and the file's base name.
    """
    # Local import: only the duplicate-name warning needs it.
    import warnings

    lengths = []
    res_dict = {}
    duplicates = set()

    for seq_record in SeqIO.parse(fName, "fasta"):
        # str.split returns the whole id when there is no "_", so no branch is needed.
        name = seq_record.id.split("_")[0]

        if name in res_dict:
            duplicates.add(name)

        # A later record overwrites an earlier one with the same name.
        res_dict[name] = seq_record.seq
        lengths.append(len(seq_record))

    # Count every parsed record, including ones whose names collided.
    print(f"Found {len(lengths)} sequences in {os.path.basename(fName)}")

    if duplicates:
        warnings.warn(
            f"{len(duplicates)} truncated sequence name(s) occur more than once in "
            f"{os.path.basename(fName)}; only the last record of each is kept: "
            f"{sorted(duplicates)}",
            stacklevel=2,
        )

    return dict(sorted(res_dict.items(), key=lambda x: x[0])), np.array(lengths)

In [105]:
# Load the G and P sequence sets. Each result is the tuple returned by
# read_fasta: (name -> sequence dict sorted by name, array of record lengths).
G = read_fasta("sequences/Fasta/G_seqs.fasta")
P = read_fasta("sequences/Fasta/P_seqs.fasta")

Found 81 sequences in G_seqs.fasta
Found 79 sequences in P_seqs.fasta
