In [1]:
import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os

In [7]:
# Sanity check: report the length of every record parsed from the
# Malaysian-strain genome FASTA file.
for seq_record in SeqIO.parse(
    "sequences/genome/AY029768.1_MYS.fasta",
    "fasta",
):
    print(f"Malaysian strain genome size: {len(seq_record)}")

Malaysian strain genome size: 18246


In [6]:
# Sanity check: report the length of every record parsed from the
# Bangladeshi-strain genome FASTA file.
# NOTE(review): the printed label spells the strain "Bangaldeshi"; it is left
# unchanged here so the cell keeps emitting the same text.
for seq_record in SeqIO.parse(
    "sequences/genome/AY988601.1_BGD.fasta",
    "fasta",
):
    print(f"Bangaldeshi strain genome size: {len(seq_record)}")

Bangaldeshi strain genome size: 18252


# 1. Single nucleotide polymorphism (SNP) calling

## End goal is to convert the FASTA file into a VCF (Variant Call Format) file

<b>Amazing tutorial:</b> https://www.ebi.ac.uk/sites/ebi.ac.uk/files/content.ebi.ac.uk/materials/2014/140217_AgriOmics/dan_bolser_snp_calling.pdf

Adapted the following steps from it

<b>Index the genome file</b>
```bash
bwa index sequences/genome/AY029768.1_MYS.fasta
```

<b>Perform the alignment</b>
```bash
bwa aln sequences/genome/AY029768.1_MYS.fasta sequences/P_G_proteins/P_seqs.fasta > alignments/P_MYS_aln.sai
```

<b>Convert to SAM file format, which is human-readable</b>
```bash
bwa samse sequences/genome/AY029768.1_MYS.fasta alignments/P_MYS_aln.sai sequences/P_G_proteins/P_seqs.fasta > alignments/P_MYS_aln.sam
```

<b>Convert SAM to BAM and sort the BAM file</b>
```bash
   samtools view -S -b alignments/P_MYS_aln.sam > alignments/P_MYS_aln.bam
   samtools sort alignments/P_MYS_aln.bam -o alignments/P_MYS_aln_sorted.bam
```

<b>Index the genome file again with `samtools`</b>
```bash
samtools faidx sequences/genome/AY029768.1_MYS.fasta
```

<b>Run 'mpileup' to generate VCF format</b>
```bash
bcftools mpileup -f sequences/genome/AY029768.1_MYS.fasta alignments/P_MYS_aln_sorted.bam > alignments/P_MYS_aln.bcf
```

<b>Call SNPs</b>
```bash
bcftools view -v snps alignments/P_MYS_aln.bcf > alignments/P_MYS_SNPs.vcf
```

79 phosphoprotein sequences, 81 glycoprotein sequences.

Two glycoprotein sequences did not align to either genome. 

# 2. Make trees using the G and P CDSs

```bash
fasttree -nt sequences/P_G_proteins/G_seqs.fasta > trees/G_cds.nwk
fasttree -nt sequences/P_G_proteins/P_seqs.fasta > trees/P_cds.nwk
```

# BLAST the glycoprotein sequence

83 aligned sequences stored at `glyco_aln.fasta`

Descriptions stored at `glyco_blast_descriptions.csv`. Need this to get the isolate locations and other metadata.

Full sequences stored at `glyco_blast_83.fasta`


# BLAST the phosphoprotein sequence

83 aligned sequences stored at `phospho_aln.fasta`

Descriptions stored at `phospho_blast_descriptions.csv`. Need this to get the isolate locations and other metadata.

Full sequences stored at `phospho_blast_80.fasta`

# Check lengths of sequences in the alignment files

In [86]:
def read_fasta(fName):
    """Read a FASTA file into a name-sorted dict of sequences plus record lengths.

    Each record is keyed by the part of its ID that precedes the first
    underscore (the whole ID when it contains no underscore). A one-line
    summary with the number of parsed records is printed.

    Parameters
    ----------
    fName : str or path-like
        Path of the FASTA file; handed to ``SeqIO.parse`` with format "fasta".

    Returns
    -------
    tuple
        ``(seqs_by_name, lengths)``.

        * ``seqs_by_name`` -- dict mapping name -> ``seq_record.seq``, with
          keys in sorted order.
        * ``lengths`` -- NumPy array holding ``len(seq_record)`` for every
          parsed record, in *file* order. It is therefore not aligned with the
          sorted dict, and it still counts records that were dropped from the
          dict because of a name collision.

    Warns
    -----
    UserWarning
        When two or more records reduce to the same name. Only the last such
        record is kept in the dict, so the dict is smaller than the printed
        record count.
    """
    import warnings

    lengths = []
    names = []
    seqs = []

    for seq_record in SeqIO.parse(fName, "fasta"):
        # str.split returns the whole ID as element 0 when there is no "_",
        # so no separate membership check is needed.
        names.append(seq_record.id.split("_")[0])
        seqs.append(seq_record.seq)
        lengths.append(len(seq_record))

    print(f"Found {len(seqs)} sequences in {os.path.basename(fName)}")

    # Later records overwrite earlier ones that share a name.
    res_dict = dict(zip(names, seqs))

    n_dropped = len(names) - len(res_dict)
    if n_dropped:
        # Surface the otherwise silent loss of sequences.
        warnings.warn(
            f"{n_dropped} record(s) in {os.path.basename(fName)} share a name "
            "with another record after truncating the ID at the first '_'; "
            "only the last record for each name is kept.",
            stacklevel=2,
        )

    return dict(sorted(res_dict.items(), key=lambda x: x[0])), np.array(lengths)

In [105]:
# Load the glycoprotein (G) and phosphoprotein (P) FASTA files.
# Each name is bound to the full return value of read_fasta.
G, P = (
    read_fasta("sequences/Fasta/G_seqs.fasta"),
    read_fasta("sequences/Fasta/P_seqs.fasta"),
)

Found 81 sequences in G_seqs.fasta
Found 79 sequences in P_seqs.fasta
