In [1]:
import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
from urllib.request import urlopen
import re

In [2]:
# Sanity check: print the length of every record in the Malaysian strain genome FASTA.
# (Alternative input kept for reference: sequences/genome/AY029768.1_MYS.fasta)
malaysian_genome_path = "sequences/genome/NC_00278.1_MYS.fasta"
for seq_record in SeqIO.parse(malaysian_genome_path, "fasta"):
    print(f"Malaysian strain genome size: {len(seq_record)}")

Malaysian strain genome size: 18246


In [3]:
# Sanity check: print the length of every record in the Bangladeshi strain genome FASTA.
bangladeshi_genome_path = "sequences/genome/AY988601.1_BGD.fasta"
for seq_record in SeqIO.parse(bangladeshi_genome_path, "fasta"):
    print(f"Bangaldeshi strain genome size: {len(seq_record)}")

Bangaldeshi strain genome size: 18252


# 1. BLAST the glycoprotein and phosphoprotein sequences to get all sequences from NCBI

Used reference sequences `sequences/PG/ref_G_seq` and `sequences/PG/ref_P_seq` for the glycoprotein and phosphoprotein, respectively. Queried these on BLAST using <i>Henipaviruses</i> as the search organism.

83 full search results for G (could be individual protein, multiple, or full genome) stored at `sequences/PG/glyco_blast_83.fasta`, and 80 full search results for P stored at `sequences/PG/phospho_blast_80.fasta`. Removed duplicates in Geneious.

Descriptions stored at `sequences/glyco_blast_descriptions.csv` and `sequences/phospho_blast_descriptions.csv`.

```bash
bowtie2-build sequences/genome/AY029768.1_MYS_CDS.fna MYS_cds
bowtie2 -x MYS_cds -f sequences/PG/G_seqs.fasta -S G_aln.sam
```

The sequence headers were updated, and the V, W, and C proteins were removed because they are duplicates of the phosphoprotein sequence. <b>This was found to cause problems when aligning the glycoprotein sequences</b>. It makes sense that you shouldn't have redundant sequences in a reference sequence, but I'm not sure why it didn't cause problems for the phosphoprotein alignment. 

In [4]:
# Load the BLAST hit description tables for the glycoprotein and phosphoprotein searches.
glyco_descript, phospho_descript = map(
    pd.read_csv,
    (
        "sequences/glyco_blast_descriptions.csv",
        "sequences/phospho_blast_descriptions.csv",
    ),
)

In [7]:
def read_fasta(fName):
    """Read a FASTA file into a name -> sequence mapping plus the record lengths.

    Each record id is truncated at its first underscore (ids without an
    underscore are used unchanged) and used as the dictionary key.

    Parameters
    ----------
    fName : str
        Path of the FASTA file to parse.

    Returns
    -------
    tuple
        ``(seqs_by_name, lengths)`` where ``seqs_by_name`` is a dict sorted by
        key that maps each trimmed id to the record's ``seq`` attribute, and
        ``lengths`` is a numpy array of ``len(record)`` values. Note that
        ``lengths`` stays in file order, so it is not aligned with the sorted
        dictionary.

    Notes
    -----
    If two records share the same trimmed id, the later record replaces the
    earlier one in the dictionary (while ``lengths`` still has one entry per
    record). A ``UserWarning`` is emitted when that happens so the loss is
    not silent.
    """
    import warnings

    names = []
    seqs = []
    lengths = []

    for seq_record in SeqIO.parse(fName, "fasta"):
        # split("_")[0] returns the whole id when it contains no underscore,
        # so no separate branch is needed for that case.
        names.append(seq_record.id.split("_")[0])
        seqs.append(seq_record.seq)
        lengths.append(len(seq_record))

    print(f"Found {len(seqs)} sequences in {os.path.basename(fName)}")

    # Plain str keys; on a repeated name the last record wins.
    res_dict = dict(zip(names, seqs))

    if len(res_dict) != len(names):
        warnings.warn(
            f"{len(names) - len(res_dict)} record(s) in {os.path.basename(fName)} "
            "share a name with another record after trimming at '_'; "
            "only the last record per name is kept"
        )

    return dict(sorted(res_dict.items(), key=lambda x: x[0])), np.array(lengths)

In [10]:
# Each result is the (name -> sequence dict, lengths array) pair returned by read_fasta.
G, P = (
    read_fasta(fasta_path)
    for fasta_path in ("sequences/PG/G_seqs.fasta", "sequences/PG/P_seqs.fasta")
)

Found 83 sequences in G_seqs.fasta
Found 80 sequences in P_seqs.fasta


In [9]:
# Keys present in G_low[0] but missing from G[0].
# NOTE(review): `G_low` is not defined in any cell visible here, and this cell's
# execution count (9) is lower than the cell above it (10). Presumably G_low was
# produced by a read_fasta call in a since-removed cell -- confirm and restore it,
# otherwise this cell raises NameError on Restart & Run All.
set(G_low[0].keys()) - set(G[0].keys())

{'JF899340', 'MH891777', 'MN549407', 'MN549410'}

# 2. Single nucleotide polymorphism (SNP) calling

## End goal is to convert the FASTA file into a VCF (Variant Call Format) file

<b>Amazing tutorial:</b> https://www.ebi.ac.uk/sites/ebi.ac.uk/files/content.ebi.ac.uk/materials/2014/140217_AgriOmics/dan_bolser_snp_calling.pdf

## Exact code to run for both P and G aligning to both reference genomes:

```bash
bash align_make_vcf.sh "sequences/genome/AY988601.1_BGD_CDS.fna" "sequences/PG/P_seqs.fasta" "alignments" "P_BGD"
bash align_make_vcf.sh "sequences/genome/NC_00278.1_MYS_CDS.fna" "sequences/PG/P_seqs.fasta" "alignments" "P_MYS"

bash align_make_vcf.sh "sequences/genome/AY988601.1_BGD_CDS.fna" "sequences/PG/G_seqs.fasta" "alignments" "G_BGD"
bash align_make_vcf.sh "sequences/genome/NC_00278.1_MYS_CDS.fna" "sequences/PG/G_seqs.fasta" "alignments" "G_MYS"
```

The above bash script runs the following steps, adapted from the linked tutorial above:

<b>Index the reference genome with `bwa`</b>
```bash
bwa index sequences/genome/AY029768.1_MYS_CDS.fna
```

<b>Perform the alignment</b>
```bash
bwa aln sequences/genome/AY029768.1_MYS_CDS.fna sequences/PG/P_seqs.fasta > alignments/P_MYS_aln.sai
```

<b>Convert to SAM file format, which is human-readable</b>
```bash
bwa samse sequences/genome/AY029768.1_MYS_CDS.fna alignments/P_MYS_aln.sai sequences/PG/P_seqs.fasta > alignments/P_MYS_aln.sam
```

<b>Convert SAM to BAM and sort the BAM file</b>
```bash
samtools view -b alignments/P_MYS_aln.sam > alignments/P_MYS_aln.bam
samtools sort alignments/P_MYS_aln.bam -o alignments/P_MYS_aln_sorted.bam
```

<b>Index the genome file again with `samtools`</b>
```bash
samtools faidx sequences/genome/AY029768.1_MYS_CDS.fna
```

<b>Run 'mpileup' to generate VCF format</b>
```bash
bcftools mpileup -f sequences/genome/AY029768.1_MYS_CDS.fna alignments/P_MYS_aln_sorted.bam > alignments/P_MYS_aln.bcf
```

<b>Call SNPs</b>
```bash
bcftools view -v snps alignments/P_MYS_aln.bcf > alignments/P_MYS_SNPs.vcf
```

# 3. Make trees using the G and P CDS's

```bash
fasttree -nt sequences/PG/G_seqs.fasta > trees/G_cds.nwk
fasttree -nt sequences/PG/P_seqs.fasta > trees/P_cds.nwk
```

Another reference: https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/01_alignment.html

# 4. Deduplicate and remake trees -- use all sequences (83 G, 80 P)

In [20]:
# Load every record as an (id, sequence string) pair, in file order.
G_seqs = [
    (record.id, str(record.seq))
    for record in SeqIO.parse("sequences/PG/G_seqs.fasta", "fasta")
]
P_seqs = [
    (record.id, str(record.seq))
    for record in SeqIO.parse("sequences/PG/P_seqs.fasta", "fasta")
]

## More sanity checks

In [22]:
# Pull the id column out of each list of (id, sequence) pairs.
G_ids = np.asarray(tuple(zip(*G_seqs))[0])
P_ids = np.asarray(tuple(zip(*P_seqs))[0])

# Number of ids shared by the G and P sets (78 sequences are the same)
shared_ids = set(G_ids).intersection(P_ids)
print(len(shared_ids))

# Ids found only in G: HM545086, MH891777, JF899340, AF238467, AY858111
g_only_ids = set(G_ids) - set(P_ids)
print(g_only_ids)

# Ids found only in P: HM545087 is P only. MH891774 is P and C only.
# MN549407 and MN549410 were the very low quality G sequences that were removed;
# according to the entries, they are partial genomes isolated from bats,
# which probably explains the low quality.
p_only_ids = set(P_ids) - set(G_ids)
print(p_only_ids)

78
{'AY858111', 'HM545086', 'MH891777', 'AF238467', 'JF899340'}
{'MH891774', 'HM545087'}


In [23]:
# One table per protein: the sequence string in "Seq", the record id in "ID".
seqs_G_df = pd.DataFrame({"Seq": [sequence for _, sequence in G_seqs]})
seqs_G_df["ID"] = G_ids

seqs_P_df = pd.DataFrame({"Seq": [sequence for _, sequence in P_seqs]})
seqs_P_df["ID"] = P_ids

# Both tables must already be in alphabetical order by ID.
assert (seqs_G_df.ID.values == np.sort(seqs_G_df.ID.values)).all()
assert (seqs_P_df.ID.values == np.sort(seqs_P_df.ID.values)).all()

# drop_duplicates keeps only the first occurrence of each sequence, so the pair
# displayed is (number of unique G sequences, number of unique P sequences).
seqs_G_df["Seq"].drop_duplicates().shape[0], seqs_P_df["Seq"].drop_duplicates().shape[0]

(50, 51)

## The next cell writes the unique sequences to new Fasta files

In [24]:
# Keep the first record seen for every distinct sequence.
keep_G = seqs_G_df.drop_duplicates(subset=["Seq"])
keep_P = seqs_P_df.drop_duplicates(subset=["Seq"])

# Write each deduplicated table as FASTA: a ">ID" header line followed by the sequence line.
for fasta_path, unique_records in (
    ("sequences/PG/G_deduplicated.fasta", keep_G),
    ("sequences/PG/P_deduplicated.fasta", keep_P),
):
    with open(fasta_path, "w+") as file:
        for seq_id, sequence in zip(unique_records["ID"], unique_records["Seq"]):
            file.write(f">{seq_id}\n")
            file.write(f"{sequence}\n")

In [25]:
# Read the deduplicated FASTA files back as (id, sequence string) pairs.
G_dedup = [
    (record.id, str(record.seq))
    for record in SeqIO.parse("sequences/PG/G_deduplicated.fasta", "fasta")
]
P_dedup = [
    (record.id, str(record.seq))
    for record in SeqIO.parse("sequences/PG/P_deduplicated.fasta", "fasta")
]

```bash
fasttree -nt sequences/PG/G_deduplicated.fasta > trees/G_dedup.nwk
fasttree -nt sequences/PG/P_deduplicated.fasta > trees/P_dedup.nwk
```

In [26]:
# Record counts read back from the deduplicated files, as (G, P).
tuple(map(len, (G_dedup, P_dedup)))

(50, 51)

In [37]:
# Length of every sequence; the distinct values per protein are printed below.
lengths_G = list(map(len, seqs_G_df.Seq))
lengths_P = list(map(len, seqs_P_df.Seq))

distinct_G_lengths = np.unique(lengths_G)
distinct_P_lengths = np.unique(lengths_P)
print(distinct_G_lengths, distinct_P_lengths)

[1809] [2130]
