In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Sanity check: report the length of every record in the Malaysian (MYS) genome FASTA.
# Alternative input, kept for reference: "sequences/genome/AY029768.1_MYS.fasta"
mys_genome_fasta = "sequences/genome/NC_00278.1_MYS.fasta"
for seq_record in SeqIO.parse(mys_genome_fasta, "fasta"):
    genome_size = len(seq_record)
    print(f"Malaysian strain genome size: {genome_size}")

Malaysian strain genome size: 18246


In [3]:
# Sanity check: report the length of every record in the Bangladeshi (BGD) genome FASTA.
# NOTE(review): "Bangaldeshi" in the message below is a typo for "Bangladeshi"; it is
# left as-is here so the message keeps matching the recorded cell output.
bgd_genome_fasta = "sequences/genome/AY988601.1_BGD.fasta"
for seq_record in SeqIO.parse(bgd_genome_fasta, "fasta"):
    genome_size = len(seq_record)
    print(f"Bangaldeshi strain genome size: {genome_size}")

Bangaldeshi strain genome size: 18252


# 1. BLAST the glycoprotein and phosphoprotein sequences to get all sequences from NCBI

Used reference sequences `sequences/PG/ref_G_seq` and `sequences/PG/ref_P_seq` for the glycoprotein and phosphoprotein, respectively. Queried these on BLAST using <i>Henipaviruses</i> as the search organism. 

83 full search results for G (could be individual protein, multiple, or full genome) stored at `sequences/PG/glyco_blast_83.fasta`, and 80 full search results for P stored at `sequences/PG/phospho_blast_80.fasta`. Removed duplicates in Geneious.

Descriptions stored at `sequences/glyco_blast_descriptions.csv` and `sequences/phospho_blast_descriptions.csv`. Need this to get the isolate locations and other metadata.

```bash
bowtie2-build sequences/genome/AY029768.1_MYS_CDS.fna MYS_cds
bowtie2 -x MYS_cds -f sequences/PG/G_seqs.fasta -S G_aln.sam
```

From the above, found that two glycoprotein sequences did not align to either genome because they have too many ambiguous nucleotides in them. These are <b>MN549407</b> and <b>MN549410</b>. They were removed from the .fasta files.

The sequence headers were updated, and the V, W, and C proteins were removed because they are duplicates of the phosphoprotein sequence. <b>This was found to cause problems when aligning the glycoprotein sequences</b>. It makes sense that you shouldn't have redundant sequences in a reference sequence, but I'm not sure why it didn't cause problems for the phosphoprotein alignment. 

In [4]:
# Load the BLAST description tables (glycoprotein first, then phosphoprotein);
# these carry the per-hit metadata referred to in the notes above.
glyco_descript, phospho_descript = map(
    pd.read_csv,
    (
        "sequences/glyco_blast_descriptions.csv",
        "sequences/phospho_blast_descriptions.csv",
    ),
)

In [5]:
# Display the glycoprotein BLAST description table
glyco_descript

Unnamed: 0,Description,Scientific Name,Max Score,Total Score,Query Cover,E value,Per. ident,Acc. Len,Accession
0,"Nipah virus N gene, P gene, M gene, F gene, G ...",Nipah henipavirus,3341,3341,100%,0.0,100.00,14973,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
1,Nipah virus strain Malaysian glycoprotein (G) ...,Nipah henipavirus,3341,3341,100%,0.0,100.00,1809,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
2,"Nipah henipavirus isolate 808651, complete genome",Nipah henipavirus,3275,3275,100%,0.0,99.34,18231,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
3,"Nipah virus isolate IRF0158, partial genome",Nipah henipavirus,3269,3269,100%,0.0,99.28,18214,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
4,"Nipah virus isolate IRF0160, partial genome",Nipah henipavirus,3269,3269,100%,0.0,99.28,18212,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
...,...,...,...,...,...,...,...,...,...
78,"Nipah henipavirus isolate MCL-19-BAT-572-9, pa...",Nipah henipavirus,1428,2341,87%,0.0,93.29,18132,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
79,"Nipah virus glycoprotein (G) gene, partial cds",Nipah henipavirus,1363,1363,52%,0.0,92.53,954,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
80,"Nipah henipavirus isolate MCL-19-BAT-574-7, pa...",Nipah henipavirus,1214,2088,79%,0.0,92.83,17815,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
81,"Nipah henipavirus isolate MCL-19-BAT-574-5, pa...",Nipah henipavirus,1199,2433,91%,0.0,93.05,18131,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."


In [6]:
# NOTE(review): this displays glyco_descript a second time (same as the cell before it);
# phospho_descript may have been intended here — confirm.
glyco_descript

Unnamed: 0,Description,Scientific Name,Max Score,Total Score,Query Cover,E value,Per. ident,Acc. Len,Accession
0,"Nipah virus N gene, P gene, M gene, F gene, G ...",Nipah henipavirus,3341,3341,100%,0.0,100.00,14973,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
1,Nipah virus strain Malaysian glycoprotein (G) ...,Nipah henipavirus,3341,3341,100%,0.0,100.00,1809,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
2,"Nipah henipavirus isolate 808651, complete genome",Nipah henipavirus,3275,3275,100%,0.0,99.34,18231,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
3,"Nipah virus isolate IRF0158, partial genome",Nipah henipavirus,3269,3269,100%,0.0,99.28,18214,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
4,"Nipah virus isolate IRF0160, partial genome",Nipah henipavirus,3269,3269,100%,0.0,99.28,18212,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
...,...,...,...,...,...,...,...,...,...
78,"Nipah henipavirus isolate MCL-19-BAT-572-9, pa...",Nipah henipavirus,1428,2341,87%,0.0,93.29,18132,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
79,"Nipah virus glycoprotein (G) gene, partial cds",Nipah henipavirus,1363,1363,52%,0.0,92.53,954,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
80,"Nipah henipavirus isolate MCL-19-BAT-574-7, pa...",Nipah henipavirus,1214,2088,79%,0.0,92.83,17815,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."
81,"Nipah henipavirus isolate MCL-19-BAT-574-5, pa...",Nipah henipavirus,1199,2433,91%,0.0,93.05,18131,"=HYPERLINK(""https://www.ncbi.nlm.nih.gov/nucle..."


In [7]:
def read_fasta(fName):
    """Read a FASTA file into a name-sorted ``{name: sequence}`` dictionary.

    A record's name is its ID truncated at the first underscore; IDs that
    contain no underscore are used unchanged.

    Parameters
    ----------
    fName : str
        Path of the FASTA file to parse.

    Returns
    -------
    tuple
        ``(seqs_by_name, lengths)`` where

        * ``seqs_by_name`` is a ``dict`` mapping each name to the record's
          ``.seq``, with keys in sorted order;
        * ``lengths`` is a ``numpy.ndarray`` holding ``len(record)`` for every
          record **in file order** (it is not aligned with the sorted dict).

    Warns
    -----
    UserWarning
        If several records share the same truncated name. As before, the last
        such record wins in the dictionary, but the collision is now reported
        instead of silently dropping sequences.

    Notes
    -----
    Prints the number of records found, which counts every record in the file
    (including any that collide on name).
    """
    names = []
    seqs = []
    lengths = []

    for seq_record in SeqIO.parse(fName, "fasta"):
        # split() returns the whole ID when there is no "_", so no branch is needed
        names.append(seq_record.id.split("_")[0])
        seqs.append(seq_record.seq)
        lengths.append(len(seq_record))

    print(f"Found {len(seqs)} sequences in {os.path.basename(fName)}")

    # Detect names that occur more than once: dict() below keeps only the last one.
    seen = set()
    duplicated = set()
    for name in names:
        if name in seen:
            duplicated.add(name)
        seen.add(name)
    if duplicated:
        warnings.warn(
            f"{os.path.basename(fName)}: {len(duplicated)} name(s) occur more than once "
            f"after truncating IDs at '_' ({', '.join(sorted(duplicated))}); "
            "only the last record for each name is kept",
            stacklevel=2,
        )

    res_dict = dict(zip(names, seqs))

    return dict(sorted(res_dict.items(), key=lambda item: item[0])), np.array(lengths)

In [8]:
# Parse the glycoprotein (G) and phosphoprotein (P) FASTA files, in that order.
# Each name is bound to the full value returned by read_fasta.
G, P = (
    read_fasta(fasta_path)
    for fasta_path in ("sequences/PG/G_seqs.fasta", "sequences/PG/P_seqs.fasta")
)

Found 79 sequences in G_seqs.fasta
Found 79 sequences in P_seqs.fasta


# 2. Single nucleotide polymorphism (SNP) calling

## End goal is to convert the FASTA file into a VCF (Variant Call Format) file

<b>Amazing tutorial:</b> https://www.ebi.ac.uk/sites/ebi.ac.uk/files/content.ebi.ac.uk/materials/2014/140217_AgriOmics/dan_bolser_snp_calling.pdf

Adapted the following steps from it

<b>Index the reference genome with `bwa`</b>
```bash
bwa index sequences/genome/AY029768.1_MYS_CDS.fna
```

<b>Perform the alignment</b>
```bash
bwa aln sequences/genome/AY029768.1_MYS_CDS.fna sequences/PG/P_seqs.fasta > alignments/P_MYS_aln.sai
```

<b>Convert to SAM file format, which is human-readable</b>
```bash
bwa samse sequences/genome/AY029768.1_MYS_CDS.fna alignments/P_MYS_aln.sai sequences/PG/P_seqs.fasta > alignments/P_MYS_aln.sam
```

<b>Convert SAM to BAM and sort the BAM file</b>
```bash
   samtools view -b alignments/G_BGD_aln.sam > alignments/G_BGD_aln.bam
   samtools sort alignments/G_BGD_aln.bam -o alignments/G_BGD_aln_sorted.bam
```

<b>Index the genome file again with `samtools`</b>
```bash
samtools faidx sequences/genome/AY029768.1_MYS_CDS.fna
```

<b>Run 'mpileup' to generate VCF format</b>
```bash
bcftools mpileup -f sequences/genome/AY029768.1_MYS_CDS.fna alignments/P_MYS_aln_sorted.bam > alignments/P_MYS_aln.bcf
bcftools mpileup -f sequences/genome/AY988601.1_BGD_CDS.fna alignments/G_BGD_aln_sorted.bam > alignments/G_BGD_aln.bcf
```

<b>Call SNPs</b>
```bash
bcftools view -v snps alignments/P_MYS_aln.bcf > alignments/P_MYS_SNPs.vcf
bcftools view -v snps alignments/G_MYS_aln.bcf > alignments/G_MYS_SNPs.vcf
```

## Exact code to run:

```
bash align_make_vcf.sh "sequences/genome/AY988601.1_BGD_CDS.fna" "sequences/PG/P_seqs.fasta" "alignments" "P_BGD"
bash align_make_vcf.sh "sequences/genome/NC_00278.1_MYS_CDS.fna" "sequences/PG/P_seqs.fasta" "alignments" "P_MYS"

bash align_make_vcf.sh "sequences/genome/AY988601.1_BGD_CDS.fna" "sequences/PG/G_seqs.fasta" "alignments" "G_BGD"
bash align_make_vcf.sh "sequences/genome/NC_00278.1_MYS_CDS.fna" "sequences/PG/G_seqs.fasta" "alignments" "G_MYS"
```

# 3. Make trees using the G and P CDS's

```bash
fasttree -nt sequences/PG/G_seqs.fasta > trees/G_cds.nwk
fasttree -nt sequences/PG/P_seqs.fasta > trees/P_cds.nwk
```

Another reference: https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/01_alignment.html

# 4. Deduplicate and remake trees

In [9]:
# Read both CDS FASTA files into lists of (record id, sequence string) pairs,
# preserving the order of the records in each file.
G_seqs, P_seqs = (
    [(record.id, str(record.seq)) for record in SeqIO.parse(fasta_path, "fasta")]
    for fasta_path in ("sequences/PG/G_seqs.fasta", "sequences/PG/P_seqs.fasta")
)

## More sanity checks

In [10]:
# Record ids are the first element of every (id, sequence) pair
G_ids = np.array(tuple(zip(*G_seqs))[0])
P_ids = np.array(tuple(zip(*P_seqs))[0])

G_id_set = set(G_ids)
P_id_set = set(P_ids)

# How many ids occur in both files (76 of the 79 in each file; 3 per file do not)
print(len(G_id_set & P_id_set))

# Ids that only have a G sequence
print(G_id_set - P_id_set)

# Ids that only have a P sequence.
# HM545087 is P only. MN549407 and MN549410 are the very low quality G sequences that
# were removed; according to their entries they are partial genomes isolated from bats,
# which probably explains the low quality.
print(P_id_set - G_id_set)

76
{'AF238467', 'HM545086', 'AY858111'}
{'MN549407', 'MN549410', 'HM545087'}


In [11]:
# One row per record: the sequence string ("Seq") next to its record id ("ID")
G_seq_strings = tuple(zip(*G_seqs))[-1]
seqs_G_df = pd.DataFrame({"Seq": G_seq_strings}).assign(ID=G_ids)

P_seq_strings = tuple(zip(*P_seqs))[-1]
seqs_P_df = pd.DataFrame({"Seq": P_seq_strings}).assign(ID=P_ids)

# drop_duplicates keeps only the first occurrence of each sequence, so these are the
# numbers of unique glycoprotein and phosphoprotein sequences (46 and 50 respectively)
tuple(len(frame["Seq"].drop_duplicates()) for frame in (seqs_G_df, seqs_P_df))

(46, 50)

## The next cell writes the unique sequences to new Fasta files

In [179]:
# Write the first occurrence of each unique sequence to a deduplicated FASTA file
# (one ">ID" header line followed by one sequence line per record).
# A file that already exists is left untouched, so re-running the notebook never
# overwrites previously written files, while a fresh checkout still gets them created.
keep_G = seqs_G_df.drop_duplicates("Seq")
keep_P = seqs_P_df.drop_duplicates("Seq")

for keep_df, dedup_path in (
    (keep_G, "sequences/PG/G_deduplicated.fasta"),
    (keep_P, "sequences/PG/P_deduplicated.fasta"),
):
    if os.path.exists(dedup_path):
        continue

    with open(dedup_path, "w") as fasta_out:
        for seq_id, seq in zip(keep_df["ID"], keep_df["Seq"]):
            fasta_out.write(">" + seq_id + "\n")
            fasta_out.write(seq + "\n")

In [181]:
# Read the deduplicated FASTA files back as lists of (record id, sequence string) pairs
G_dedup, P_dedup = (
    [(record.id, str(record.seq)) for record in SeqIO.parse(fasta_path, "fasta")]
    for fasta_path in (
        "sequences/PG/G_deduplicated.fasta",
        "sequences/PG/P_deduplicated.fasta",
    )
)

```bash
fasttree -nt sequences/PG/G_deduplicated.fasta > trees/G_dedup.nwk
fasttree -nt sequences/PG/P_deduplicated.fasta > trees/P_dedup.nwk
```