In [1]:
import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio import Entrez
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import re

In [2]:
# Sanity check: report the length of every record in the Malaysian-strain genome FASTA.
# (Alternative accession file tried earlier: "sequences/genome/AY029768.1_MYS.fasta")
mys_genome_records = SeqIO.parse("sequences/genome/NC_00278.1_MYS.fasta", "fasta")
for seq_record in mys_genome_records:
    print(f"Malaysian strain genome size: {len(seq_record)}")

Malaysian strain genome size: 18246


In [3]:
# Sanity check: report the length of every record in the Bangladeshi-strain genome FASTA.
for seq_record in SeqIO.parse("sequences/genome/AY988601.1_BGD.fasta", "fasta"):
    # spelling of the label fixed ("Bangaldeshi" -> "Bangladeshi")
    print(f"Bangladeshi strain genome size: {len(seq_record)}")

Bangaldeshi strain genome size: 18252


# 1. BLAST the glycoprotein and phosphoprotein sequences to get all sequences from NCBI

Used reference sequences `sequences/PG/ref_G_seq` and `sequences/PG/ref_P_seq` for the glycoprotein and phosphoprotein, respectively. Queried these on BLAST using <i>Henipaviruses</i> as the search organism. 

83 full search results for G (could be individual protein, multiple, or full genome) stored at `sequences/PG/glyco_blast_83.fasta`, and 80 full search results for P stored at `sequences/PG/phospho_blast_80.fasta`. Removed duplicates in Geneious.

Descriptions stored at `sequences/glyco_blast_descriptions.csv` and `sequences/phospho_blast_descriptions.csv`.

```bash
bowtie2-build sequences/genome/AY029768.1_MYS_CDS.fna MYS_cds
bowtie2 -x MYS_cds -f sequences/PG/G_seqs.fasta -S G_aln.sam
```

The sequence headers were updated, and the V, W, and C proteins were removed because they are duplicates of the phosphoprotein sequence. <b>This was found to cause problems when aligning the glycoprotein sequences</b>. It makes sense that you shouldn't have redundant sequences in a reference sequence, but I'm not sure why it didn't cause problems for the phosphoprotein alignment. 

In [203]:
def get_ncbi_accessions(id_list):
    """Fetch the GenBank header text for a collection of nucleotide accessions.

    The accessions are joined with commas and sent to Entrez ``esearch``
    (nucleotide database); every hit is then downloaded with ``efetch`` as
    GenBank flat-file text. The text is split into one string per record and
    each record is truncated just before its ``FEATURES`` section, so only the
    header part (LOCUS line, DEFINITION, ...) is kept.

    Parameters
    ----------
    id_list : iterable of str
        Accession identifiers to look up.

    Returns
    -------
    list of str
        One string per record found: everything after the record's leading
        ``LOCUS`` keyword up to (not including) its ``FEATURES`` line.
        Empty when ``id_list`` is empty or the search returns no hits.

    Notes
    -----
    A "Found X out of Y" summary is printed as a side effect.
    NOTE(review): NCBI asks clients to identify themselves; set ``Entrez.email``
    once (e.g. in a configuration cell) before calling this function.
    """
    # materialise so generators / arrays can be measured and joined safely
    id_list = list(id_list)
    if not id_list:
        print("Found 0 out of 0 NCBI accessions!")
        return []

    # search GenBank, returns accession numbers
    handle = Entrez.esearch(db='nucleotide', retmax=1000, term=",".join(id_list), idtype="acc")
    try:
        record = Entrez.read(handle)
    finally:
        # close even if parsing fails so the connection is not leaked
        handle.close()

    if not record['IdList']:
        # nothing to fetch; an efetch call with an empty id string would fail
        print(f"Found 0 out of {len(id_list)} NCBI accessions!")
        return []

    fetch = Entrez.efetch(db='nucleotide', id=",".join(record['IdList']), rettype='gb', retmode='text')
    try:
        gb = fetch.read()
    finally:
        fetch.close()

    # Split only on a LOCUS keyword that starts a line, so the word appearing
    # inside a title or definition does not create a bogus record. The first
    # piece is the (empty) text before the first record and is discarded.
    found_seq = re.split(r"^LOCUS", gb, flags=re.MULTILINE)[1:]
    print(f"Found {len(found_seq)} out of {len(id_list)} NCBI accessions!")

    # drop the feature table and sequence: they make the strings unnecessarily long
    found_seq = [
        re.split(r"^FEATURES", isolate, maxsplit=1, flags=re.MULTILINE)[0]
        for isolate in found_seq
    ]

    return found_seq

In [238]:
# Fetch GenBank header text for the accessions in G_tree_names.
# NOTE(review): G_tree_names is not defined in any cell shown here — confirm it is
# created earlier so this cell survives Restart & Run All.
glyco_found = get_ncbi_accessions(G_tree_names)

Found 84 out of 84 NCBI accessions!


In [239]:
# Fetch GenBank header text for the accessions in P_tree_names.
# NOTE(review): P_tree_names is not defined in any cell shown here — confirm it is
# created earlier so this cell survives Restart & Run All.
phospho_found = get_ncbi_accessions(P_tree_names)

Found 81 out of 81 NCBI accessions!


In [244]:
# Positions of fetched GenBank headers that do not contain the token "VRL".
# NOTE(review): presumably "VRL" is used as a proxy for a LOCUS line that carries
# a date — confirm. The name is reused, so it ends up holding the phospho result.
no_date_indices = [idx for idx, header in enumerate(glyco_found) if "VRL" not in header]
print(no_date_indices)

no_date_indices = [idx for idx, header in enumerate(phospho_found) if "VRL" not in header]
print(no_date_indices)

[]
[]


# 2. Single nucleotide polymorphism (SNP) calling

## End goal is to convert the FASTA file into a VCF (Variant Call Format) file

<b>Amazing tutorial:</b> https://www.ebi.ac.uk/sites/ebi.ac.uk/files/content.ebi.ac.uk/materials/2014/140217_AgriOmics/dan_bolser_snp_calling.pdf

## Exact code to run for both P and G aligning to both reference genomes:

```bash
bash align_make_vcf.sh "sequences/genome/AY988601.1_BGD_CDS.fna" "sequences/PG/P_seqs.fasta" "alignments" "P_BGD"
bash align_make_vcf.sh "sequences/genome/NC_00278.1_MYS_CDS.fna" "sequences/PG/P_seqs.fasta" "alignments" "P_MYS"

bash align_make_vcf.sh "sequences/genome/AY988601.1_BGD_CDS.fna" "sequences/PG/G_seqs.fasta" "alignments" "G_BGD"
bash align_make_vcf.sh "sequences/genome/NC_00278.1_MYS_CDS.fna" "sequences/PG/G_seqs.fasta" "alignments" "G_MYS"
```

The above bash script runs the following steps, adapted from the linked tutorial above:

<b>Index the reference genome with `bwa`</b>
```bash
bwa index sequences/genome/AY029768.1_MYS_CDS.fna
```

<b>Perform the alignment</b>
```bash
bwa aln sequences/genome/AY029768.1_MYS_CDS.fna sequences/PG/P_seqs.fasta > alignments/P_MYS_aln.sai
```

<b>Convert to SAM file format, which is human-readable</b>
```bash
bwa samse sequences/genome/AY029768.1_MYS_CDS.fna alignments/P_MYS_aln.sai sequences/PG/P_seqs.fasta > alignments/P_MYS_aln.sam
```

<b>Convert SAM to BAM and sort the BAM file</b>
```bash
samtools view -b alignments/P_MYS_aln.sam > alignments/P_MYS_aln.bam
samtools sort alignments/P_MYS_aln.bam -o alignments/P_MYS_aln_sorted.bam
```

<b>Index the genome file again with `samtools`</b>
```bash
samtools faidx sequences/genome/AY029768.1_MYS_CDS.fna
```

<b>Run 'mpileup' to generate VCF format</b>
```bash
bcftools mpileup -f sequences/genome/AY029768.1_MYS_CDS.fna alignments/P_MYS_aln_sorted.bam > alignments/P_MYS_aln.bcf
```

<b>Call SNPs</b>
```bash
bcftools view -v snps alignments/P_MYS_aln.bcf > alignments/P_MYS_SNPs.vcf
```

# 3. Make trees using the G and P CDS's

```bash
fasttree -nt sequences/PG/G_seqs.fasta > trees/G_cds.nwk
fasttree -nt sequences/PG/P_seqs.fasta > trees/P_cds.nwk
```

Another reference: https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionVI/lessons/01_alignment.html

# 4. Deduplicate and remake trees

## Then remove sequences with ambiguous nucleotides (N): a 1% N threshold was explored, but the final filter drops every sequence containing an N

In [17]:
# G_seqs = [(seq_record.id, str(seq_record.seq)) for seq_record in SeqIO.parse("sequences/PG/G_seqs.fasta", "fasta")]
# P_seqs = [(seq_record.id, str(seq_record.seq)) for seq_record in SeqIO.parse("sequences/PG/P_seqs.fasta", "fasta")]

def count_ambig_nuc(seq):
    """Return how many elements of ``seq`` are exactly the uppercase character "N".

    ``seq`` may be any iterable of single characters (for example a ``str``).
    Lowercase "n" and other ambiguity codes are not counted.
    """
    return sum(1 for nucleotide in seq if nucleotide == "N")

# ambig_G = [count_ambig_nuc(seq[1]) for seq in G_seqs]
# ambig_P = [count_ambig_nuc(seq[1]) for seq in P_seqs]

# print(np.unique(ambig_G) / len(G_seqs[0][1]))
# print(np.unique(ambig_P) / len(P_seqs[0][1]))

[0.         0.00221117 0.00718629 0.00773908 0.01050304 0.01547816
 0.04256495 0.08181316 0.11111111 0.11719182 0.18794914 0.22443339
 0.32172471 0.47263682]
[0.         0.00046948 0.0056338  0.00892019 0.01126761 0.08309859]


## More sanity checks

In [3]:
def deduplicate_seq(og_fasta, max_ambig_frac=0.0):
    """Load a FASTA file, drop duplicate sequences, then drop N-rich sequences.

    Parameters
    ----------
    og_fasta : str
        Path to the FASTA file; records must already be sorted by ID.
    max_ambig_frac : float, default 0.0
        Largest tolerated fraction of "N" characters in a sequence. The
        default keeps only sequences with no "N" at all (the original
        behaviour); ``0.01`` would tolerate up to 1% N.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` and ``Seq``, one row per retained sequence. Among
        identical sequences only the first record (in file order) is kept.
        The original row index is preserved.

    Raises
    ------
    AssertionError
        If the record IDs are not in sorted order.
    ValueError
        If ``max_ambig_frac`` is outside the range [0, 1].
    """
    if not 0 <= max_ambig_frac <= 1:
        raise ValueError(f"max_ambig_frac must be between 0 and 1, got {max_ambig_frac}")

    seqs = [(seq_record.id, str(seq_record.seq)) for seq_record in SeqIO.parse(og_fasta, "fasta")]

    # Name the columns at construction so an empty FASTA still yields ID/Seq
    # columns instead of failing on attribute access below.
    seqs_df = pd.DataFrame(seqs, columns=["ID", "Seq"])
    if seqs_df.empty:
        return seqs_df

    # check that they are sorted alphabetically by ID; drop_duplicates below
    # keeps the first occurrence, so file order decides which ID survives
    ids = seqs_df["ID"].to_numpy()
    assert (ids == np.sort(ids)).all(), f"IDs in {og_fasta} are not sorted"

    # keeps only the first occurrence of each distinct sequence
    seqs_df = seqs_df.drop_duplicates(subset="Seq", keep="first")

    # Compare counts against (fraction * length) rather than dividing, so a
    # zero-length sequence cannot cause a division by zero.
    n_count = seqs_df["Seq"].str.count("N")
    seq_len = seqs_df["Seq"].str.len()
    return seqs_df.loc[n_count <= max_ambig_frac * seq_len]

In [10]:
# Filter the G and P sequences with deduplicate_seq and write the survivors to
# FASTA files for downstream analysis. The recorded output of this cell shows
# 37 G and 45 P sequences; using the 1% N thresholding instead leaves ~40 and
# 47 sequences, respectively.

def _write_fasta(records_df, out_path):
    """Write the ID/Seq rows of ``records_df`` to ``out_path`` as two-line FASTA entries."""
    # create the output folder on a fresh checkout instead of failing in open()
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    with open(out_path, "w") as file:
        for seq_id, seq in zip(records_df["ID"], records_df["Seq"]):
            file.write(">" + seq_id + "\n")
            file.write(seq + "\n")


keep_G = deduplicate_seq("sequences/PG/G_seqs.fasta")
_write_fasta(keep_G, "seq_for_analysis/G_dedup.fasta")

keep_P = deduplicate_seq("sequences/PG/P_seqs.fasta")
_write_fasta(keep_P, "seq_for_analysis/P_dedup.fasta")

keep_G.shape, keep_P.shape

((37, 2), (45, 2))

In [49]:
# Pull the phosphoprotein sequences of two isolates so they can be compared base by base.
# NOTE(review): the unpacking assumes the query matches exactly two rows; a and b
# follow keep_P's row order, presumably a = AF212302 and b = MK673559 — confirm.
a, b = keep_P.query("ID == 'MK673559' | ID == 'AF212302'").Seq.values

In [52]:
# Zero-based positions at which the two sequences differ, compared character by character.
# NOTE(review): assumes a and b have the same length — confirm.
np.where(np.array(list(a)) != np.array(list(b)))

(array([548]),)

In [53]:
# Base in sequence a at index 548, the single differing position reported by the np.where cell.
a[548]

'T'

In [54]:
# Base in sequence b at index 548, the single differing position reported by the np.where cell.
b[548]

'G'

In [2]:
# convert PhyML bootstrap numbers to proportion (proportion of bootstrap replicates that support a split)
# This cell only loads the tree: the file must hold one Newick string on a single line.
tree_path = "trees/G_no_stop_codons_ML.nwk"

with open(tree_path, "r") as file:
    lines = [line.strip("\n") for line in file.readlines()]

# ignore blank lines (e.g. an empty line at the end of the file)
lines = [line for line in lines if line.strip()]

if len(lines) == 1:
    # from here on `lines` is the Newick string itself, not a list
    lines = lines[0]
elif not lines:
    raise ValueError(f"No tree found in {tree_path}!")
else:
    # the original message referenced an undefined name and raised NameError instead
    raise ValueError(f"More than one line in {tree_path}!")

# NOTE(review): nothing in this cell fills these lists; they are kept only in
# case cells outside this view still reference them.
int_bootstrap = []
prop_bootstrap = []

In [54]:
# Rescale integer bootstrap supports in the Newick string to proportions (value / 100).
# The string is cut at every ":"; inside a chunk, any piece that follows a ")" and is
# purely numeric is treated as a support value.
oldlines_list = lines.split(":")
newlines = []

for i, line in enumerate(oldlines_list):

    # every chunk except the first gets back the ":" that split() consumed
    prefix = "" if i == 0 else ":"

    if ")" not in line:
        newlines.append(prefix + line)
        continue

    # isnumeric() excludes decimals and negative numbers.
    # The ")" removed by split() is restored only in front of numeric pieces,
    # so a ")" followed by anything else (such as the final ");") is dropped.
    pieces = [
        ")" + str(int(piece) / 100) if piece.isnumeric() else piece
        for piece in line.split(")")
    ]
    newlines.append(prefix + "".join(pieces))

assert len(newlines) == len(oldlines_list)
"".join(newlines)[:-1]

'((((MK673565:0.00167553,((MK673583:0.00224310,((MK673570:0.00000001,MK673588:0.00055682)1.0:0.00167644,(MK673585:0.00111447,MK673592:0.00055662)0.81:0.00000001)0.95:0.00111663)0.99:0.00111829,(JN808857:0.00111633,(MK673576:0.00167603,(FJ513078:0.00223823,MK673578:0.00055758)0.09:0.00000001)0.79:0.00111608)0.76:0.00000001)0.79:0.00055790)0.67:0.00000001,((MN549409:0.00394243,MH396625:0.00110570)1.0:0.01055251,(AY858111:0.00907697,(FN869553:0.00277676,(AJ627196:0.00112211,(MK673562:0.00000001,(MK673559:0.00055917,(AF212302:0.00000001,(MK673560:0.00055903,(AJ564621:0.00000001,AF376747:0.00055939)0.99:0.00055912)0.99:0.00000001)0.98:0.00000001)0.98:0.00055876)0.94:0.00055841)0.93:0.00344790)0.95:0.00667832)1.0:0.06780222)0.69:0.00522145)0.04:0.00000001,(MK673582:0.00223733,MK673591:0.00279811)0.97:0.00000001)0.06:0.00000001,MW535746:0.00450058,((MK673571:0.00167909,(MK673568:0.00224397,(AY988601:0.00000001,((MK673567:0.00055948,MK673566:0.00055948)0.96:0.00112023,(MK673584:0.00224233,(JN8

In [50]:
# Show the Newick string as loaded (integer bootstrap supports) for comparison with the rescaled version.
lines

'((((MK673565:0.00167553,((MK673583:0.00224310,((MK673570:0.00000001,MK673588:0.00055682)100:0.00167644,(MK673585:0.00111447,MK673592:0.00055662)81:0.00000001)95:0.00111663)99:0.00111829,(JN808857:0.00111633,(MK673576:0.00167603,(FJ513078:0.00223823,MK673578:0.00055758)9:0.00000001)79:0.00111608)76:0.00000001)79:0.00055790)67:0.00000001,((MN549409:0.00394243,MH396625:0.00110570)100:0.01055251,(AY858111:0.00907697,(FN869553:0.00277676,(AJ627196:0.00112211,(MK673562:0.00000001,(MK673559:0.00055917,(AF212302:0.00000001,(MK673560:0.00055903,(AJ564621:0.00000001,AF376747:0.00055939)99:0.00055912)99:0.00000001)98:0.00000001)98:0.00055876)94:0.00055841)93:0.00344790)95:0.00667832)100:0.06780222)69:0.00522145)4:0.00000001,(MK673582:0.00223733,MK673591:0.00279811)97:0.00000001)6:0.00000001,MW535746:0.00450058,((MK673571:0.00167909,(MK673568:0.00224397,(AY988601:0.00000001,((MK673567:0.00055948,MK673566:0.00055948)96:0.00112023,(MK673584:0.00224233,(JN808864:0.00055914,(MK673589:0.00055864,(MK57

In [55]:
# The conversion loop drops the ")" in front of any non-numeric piece, so the joined string
# ends in ";" with the final closing parenthesis missing: strip the ";" and append ");".
"".join(newlines)[:-1]+ ");"

'((((MK673565:0.00167553,((MK673583:0.00224310,((MK673570:0.00000001,MK673588:0.00055682)1.0:0.00167644,(MK673585:0.00111447,MK673592:0.00055662)0.81:0.00000001)0.95:0.00111663)0.99:0.00111829,(JN808857:0.00111633,(MK673576:0.00167603,(FJ513078:0.00223823,MK673578:0.00055758)0.09:0.00000001)0.79:0.00111608)0.76:0.00000001)0.79:0.00055790)0.67:0.00000001,((MN549409:0.00394243,MH396625:0.00110570)1.0:0.01055251,(AY858111:0.00907697,(FN869553:0.00277676,(AJ627196:0.00112211,(MK673562:0.00000001,(MK673559:0.00055917,(AF212302:0.00000001,(MK673560:0.00055903,(AJ564621:0.00000001,AF376747:0.00055939)0.99:0.00055912)0.99:0.00000001)0.98:0.00000001)0.98:0.00055876)0.94:0.00055841)0.93:0.00344790)0.95:0.00667832)1.0:0.06780222)0.69:0.00522145)0.04:0.00000001,(MK673582:0.00223733,MK673591:0.00279811)0.97:0.00000001)0.06:0.00000001,MW535746:0.00450058,((MK673571:0.00167909,(MK673568:0.00224397,(AY988601:0.00000001,((MK673567:0.00055948,MK673566:0.00055948)0.96:0.00112023,(MK673584:0.00224233,(JN8

In [48]:
# Leftover inspection of the conversion loop's variable: after the loop it holds the last ":"-separated chunk.
line

'0.00054189);'