In [1]:
import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio import Entrez
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import re

# PAML

Copied all `.ctl` control files from the Linux/Mac tar archive, which can be downloaded <a href="http://abacus.gene.ucl.ac.uk/software/#downloads-and-installation-2" target="_blank">here</a>.

# HyPhy

## Data input format (easiest of the options):

Two separate files: one containing the alignment and one containing the phylogeny.

Most standard alignment formats are accepted (FASTA, phylip, etc.), and the phylogeny should be Newick-formatted.

Launch with `hyphy -i`

Tutorial: http://hyphy.org/tutorials/CL-prompt-tutorial/

# Methods

Good explanations: https://stevenweaver.github.io/hyphy-site/methods/selection-methods

## BUSTED

BUSTED (Branch-Site Unrestricted Statistical Test for Episodic Diversification) provides a gene-wide (not site-specific) test for positive selection by asking whether a gene has experienced positive selection at at least one site on at least one branch.

<ul>
    <li>Can specify which branches to test or do all of them.</li>
    <li> It is gene-wide, not site-specific, so probably not desired for our purposes?</li>
</ul>


## FEL and SLAC

Both methods infer dN and dS substitution rates on a per-site basis. They assume that the selection pressure for each site is constant along the entire phylogeny. This is probably not true in this case because the Nipah virus sequences come from different organisms with different immune behavior. <b>How do we account for this without a temporal analysis?</b>

SLAC uses maximum-likelihood and counting approaches. FEL is a fixed effects model.  

# FASTA files cannot contain stop codons, so generate new files without them

In [None]:
# Read the G and P FASTA files into lists of (record id, sequence) tuples.
G_seq = list(
    map(lambda rec: (rec.id, rec.seq), SeqIO.parse("hyphy/G_no_stop_codons.fasta", "fasta"))
)
P_seq = list(
    map(lambda rec: (rec.id, rec.seq), SeqIO.parse("hyphy/P_no_stop_codons.fasta", "fasta"))
)

In [None]:
# Keep only the first (record id, sequence) tuple parsed from the file.
P_nonsense = list(
    map(lambda rec: (rec.id, rec.seq), SeqIO.parse("P_nonsense.fasta", "fasta"))
)[0]

## More metadata: clade (country), organism, date, etc. to use FEL-Contrast

In [2]:
# Metadata from the supplemental table of the Whitmer et al. paper.
# The column names are on spreadsheet row 4 (header=3); the first column is dropped.
new_metadata = (
    pd.read_excel("sequences/Supplemental_Table_1_V8.xlsx", header=3)
    .iloc[:, 1:]
)

In [3]:
def get_ncbi_accessions(id_list):
    """Fetch the GenBank header text for a list of nucleotide identifiers.

    Searches the NCBI ``nucleotide`` database for the comma-joined identifiers,
    downloads the matching records as GenBank flat-file text and returns, for
    each record, the text between its ``LOCUS`` keyword and its ``FEATURES``
    table.

    Parameters
    ----------
    id_list : list of str
        Identifiers to search for.

    Returns
    -------
    list of str
        One string per record found, in the order NCBI returned them (which is
        not guaranteed to match ``id_list``). The ``LOCUS`` keyword itself is
        stripped, so each string starts with the whitespace that precedes the
        locus name. An empty list is returned when ``id_list`` is empty or the
        search finds nothing.
    """
    # Nothing to look up: do not send an empty query to NCBI.
    if not id_list:
        return []

    Entrez.email='skulkarni@g.harvard.edu'

    # search Genbank, returns accession numbers
    handle = Entrez.esearch(db='nucleotide', retmax=1000, term=",".join(id_list), idtype="acc")
    try:
        record = Entrez.read(handle)
    finally:
        # close the handle even if parsing the search result fails
        handle.close()

    if not record['IdList']:
        print(f"Found 0 out of {len(id_list)} NCBI accessions!")
        return []

    fetch = Entrez.efetch(db='nucleotide', id=",".join(record['IdList']), rettype='gb', retmode='text')
    try:
        gb = fetch.read()
    finally:
        fetch.close()

    # Split only on LOCUS at the start of a line, so the word appearing inside a
    # record's free text cannot cut a record in two.
    # The first piece is an empty string because it is what comes before the first locus.
    found_seq = re.split(r"^LOCUS", gb, flags=re.MULTILINE)[1:]
    print(f"Found {len(found_seq)} out of {len(id_list)} NCBI accessions!")

    # remove the feature table and sequence because they make the strings unnecessarily long
    found_seq = [
        re.split(r"^FEATURES", isolate, maxsplit=1, flags=re.MULTILINE)[0]
        for isolate in found_seq
    ]

    return found_seq

In [5]:
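# Manually noted isolates, apparently as (accession, country, year, host) — TODO confirm
# (AF212302, Bangladesh, 2001, human)
# (AJ564621, Malaysia, 2004, pig)
# (AJ564621, Malaysia, 2004, pig)  # TODO: duplicates the previous entry — confirm whether another accession was intended
# (AY858111, Cambodia, 2004, bat)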

In [6]:
# Maps a place name that may appear in a GenBank record to the country label
# used in the metadata table. Order matters: the first key found in a record wins.
countries_dict = {
    'India': 'India',
    'Kerala': 'India',
    'Malaysia': 'Malaysia',
    'Perak': 'Malaysia',
    'Bangladesh': 'Bangladesh',
    'Singapore': 'Singapore',
    'Thailand': 'Thailand',
    'Cambodia': 'Cambodia',
    'Kuala Lumpur': 'Malaysia',
}

def get_countries(metadata_df, isolate_lst, ref_lst, acc_col=" Accession Number:"):
    """Infer the country of isolates whose "Location, Country" value is missing.

    For every row of ``metadata_df`` with a null "Location, Country", the
    GenBank record text of that row's accession is scanned for the place names
    in ``countries_dict``; the first name found decides the country.

    Records are matched to rows by the locus name at the start of each record
    rather than by list position, so ``isolate_lst`` may be in any order and may
    be shorter than the table.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain a "Location, Country" column. When it also contains
        ``acc_col``, that column provides each row's accession.
    isolate_lst : list of str
        GenBank record texts, each beginning with its locus name (optionally
        preceded by the ``LOCUS`` keyword).
    ref_lst : list of str
        Accession per row, indexed by the row's index label; only used when
        ``metadata_df`` has no ``acc_col`` column.
    acc_col : str, optional
        Name of the accession column in ``metadata_df``.

    Returns
    -------
    dict
        ``{accession: country}`` for the rows that could be resolved. The
        accession of every row that could not be resolved is printed.
    """
    # Index the records by their own locus name.
    records_by_locus = {}
    for record in isolate_lst:
        tokens = record.split()
        if tokens and tokens[0] == "LOCUS":
            tokens = tokens[1:]
        if tokens:
            records_by_locus.setdefault(tokens[0], record)

    has_acc_col = acc_col in metadata_df.columns
    found = {}

    for i, row in metadata_df.iterrows():

        if not pd.isnull(row["Location, Country"]):
            continue

        accession = row[acc_col] if has_acc_col else ref_lst[i]
        record = records_by_locus.get(accession)

        country = None
        if record is not None:
            # first place name (in countries_dict order) mentioned anywhere in the record
            country = next(
                (countries_dict[name] for name in countries_dict if name in record),
                None,
            )

        if country is None:
            # report isolates that need a manual lookup
            print(accession)
        else:
            found[accession] = country

    return found

In [33]:
def get_dates(metadata_df, isolate_lst, ref_lst, acc_col=" Accession Number:"):
    """Fill in missing dates from the submission date of the GenBank record.

    For every row of ``metadata_df`` with a null "Collection Date", the GenBank
    record text of that row's accession is searched for the token "Submitted";
    the token that follows it (e.g. ``(08-DEC-1999)``, parentheses included) is
    used as the date. Note that this is the submission date, not a true
    collection date.

    Records are matched to rows by the locus name at the start of each record
    rather than by list position, so ``isolate_lst`` may be in any order and may
    be shorter than the table.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain a "Collection Date" column. When it also contains
        ``acc_col``, that column provides each row's accession.
    isolate_lst : list of str
        GenBank record texts, each beginning with its locus name (optionally
        preceded by the ``LOCUS`` keyword).
    ref_lst : list of str
        Accession per row, indexed by the row's index label; only used when
        ``metadata_df`` has no ``acc_col`` column.
    acc_col : str, optional
        Name of the accession column in ``metadata_df``.

    Returns
    -------
    dict
        ``{accession: date token}`` for the rows that could be resolved. Rows
        without a matching record or without a "Submitted" token are reported
        with a printed message and left out.
    """
    # Index the whitespace-split records by their own locus name.
    tokens_by_locus = {}
    for record in isolate_lst:
        tokens = record.split()
        if tokens and tokens[0] == "LOCUS":
            tokens = tokens[1:]
        if tokens:
            tokens_by_locus.setdefault(tokens[0], tokens)

    has_acc_col = acc_col in metadata_df.columns
    dates = {}

    for i, row in metadata_df.iterrows():

        if not pd.isnull(row["Collection Date"]):
            continue

        accession = row[acc_col] if has_acc_col else ref_lst[i]
        tokens = tokens_by_locus.get(accession, [])

        # "Submitted" must be followed by at least one more token (the date)
        if "Submitted" in tokens[:-1]:
            dates[accession] = tokens[tokens.index("Submitted") + 1]
        else:
            print(f"No submission date found for {accession}")

    return dates

In [36]:
def extract_metadata(fasta_file, metadata_df):
    """Build a metadata table for the sequences in a FASTA file.

    The record ids of ``fasta_file`` are looked up in ``metadata_df`` (accession,
    clade, collection date, country). Ids absent from ``metadata_df`` are added
    as rows with missing values, and missing countries and dates are then filled
    from the GenBank records fetched from NCBI.

    Parameters
    ----------
    fasta_file : str
        Path to a FASTA file whose record ids are accession numbers.
    metadata_df : pandas.DataFrame
        Table with the columns " Accession Number:", "Clade",
        "Collection Date" and "Location, Country".

    Returns
    -------
    pandas.DataFrame
        One row per accession, sorted by accession.

    Raises
    ------
    ValueError
        If a row's accession has no GenBank record among those fetched.
    """
    acc_col = " Accession Number:"

    acc_ids = [seq_record.id for seq_record in SeqIO.parse(fasta_file, "fasta")]

    ncbi_info = get_ncbi_accessions(acc_ids)

    metadata = metadata_df.loc[
        metadata_df[acc_col].isin(acc_ids),
        [acc_col, 'Clade', 'Collection Date', 'Location, Country'],
    ]

    # merge with the accession IDs to get a full metadata dataframe
    metadata = (
        metadata
        .merge(pd.DataFrame({acc_col: acc_ids}), how="outer")
        .sort_values(acc_col)
        .reset_index(drop=True)
    )

    # The helpers below pair table rows with records by position, but the table
    # is sorted by accession while the records come back in NCBI's order. Line
    # the records up with the rows explicitly, using each record's locus name.
    records_by_locus = {}
    for record in ncbi_info:
        tokens = record.split()
        if tokens and tokens[0] == "LOCUS":
            tokens = tokens[1:]
        if tokens:
            records_by_locus.setdefault(tokens[0], record)

    row_accessions = metadata[acc_col].tolist()
    missing = [acc for acc in row_accessions if acc not in records_by_locus]
    if missing:
        raise ValueError(f"No GenBank record found for accession(s): {missing}")
    aligned_records = [records_by_locus[acc] for acc in row_accessions]

    countries_full = get_countries(metadata, aligned_records, row_accessions)
    # Manual override for an isolate whose country is not detected automatically.
    # NOTE(review): verify this value against the GenBank record for AF212302.
    countries_full["AF212302"] = "Bangladesh"
    metadata["Location, Country"] = metadata["Location, Country"].fillna(metadata[acc_col].map(countries_full))

    dates_full = get_dates(metadata, aligned_records, row_accessions)
    metadata["Collection Date"] = metadata["Collection Date"].fillna(metadata[acc_col].map(dates_full))

    return metadata

In [39]:
# Assemble the metadata tables for the G and P FASTA files.
G_metadata = extract_metadata(
    fasta_file="sequences/PG/G_deduplicated.fasta",
    metadata_df=new_metadata,
)
P_metadata = extract_metadata(
    fasta_file="sequences/PG/P_deduplicated.fasta",
    metadata_df=new_metadata,
)

Found 50 out of 50 NCBI accessions!
AF212302
Found 51 out of 51 NCBI accessions!
AF212302


In [40]:
# Display the metadata table assembled for the G FASTA file.
G_metadata

Unnamed: 0,Accession Number:,Clade,Collection Date,"Location, Country"
0,AF212302,,(08-DEC-1999),Bangladesh
1,AF376747,,(04-MAY-2001),Malaysia
2,AJ564621,,(30-MAY-2003),Malaysia
3,AJ627196,,(10-FEB-2004),Malaysia
4,AY858111,,(15-DEC-2004),Cambodia
5,AY988601,,(29-MAR-2005),Bangladesh
6,FJ513078,,(01-DEC-2008),India
7,FN869553,,(22-MAY-2010),Malaysia
8,JF899340,,(28-APR-2011),India
9,JN808857,,(03-OCT-2011),Bangladesh


In [41]:
# Display the metadata table assembled for the P FASTA file.
P_metadata

Unnamed: 0,Accession Number:,Clade,Collection Date,"Location, Country"
0,AF212302,,(08-DEC-1999),Bangladesh
1,AF376747,,(04-MAY-2001),Malaysia
2,AJ627196,,(10-FEB-2004),Malaysia
3,AY988601,,(29-MAR-2005),Bangladesh
4,FJ513078,,(01-DEC-2008),India
5,FN869553,,(22-MAY-2010),Malaysia
6,JN808857,,(03-OCT-2011),Bangladesh
7,JN808864,,(03-OCT-2011),Bangladesh
8,MH396625,,(25-MAY-2018),India
9,MH523642,,(23-JUN-2018),India
