In [1]:
import numpy as np
import pandas as pd

import Bio
from Bio import SeqIO
from Bio import Entrez
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import re

# PAML

Copied all `.ctl` files from the tar file for Linux/Mac <a href="http://abacus.gene.ucl.ac.uk/software/#downloads-and-installation-2" target="_blank">here</a>.

# HyPhy

## Data input format (easiest of the options):

Two separate files with the alignment and phylogeny each. 

Most standard alignment formats are accepted (FASTA, phylip, etc.), and the phylogeny should be Newick-formatted.

Launch with `hyphy -i`

Tutorial: http://hyphy.org/tutorials/CL-prompt-tutorial/

# Methods

Good explanations: https://stevenweaver.github.io/hyphy-site/methods/selection-methods

## BUSTED

BUSTED (Branch-Site Unrestricted Statistical Test for Episodic Diversification) provides a gene-wide (not site-specific) test for positive selection by asking whether a gene has experienced positive selection at at least one site on at least one branch.

<ul>
    <li>Can specify which branches to test or do all of them.</li>
    <li> It is gene-wide, not site-specific, so probably not desired for our purposes?</li>
</ul>


## FEL and SLAC

Infer dN and dS substitutions rates on a per-site basis. Assume that selection pressure for each site is constant along the entire phylogeny. This is probably not true in this case because the Nipah virus sequences come from different organisms with different immune behavior. <b>How do we account for this without a temporal analysis?</b>

SLAC uses maximum-likelihood and counting approaches. FEL is a fixed effects model.  

# Fasta files can not have stop codons in them, so generate new files for them

In [2]:
G_seq = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse("hyphy/G_no_stop_codons.fasta", "fasta")]
P_seq = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse("hyphy/P_no_stop_codons.fasta", "fasta")]
len(G_seq), len(P_seq)

(50, 50)

In [3]:
P_nonsense = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse("hyphy/P_nonsense.fasta", "fasta")]
len(P_nonsense)

1

## More metadata: clade (country), organism, date, etc. to use FEL-Contrast

In [4]:
# supplemental table from the Whitmer et al paper with metadata
new_metadata = pd.read_excel("sequences/Supplemental_Table_1_V8.xlsx", header=3).iloc[:, 1:]

In [5]:
def get_ncbi_accessions(id_list):
    
    Entrez.email='skulkarni@g.harvard.edu'

    # search Genbank, returns accession numbers
    handle=Entrez.esearch(db='nucleotide', retmax=1000, term=",".join(id_list), idtype="acc") 
    record = Entrez.read(handle)
    
    handle.close()
    fetch = Entrez.efetch(db='nucleotide', id=",".join(record['IdList']), rettype='gb', retmode='text')
    gb=fetch.read()
    
    # the first one is an empty string because it's what comes before the first locus
    found_seq = list(gb.split("LOCUS"))[1:]
    print(f"Found {len(found_seq)} out of {len(id_list)} NCBI accessions!")
    
    # remove the sequences becuase they make the strings unnecessarily long
    found_seq = [isolate.split("/gene")[0] for isolate in found_seq]
    
    return found_seq

In [6]:
# (AF212302, Malaysia, 1999, human)
# (AJ564621, Malaysia, 2004, pig)
# (AJ564621, Malaysia, 2004, pig)
# (AY858111, Cambodia, 2004, bat)

In [7]:
countries_dict = dict(zip(['India', 'Kerala', 'Malaysia', 'Perak', 'Bangladesh', 'Singapore', 'Thailand', 'Cambodia', 'Kuala Lumpur'], 
                          ['India', 'India', 'Malaysia', 'Malaysia', 'Bangladesh', 'Singapore', 'Thailand', 'Cambodia', 'Malaysia']))

def extract_metadata(fasta_file, metadata_df):
    
    acc_ids = [seq_record.id for seq_record in SeqIO.parse(fasta_file, "fasta")]
    
    ncbi_info = get_ncbi_accessions(acc_ids)
    
    metadata = metadata_df.loc[metadata_df[" Accession Number:"].isin(acc_ids)][[" Accession Number:", 'Clade', 'Collection Date', 'Location, Country', "Original organism"]]
    metadata.rename(columns={metadata.columns[0]: "ID", 'Collection Date': 'Date', 'Location, Country': 'Country', "Original organism": "Host"}, inplace=True)
    
    # merge with the accession IDs to get a full metadata datafraem
    metadata = metadata.merge(pd.DataFrame(acc_ids).rename(columns={0:"ID"}), 
                              how="outer").sort_values("ID").reset_index(drop=True)
        
    for i, row in metadata.iterrows():

        # clean so that every word is separated by a single whitespace, make lowercase, then split to get a list
        cleaned_info = re.sub('\s+',' ', ncbi_info[i]).lower().split(" ")

        if pd.isnull(row["Country"]):

            # get the country by iterating through the list of countries
            for country in list(countries_dict.keys()):
                if country in ncbi_info[i]:
                    metadata.loc[i, "Country"] = country
                    break

        if pd.isnull(row["Date"]):
            
            found_date = False

            for string in cleaned_info:
                if "/collection_date=" in string:
                    metadata.loc[i, "Date"] = string.replace("/collection_date=", "").strip('""')
                    found_date = True
                    break
                if not found_date:            
                    # get the date by looking for the journal submission date
                    submitted_index = cleaned_info.index("submitted")
                    metadata.loc[i, "Date"] = cleaned_info[submitted_index+1].strip("()")

        if pd.isnull(row["Host"]):

            if "pteropus" in ncbi_info[i].lower():
                metadata.loc[i, "Host"] = "bat"
            elif "pig" in ncbi_info[i].lower() or "swine" in ncbi_info[i].lower():
                metadata.loc[i, "Host"] = "swine"
            elif "clinical" in ncbi_info[i].lower() or "homo sapiens" in ncbi_info[i].lower():
                metadata.loc[i, "Host"] = "human"

    # this whole genome is weird. Reference: https://www.sciencedirect.com/science/article/pii/S0042682201910268
    # 1999, so Malaysia, and text says it was isolated from human brain tissue.
    metadata.loc[metadata.ID == "AF212302", "Country"] = "Malaysia"
    metadata.loc[metadata.ID == "AF212302", "Host"] = "human"
    
    # for completeness and standardization, convert them all to strings
    metadata[['Date', 'Country', "Host"]] = metadata[['Date', 'Country', "Host"]].astype(str)
    
    # then convert the date column to a datetime object
    metadata["Date"] = pd.to_datetime(metadata["Date"])
    
    # check that there are no more NaNs in the columns we just imputed
    assert sum(pd.isnull(metadata['Date'])) == 0
    assert sum(pd.isnull(metadata['Country'])) == 0
    assert sum(pd.isnull(metadata['Host'])) == 0
    
    return metadata

In [8]:
G_metadata = extract_metadata("sequences/PG/G_seqs.fasta", new_metadata)
P_metadata = extract_metadata("sequences/PG/P_seqs.fasta", new_metadata)

Found 84 out of 84 NCBI accessions!
Found 81 out of 81 NCBI accessions!


# Look at sites under positive selection in the phosphoprotein: 285, 380, 421

In [9]:
def get_sites(lst_sites, fasta_file):
    '''
    Site list is 1-indexed, so decrement by 1 in Python searching. Sites are amino acid positions!
    '''
    seqs = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse(fasta_file, "fasta")]
    
    # translate to amino acid sequence
    aa = [Seq.translate(seq[1]) for seq in seqs]
    
    # initialize dataframe to store the results with the sequence IDs
    df_res = pd.DataFrame({"ID": list(zip(*seqs))[0]})
    
    for site in lst_sites:
        df_res[str(site)] = [seq[site-1] for seq in aa]
        
    return df_res

In [10]:
df_P = get_sites([285, 380, 421], "sequences/PG/P_seqs.fasta")
len(df_P)

81

In [11]:
P_metadata.head()

Unnamed: 0,ID,Clade,Date,Country,Host
0,AF212302,,1999-12-08,Malaysia,human
1,AF376747,,2001-05-04,Malaysia,bat
2,AJ564621,,2003-05-30,Kuala Lumpur,swine
3,AJ564622,,2003-05-30,Kuala Lumpur,swine
4,AJ564623,,2003-05-30,Kuala Lumpur,swine


In [13]:
P_summary = df_P.merge(P_metadata, on="ID", how="inner")
P_summary.shape

(81, 8)

In [15]:
P_summary

Unnamed: 0,ID,285,380,421,Clade,Date,Country,Host
0,AF212302,R,V,P,,1999-12-08,Malaysia,human
1,AF376747,R,V,P,,2001-05-04,Malaysia,bat
2,AJ564621,R,V,P,,2003-05-30,Kuala Lumpur,swine
3,AJ564622,R,V,P,,2003-05-30,Kuala Lumpur,swine
4,AJ564623,R,V,P,,2003-05-30,Kuala Lumpur,swine
...,...,...,...,...,...,...,...,...
76,MN549409,H,T,P,,2019-06-10,India,bat
77,MN549410,H,T,P,,2019-06-10,India,bat
78,MN549411,H,T,P,,2019-06-10,India,bat
79,MW535746,R,T,P,,2017-05-21,Thailand,bat


In [21]:
print(P_summary["285"].unique())
print(P_summary["380"].unique())
print(P_summary["421"].unique())

['R' 'H']
['V' 'T' 'A']
['P' 'L']


# See if the alleles associate with clades, countries, organisms, etc.