In [1]:
import numpy as np
import pandas as pd
from Bio import AlignIO, SeqIO

# PAML

Copied all `.ctl` files from the tar file for Linux/Mac <a href="http://abacus.gene.ucl.ac.uk/software/#downloads-and-installation-2" target="_blank">here</a>.

# HyPhy

## Data input format (easiest of the options):

Two separate files with the alignment and phylogeny each. 

Most standard alignment formats are accepted (FASTA, phylip, etc.), and the phylogeny should be Newick-formatted.

Launch with `hyphy -i`

Tutorial: http://hyphy.org/tutorials/CL-prompt-tutorial/

# Methods

Good explanations: https://stevenweaver.github.io/hyphy-site/methods/selection-methods

## BUSTED

BUSTED (Branch-Site Unrestricted Statistical Test for Episodic Diversification) provides a gene-wide (not site-specific) test for positive selection by asking whether a gene has experienced positive selection at at least one site on at least one branch.

<ul>
    <li>Can specify which branches to test or do all of them.</li>
    <li> It is gene-wide, not site-specific, so probably not desired for our purposes?</li>
</ul>


## FEL and SLAC

Infer dN and dS substitutions rates on a per-site basis. Assume that selection pressure for each site is constant along the entire phylogeny. This is probably not true in this case because the Nipah virus sequences come from different organisms with different immune behavior. <b>How do we account for this without a temporal analysis?</b>

SLAC uses maximum-likelihood and counting approaches. FEL is a fixed effects model.  

# Fasta files can not have stop codons in them, so generate new files for them

In [5]:
G_seq = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse("G_no_stop_codons.fasta", "fasta")]
P_seq = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse("P_no_stop_codons.fasta", "fasta")]

In [66]:
P_nonsense = [(seq_record.id, seq_record.seq) for seq_record in SeqIO.parse("P_nonsense.fasta", "fasta")][0]

In [60]:
len("MEPDIKSISSESMEGVSDFSPSSWENGGYLDKVEPEIDENGSMIPKYKIYTPGANERKYNNYMYLICYGFVEDVERTPETGKRKKIRTIAAYPLGVGKSASHPQDLLEELCSLKVTVRRTAGSTEKVVFGSSGPLNHLVPWKKVLTGGSIFNAVKVCRNVDQIQLDKHQALRIFFLSITKLNDSGIYMIPRTMLEFRRNNAIAFNLLVYLKIDADLSKMGIQGSLDKDGFKVASFMLHLGNFVRRAGKYYSVDYCRRKIDRMKLQFSLGSIGGLSLHIKINGVISKRLFAQMGFQKNLCFSLMDINPWLNRLTWNNSCEISRVAAVLQPSVPREFMIYDDVFIDNTGRILKG")

352

In [61]:
list(list(zip(*P_seq))[0]).index("MK575063")

14

In [62]:
len("ATGGATAAATTGGAACTAGTTAATGATGGCCTCAATATTATTGACTTTATTCAGAAGAACCAAAAAGAAATACAGAAGACATACGGACGATCAAGCATTCAACAACCCAGCATCAAAGACCGAACAAAAGCATGGGAGGATTTTCTGCAGTGCACCAGTGGAGAATCTGAACAAGTTGAGAGGGGAATGTCTAAGGATGATGGAGGTGTTGAAAGAAGAAGCTTGGAGGATCTATCCAGTGCTTCTCCCACAGATGGAACTATTGGAAAAAGAGTGTCGAACACCCGTGACTGGGCAGAAGGTTCAGATGACATACAACTGGACCCAGTGGTTACAGACGTTGTATACCATGATCATGGAGGAGAATGTACCGGATATGGATTTACTTCAAGCCCTGAGAGAGGGTGGAGTGATCACTCATCAGGAGCAAACAATGGGGATGTATGTCTTGTATCTGATGCAAAGGTGCTGTCCTATGCTCCCGAAATTGCAGTTTCTAAAGAAGATCGGGAAACTGATCTAGTTCACCTTGAGGACAAACTATCTGCTACAGGACTGAACCCTACAGCAATACCATTCACTCCAAAAAATCTGTCTGTTCCTGCAAAAGATTCTCCTGTGATTGCTGAACACTACTACGGGCTAGGAGTTAGAGAGCAAAACGTTGATCCCCAGACTAACAGAAATGTCAATTTGGACAGCATCAAATTGTACACATCAGATGACGAAGAGGCAGACCAGCTCGAATTTGAGGATGAGTTTGCAGGGAGTTCAAGTGAAGTGATAGTCGGCATTTCTCCTGAAGAGGAAGAGCCTTCAAGTGCTGGCAGGAAACCTATTGAATCCGTTGGACATATAATTGAGGGCCAGTCAACTCGAGACAGCCTCCAAATTAAGGACAACAAGCCGGCAGATGCACCAGGAGCAGGACCGAGAGATTCGGCAGTGAAGGAAAAATCACCCCAGAAGAGGCTGCCTATGTTAGCAGAAGAGTTTGAATGCTCTGGATCTGAAGACCCAATTATCCAAGAGTTGCTGAAAGAGAATTCATTCATAAATAGTCAACAAGGG")

1071

In [63]:
1071/3

357.0

In [37]:
#import re
#list(re.finditer("TAA", str(P_seq[14][1])))

In [42]:
n = 3
codons = [str(P_seq[14][1])[i:i+n] for i in range(0, len(P_seq[14][1]), n)]

In [44]:
np.where(codons=="TAA")

(array([], dtype=int64),)

In [46]:
codons.index("TAA")

357

In [47]:
357*3

1071

In [48]:
P_seq[14][1][1071:1074]

Seq('TAA')

In [56]:
len(str(P_seq[14][1])[:1071] + "-"*(2127-1071))

2127

In [57]:
str(P_seq[14][1])[:1071] + "-"*(2127-1071)

'ATGGATAAATTGGAACTAGTTAATGATGGCCTCAATATTATTGACTTTATTCAGAAGAACCAAAAAGAAATACAGAAGACATACGGACGATCAAGCATTCAACAACCCAGCATCAAAGACCGAACAAAAGCATGGGAGGATTTTCTGCAGTGCACCAGTGGAGAATCTGAACAAGTTGAGAGGGGAATGTCTAAGGATGATGGAGGTGTTGAAAGAAGAAGCTTGGAGGATCTATCCAGTGCTTCTCCCACAGATGGAACTATTGGAAAAAGAGTGTCGAACACCCGTGACTGGGCAGAAGGTTCAGATGACATACAACTGGACCCAGTGGTTACAGACGTTGTATACCATGATCATGGAGGAGAATGTACCGGATATGGATTTACTTCAAGCCCTGAGAGAGGGTGGAGTGATCACTCATCAGGAGCAAACAATGGGGATGTATGTCTTGTATCTGATGCAAAGGTGCTGTCCTATGCTCCCGAAATTGCAGTTTCTAAAGAAGATCGGGAAACTGATCTAGTTCACCTTGAGGACAAACTATCTGCTACAGGACTGAACCCTACAGCAATACCATTCACTCCAAAAAATCTGTCTGTTCCTGCAAAAGATTCTCCTGTGATTGCTGAACACTACTACGGGCTAGGAGTTAGAGAGCAAAACGTTGATCCCCAGACTAACAGAAATGTCAATTTGGACAGCATCAAATTGTACACATCAGATGACGAAGAGGCAGACCAGCTCGAATTTGAGGATGAGTTTGCAGGGAGTTCAAGTGAAGTGATAGTCGGCATTTCTCCTGAAGAGGAAGAGCCTTCAAGTGCTGGCAGGAAACCTATTGAATCCGTTGGACATATAATTGAGGGCCAGTCAACTCGAGACAGCCTCCAAATTAAGGACAACAAGCCGGCAGATGCACCAGGAGCAGGACCGAGAGATTCGGCAGTGAAGGAAAAATCACCCCAGAAGAGGCTGCCTATGTTAGCAGAAGAGTTTGAA

In [38]:
1054/3

351.3333333333333

In [25]:
len(P_seq[14][1])

2127

In [None]:
hyphy/P_no_stop_codons.fasta

In [8]:
for seq in P_seq:
    print(len(seq[1]))

2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127
2127


In [14]:
with open("../sequences/PG/P_deduplicated.fasta", "r") as og_file:
    
    with open("P_no_stop_codons.fasta", "w+") as new_file:
    
        for line in og_file:

            if ">" in line:
                new_file.write(line)
            else:
                
                # last character is new line, so don't include that and the last 3 nucleotides
                new_file.write(line[:-4] + "\n")

In [None]:
lengths = []

with open("P_no_stop_codons.fasta", "r") as file:
    
    for line in file:
        if ">" not in line:
            lengths.append(len(line)-1)
            
# should be 1806 for G, 2127 for P
np.unique(lengths)

array([2127])

array([1806])