In [1]:
import pandas as pd
import numpy as np
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import re
import matplotlib.pyplot as plt
import seaborn as sns

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
species = []
gene = []
seq = []

# Hand-rolled FASTA parse of the nucleotide file. Header lines are split on
# single spaces: the first field (minus the leading '>') is stored as the
# gene, the second field as the species. Every header opens a new record in
# `seq`; the sequence lines that follow are appended to that record.
with open('all_chordate_nucleotide.fasta') as infile:
    for line in infile:
        if line.startswith('>'):
            header_fields = line.split(' ')
            species.append(header_fields[1].strip())
            gene.append(header_fields[0][1:])
            # Start an empty record so species/gene/seq always have the same
            # length, even when a header is not followed by any sequence.
            seq.append('')
        elif re.match(r'\w', line) is not None:
            if not seq:
                raise ValueError('sequence data found before the first FASTA header')
            # Sequences wrapped over several lines are joined into one record
            # instead of producing one list entry per physical line.
            seq[-1] += line.strip()

In [3]:
# Collect the parsed species / gene / sequence lists into one table
# (the three lists must have equal length for this to succeed).
NucData = pd.DataFrame({'Species':species, 'Gene':gene, 'NucSeq':seq})

In [4]:
# Preview the first rows of the nucleotide table.
NucData.head()

Unnamed: 0,Species,Gene,NucSeq
0,Cataetyx_rubrirostris,ND1,ATGATATCCGCCCTTACCACTCACATCATTAACCCCCTTGCCTACA...
1,Cataetyx_rubrirostris,ND2,ATGAACCCCTACATTTTATCCACCCTATTATTTGGACTAGGCCTAG...
2,Cataetyx_rubrirostris,COX1,GTGGCAATCACACGCTGATTTTTCTCGACAAACCACAAAGATATTG...
3,Cataetyx_rubrirostris,COX2,ATGGCACATCCCTCACAACTAGGTTTCCAAGACGCGGCCTCACCCG...
4,Cataetyx_rubrirostris,ATP8,ATGCCTCAGCTCAACCCCGCCCCCTGACTTGCCATCCTCATCTTCT...


In [5]:
# The accumulator lists are reset here, so the values parsed from the
# nucleotide file are discarded (they already live in NucData).
species = []
gene = []
seq = []

# Hand-rolled FASTA parse of the amino-acid file. Header lines are split on
# single spaces: the first field (minus the leading '>') is stored as the
# gene, the second field as the species. Every header opens a new record in
# `seq`; the sequence lines that follow are appended to that record.
with open('all_chordate_aminoacids.fasta') as infile:
    for line in infile:
        if line.startswith('>'):
            header_fields = line.split(' ')
            species.append(header_fields[1].strip())
            gene.append(header_fields[0][1:])
            # Start an empty record so species/gene/seq always have the same
            # length, even when a header is not followed by any sequence.
            seq.append('')
        elif re.match(r'\w', line) is not None:
            if not seq:
                raise ValueError('sequence data found before the first FASTA header')
            # Sequences wrapped over several lines are joined into one record
            # instead of producing one list entry per physical line.
            seq[-1] += line.strip()

In [6]:
# Collect the parsed species / gene / protein-sequence lists into one table
# (the three lists must have equal length for this to succeed).
AminoData = pd.DataFrame({'Species':species, 'Gene':gene, 'AminoSeq':seq})

In [7]:
# Preview the first rows of the amino-acid table.
AminoData.head()

Unnamed: 0,Species,Gene,AminoSeq
0,Cataetyx_rubrirostris,ND1,MMSALTTHIINPLAYIVPVLLAVAFLTLLERKVLGYMQLRKGPNVV...
1,Cataetyx_rubrirostris,ND2,MNPYILSTLLFGLGLGTTITFASTHWLLAWMGLEINTLAIIPLMAQ...
2,Cataetyx_rubrirostris,COX1,MAITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGS...
3,Cataetyx_rubrirostris,COX2,MAHPSQLGFQDAASPVMEELLHFHDHALMIVFLISTLVLYIIVATV...
4,Cataetyx_rubrirostris,ATP8,MPQLNPAPWLAILIFSWLVFTTVMPPKILAHTFPNEPTTQSTEKPK...


In [8]:
# Pair every nucleotide record with its protein record. Species and Gene are
# the only columns the two tables share, so they are named as the join keys;
# rows present in just one of the tables are dropped (inner join).
df = pd.merge(NucData, AminoData, on=['Species', 'Gene'], how='inner')

In [9]:
# (rows, columns) of the merged table.
df.shape

(51523, 4)

In [10]:
# Preview the merged table: each row carries both NucSeq and AminoSeq.
df.head()

Unnamed: 0,Species,Gene,NucSeq,AminoSeq
0,Cataetyx_rubrirostris,ND1,ATGATATCCGCCCTTACCACTCACATCATTAACCCCCTTGCCTACA...,MMSALTTHIINPLAYIVPVLLAVAFLTLLERKVLGYMQLRKGPNVV...
1,Cataetyx_rubrirostris,ND2,ATGAACCCCTACATTTTATCCACCCTATTATTTGGACTAGGCCTAG...,MNPYILSTLLFGLGLGTTITFASTHWLLAWMGLEINTLAIIPLMAQ...
2,Cataetyx_rubrirostris,COX1,GTGGCAATCACACGCTGATTTTTCTCGACAAACCACAAAGATATTG...,MAITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGS...
3,Cataetyx_rubrirostris,COX2,ATGGCACATCCCTCACAACTAGGTTTCCAAGACGCGGCCTCACCCG...,MAHPSQLGFQDAASPVMEELLHFHDHALMIVFLISTLVLYIIVATV...
4,Cataetyx_rubrirostris,ATP8,ATGCCTCAGCTCAACCCCGCCCCCTGACTTGCCATCCTCATCTTCT...,MPQLNPAPWLAILIFSWLVFTTVMPPKILAHTFPNEPTTQSTEKPK...


In [12]:
# Translate every codon of every row and compare it with the annotated
# protein. TranslatedSeq ends up with one list per row of df; each list has
# one entry per residue: the translation when it equals the annotated
# residue, '---' otherwise.
TranslatedSeq = []
for i in range(df.shape[0]):
    # Work on local copies so this cell never modifies df itself.
    nuc_seq = df.NucSeq[i]
    amino_seq = df.AminoSeq[i]
    # When the protein starts with 'M' but the first codon is neither ATA
    # nor ATG, skip that residue together with its codon so the remaining
    # codons and residues stay aligned (same rule as check_codons).
    if amino_seq[:1] == 'M' and nuc_seq[:3] not in ['ATA', 'ATG']:
        amino_seq = amino_seq[1:]
        nuc_seq = nuc_seq[3:]
    TranslatedCodons = []
    for j in range(len(amino_seq)):
        # Residue j is encoded by nucleotides 3*j .. 3*j+2.
        codon = Seq(nuc_seq[3 * j:3 * j + 3], IUPAC.ambiguous_dna)
        try:
            a = codon.translate(table="Vertebrate Mitochondrial")
        except Exception:
            # A codon that cannot be translated counts as a mismatch.
            TranslatedCodons.append('---')
            continue
        if a == amino_seq[j]:
            TranslatedCodons.append(a)
        else:
            TranslatedCodons.append('---')
    # One result list per row (appended after the inner loop has finished).
    TranslatedSeq.append(TranslatedCodons)



In [36]:
# Take the nucleotide and protein sequences of the row labelled 1 for manual inspection.
nuc = df.NucSeq[1]
amino = df.AminoSeq[1]

In [37]:
# True when the nucleotide sequence is exactly three characters per residue.
len(nuc) == 3*len(amino)

True

In [38]:
# Cut the nucleotide string into consecutive 3-character chunks
# (the last chunk is shorter when len(nuc) is not a multiple of 3).
codons = [nuc[start:start + 3] for start in range(0, len(nuc), 3)]

In [39]:
# Keep the codons whose translation (vertebrate mitochondrial table) equals
# the residue at the same position of `amino`.
TrueCodons = []
for i, triplet in enumerate(codons):
    codon = Seq(triplet, IUPAC.ambiguous_dna)
    TranslatedCodon = codon.translate(table="Vertebrate Mitochondrial")
    if TranslatedCodon == amino[i]:
        TrueCodons.append(triplet)

In [51]:
# Scratch check: dropping the first element of a three-item list leaves two.
a = ['.'] * 3
a.pop(0)
a

['.', '.']

In [84]:
# Walk through the codon check by hand for the row labelled 2.
nuc = df.NucSeq[2]
amino = df.AminoSeq[2]
print(amino)
# Consecutive 3-character chunks of the nucleotide sequence.
codons = [nuc[start:start + 3] for start in range(0, len(nuc), 3)]
TrueCodons = []
# A leading 'M' whose codon is neither ATA nor ATG is dropped together with
# that codon, so the rest of the protein still lines up with the codons.
if amino[0] == 'M' and codons[0] not in ['ATA', 'ATG']:
    amino = amino[1:]
    codons = codons[1:]
    print(len(amino))
for i, residue in enumerate(amino):
    codon = Seq(codons[i], IUPAC.ambiguous_dna)
    if 'N' not in codon:
        TranslatedCodon = codon.translate(table="Vertebrate Mitochondrial")
        if TranslatedCodon != residue:
            continue
    # Reached for codons containing 'N' (accepted without translating) and
    # for codons whose translation equals the annotated residue.
    TrueCodons.append(codons[i])
len(TrueCodons)

MAITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGSLLGDDQIYNVIVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAISQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHVVAYYSGKKEPFGYMGMVWAMMAIGLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWDTPLLWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAAFVHWFPLFTGYTLHSAWTKIHFGVMFAGVNLTFFPQHFLGLAGMPRRYSDYPDAYTLWNTVSSIGSLISLLAVIMFLFILWEAFAAKREVMSVELTSTNVEWLHGCPPPYHTFEEPAFVQVQTN
515


515

In [85]:
def check_codons(nuc, amino):
    """Collect the codons of `nuc` that agree with the protein `amino`.

    `nuc` is cut into consecutive 3-character chunks. If `amino` begins with
    'M' while the first chunk is neither 'ATA' nor 'ATG', that residue and
    that chunk are both dropped before the comparison. Each remaining residue
    is then compared with the chunk at the same position: the chunk is kept
    when it contains 'N' (accepted without translating) or when its
    translation under the "Vertebrate Mitochondrial" table equals the residue.

    Returns:
        A tuple ``(kept_chunks, mod_amino)`` where ``kept_chunks`` is the list
        of accepted 3-character strings and ``mod_amino`` is `amino`, minus
        its first character when the start residue was dropped.

    Raises:
        IndexError: if `amino` is empty, or if there are fewer chunks than
            residues to compare.
    """
    triplets = [nuc[pos:pos + 3] for pos in range(0, len(nuc), 3)]

    if amino[0] == 'M' and triplets[0] not in ['ATA', 'ATG']:
        triplets = triplets[1:]
        mod_amino = amino[1:]
    else:
        mod_amino = amino

    kept = []
    for idx in range(len(mod_amino)):
        triplet = triplets[idx]
        as_seq = Seq(triplet, IUPAC.ambiguous_dna)
        # The `or` short-circuits, so chunks containing 'N' are never translated.
        if 'N' in as_seq or as_seq.translate(table="Vertebrate Mitochondrial") == mod_amino[idx]:
            kept.append(triplet)
    return (kept, mod_amino)

In [86]:
# Create the ModifiedAmino column, filled with 0..n-1 as placeholder values.
df['ModifiedAmino'] = list(range(df.shape[0]))

In [90]:
# Run check_codons on every row and collect three parallel lists:
#   Codons  - the accepted codons joined into one string
#   Quality - accepted codons / residues compared (0.0 when nothing could be compared)
#   mod_aa  - the protein sequence as returned by check_codons
Codons = []
Quality = []
mod_aa = []
for nuc_seq, amino_seq in zip(df.NucSeq, df.AminoSeq):
    try:
        a, aa = check_codons(nuc_seq, amino_seq)
    except IndexError:
        # check_codons raises IndexError for an empty protein or when the
        # nucleotide sequence has fewer codons than the protein has residues.
        # Record this row's own, unmodified protein: `aa` would still hold
        # the previous row's value here (or be undefined on the first row).
        Codons.append('')
        Quality.append(0.0)
        mod_aa.append(amino_seq)
        continue
    Codons.append(''.join(a))
    # `aa` is empty when the protein was a lone start residue that got
    # dropped; report 0.0 instead of dividing by zero.
    Quality.append(len(a) / len(aa) if aa else 0.0)
    mod_aa.append(aa)

In [91]:
# Attach the per-row results to the table. Assigning to 'ModifiedAmino'
# replaces whatever that column held before.
df['Codons'] = Codons
df['Quality'] = Quality
df['ModifiedAmino'] = mod_aa

In [92]:
# Preview the table with the Codons / Quality / ModifiedAmino columns.
df.head()

Unnamed: 0,Species,Gene,NucSeq,AminoSeq,Codons,Quality,ModifiedAmino
0,Cataetyx_rubrirostris,ND1,ATGATATCCGCCCTTACCACTCACATCATTAACCCCCTTGCCTACA...,MMSALTTHIINPLAYIVPVLLAVAFLTLLERKVLGYMQLRKGPNVV...,ATGATATCCGCCCTTACCACTCACATCATTAACCCCCTTGCCTACA...,1.0,MMSALTTHIINPLAYIVPVLLAVAFLTLLERKVLGYMQLRKGPNVV...
1,Cataetyx_rubrirostris,ND2,ATGAACCCCTACATTTTATCCACCCTATTATTTGGACTAGGCCTAG...,MNPYILSTLLFGLGLGTTITFASTHWLLAWMGLEINTLAIIPLMAQ...,ATGAACCCCTACATTTTATCCACCCTATTATTTGGACTAGGCCTAG...,1.0,MNPYILSTLLFGLGLGTTITFASTHWLLAWMGLEINTLAIIPLMAQ...
2,Cataetyx_rubrirostris,COX1,GTGGCAATCACACGCTGATTTTTCTCGACAAACCACAAAGATATTG...,MAITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGS...,GCAATCACACGCTGATTTTTCTCGACAAACCACAAAGATATTGGCA...,1.0,AITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGSL...
3,Cataetyx_rubrirostris,COX2,ATGGCACATCCCTCACAACTAGGTTTCCAAGACGCGGCCTCACCCG...,MAHPSQLGFQDAASPVMEELLHFHDHALMIVFLISTLVLYIIVATV...,ATGGCACATCCCTCACAACTAGGTTTCCAAGACGCGGCCTCACCCG...,1.0,MAHPSQLGFQDAASPVMEELLHFHDHALMIVFLISTLVLYIIVATV...
4,Cataetyx_rubrirostris,ATP8,ATGCCTCAGCTCAACCCCGCCCCCTGACTTGCCATCCTCATCTTCT...,MPQLNPAPWLAILIFSWLVFTTVMPPKILAHTFPNEPTTQSTEKPK...,ATGCCTCAGCTCAACCCCGCCCCCTGACTTGCCATCCTCATCTTCT...,1.0,MPQLNPAPWLAILIFSWLVFTTVMPPKILAHTFPNEPTTQSTEKPK...


In [93]:
# Count how many rows share each Quality value.
df.Quality.value_counts()

1.000000    51216
0.994220        9
0.000000        8
0.989011        5
0.988938        5
0.982301        3
0.991453        3
0.937500        3
0.966667        3
0.995595        3
0.982906        3
0.997368        3
0.977778        2
0.994012        2
0.946154        2
0.996183        2
0.993610        2
0.940397        2
0.979769        2
0.988235        2
0.998058        2
0.974359        2
0.995614        2
0.982659        2
0.960352        2
0.938356        2
0.986622        2
0.950000        2
0.998054        2
0.998363        2
            ...  
0.129310        1
0.983471        1
0.965116        1
0.966216        1
0.924419        1
0.946341        1
0.964286        1
0.928177        1
0.972603        1
0.956376        1
0.947674        1
0.950495        1
0.943750        1
0.969298        1
0.997361        1
0.940299        1
0.938811        1
0.914062        1
0.947883        1
0.978641        1
0.964444        1
0.099071        1
0.984899        1
0.986607        1
0.988506  

In [94]:
# Write the full table to disk. NOTE: the file is tab-separated (sep='\t')
# even though its name ends in .csv, so readers must pass the same separator.
df.to_csv('AllGenesCodons.csv', sep='\t')

In [95]:
# Display the final table.
df

Unnamed: 0,Species,Gene,NucSeq,AminoSeq,Codons,Quality,ModifiedAmino
0,Cataetyx_rubrirostris,ND1,ATGATATCCGCCCTTACCACTCACATCATTAACCCCCTTGCCTACA...,MMSALTTHIINPLAYIVPVLLAVAFLTLLERKVLGYMQLRKGPNVV...,ATGATATCCGCCCTTACCACTCACATCATTAACCCCCTTGCCTACA...,1.0,MMSALTTHIINPLAYIVPVLLAVAFLTLLERKVLGYMQLRKGPNVV...
1,Cataetyx_rubrirostris,ND2,ATGAACCCCTACATTTTATCCACCCTATTATTTGGACTAGGCCTAG...,MNPYILSTLLFGLGLGTTITFASTHWLLAWMGLEINTLAIIPLMAQ...,ATGAACCCCTACATTTTATCCACCCTATTATTTGGACTAGGCCTAG...,1.0,MNPYILSTLLFGLGLGTTITFASTHWLLAWMGLEINTLAIIPLMAQ...
2,Cataetyx_rubrirostris,COX1,GTGGCAATCACACGCTGATTTTTCTCGACAAACCACAAAGATATTG...,MAITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGS...,GCAATCACACGCTGATTTTTCTCGACAAACCACAAAGATATTGGCA...,1.0,AITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGSL...
3,Cataetyx_rubrirostris,COX2,ATGGCACATCCCTCACAACTAGGTTTCCAAGACGCGGCCTCACCCG...,MAHPSQLGFQDAASPVMEELLHFHDHALMIVFLISTLVLYIIVATV...,ATGGCACATCCCTCACAACTAGGTTTCCAAGACGCGGCCTCACCCG...,1.0,MAHPSQLGFQDAASPVMEELLHFHDHALMIVFLISTLVLYIIVATV...
4,Cataetyx_rubrirostris,ATP8,ATGCCTCAGCTCAACCCCGCCCCCTGACTTGCCATCCTCATCTTCT...,MPQLNPAPWLAILIFSWLVFTTVMPPKILAHTFPNEPTTQSTEKPK...,ATGCCTCAGCTCAACCCCGCCCCCTGACTTGCCATCCTCATCTTCT...,1.0,MPQLNPAPWLAILIFSWLVFTTVMPPKILAHTFPNEPTTQSTEKPK...
5,Cataetyx_rubrirostris,ATP6,ATGACATTAAGCTTCTTTGACCAATTTATGAGCCCCACATTTTTAG...,MTLSFFDQFMSPTFLGIPLMALALSLPWILFPAPTTRWLNNRLLTL...,ATGACATTAAGCTTCTTTGACCAATTTATGAGCCCCACATTTTTAG...,1.0,MTLSFFDQFMSPTFLGIPLMALALSLPWILFPAPTTRWLNNRLLTL...
6,Cataetyx_rubrirostris,COX3,ATGGCCCATCAAGCACACGCATACCACATAGTTGACCCCAGCCCCT...,MAHQAHAYHMVDPSPWPLSGAIAALLMTSGLAIWFHFHSSTLLTLG...,ATGGCCCATCAAGCACACGCATACCACATAGTTGACCCCAGCCCCT...,1.0,MAHQAHAYHMVDPSPWPLSGAIAALLMTSGLAIWFHFHSSTLLTLG...
7,Cataetyx_rubrirostris,ND3,ATGAATTTAACCACAACCGTCCTTATCATCACTGCCCTACTCTCCG...,MNLTTTVLIITALLSAILATVSFWLPQITPDHEKLSPYECGFDPVG...,ATGAATTTAACCACAACCGTCCTTATCATCACTGCCCTACTCTCCG...,1.0,MNLTTTVLIITALLSAILATVSFWLPQITPDHEKLSPYECGFDPVG...
8,Cataetyx_rubrirostris,ND4L,ATGACCCCCGCCCATTTCGCCTTCTCATCAGCCTTTGCCCTAGGCC...,MTPAHFAFSSAFALGLTGLAFHRTHLLSALLCLEGMMLSLFIALSL...,ATGACCCCCGCCCATTTCGCCTTCTCATCAGCCTTTGCCCTAGGCC...,1.0,MTPAHFAFSSAFALGLTGLAFHRTHLLSALLCLEGMMLSLFIALSL...
9,Cataetyx_rubrirostris,ND4,ATGCTAAAAATCCTCCTCCCAACACTTATGCTTGTCCCAACAACCT...,MLKILLPTLMLVPTTWLAPTKWLWPTTLTHSLIIALVSLTWLKSPA...,ATGCTAAAAATCCTCCTCCCAACACTTATGCTTGTCCCAACAACCT...,1.0,MLKILLPTLMLVPTTWLAPTKWLWPTTLTHSLIIALVSLTWLKSPA...
