In [300]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils import CodonUsage
from Bio.SeqUtils import IUPACData

import random
import csv
import numpy as np
from collections import Counter

from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [274]:
random.seed(0)

In [240]:
bases = "ATCG"
codons = [a + b + c for a in bases for b in bases for c in bases]
codon_dict = {}
for codon in codons:
    codon_dict[codon] = 0.0

In [368]:
backward_codon_dict = {}

for codon in codons:
    AA = str(Seq(codon).translate())
    backward_codon_dict[AA] = []

for codon in codons:
    AA = str(Seq(codon).translate())
    backward_codon_dict[AA].append(codon)

In [278]:
ecoli_train = np.loadtxt("../data/data_split/ecoli_heg_train.txt", dtype="str")
ecoli_test = np.loadtxt("../data/data_split/ecoli_heg_test.txt", dtype="str")

In [151]:
def seq_to_triplet(seq):
    ''' returns list of codons given NT sequence

        Args:
            str: NT sequence

        Returns:
            list: codons
    '''
    return [(seq[i:i+3]) for i in range(0, len(seq), 3)] 

def seqlist_to_triplets(seqs_NT):
    ''' returns list of list of codons given list of NT sequences

        Args:
            list(str): NT sequences

        Returns:
            list(list(str)): list of list of codons
    '''
    return [seq_to_triplet(seq) for seq in seqs_NT]

def NTlist_to_AA(seqs_NT):
    ''' returns list of AA sequences given list of NT sequences

        Args:
            list(str): NT sequences

        Returns:
            list(str): list of AA sequences
    '''
    return [str(Seq(seq).translate()) for seq in seqs_NT]

def pad_for_trigram(seqs):
    ''' returns padded list of list of sequences for trigrams

        Args:
            list(str): list of AA sequences

        Returns:
            list(list(char)): list of list of AA sequences, each padded left and right by 1 <s>, </s>
    '''
    padded_seqs = []
    for seq in seqs:
        padded_seq = list(pad_sequence(seq, pad_left=True, left_pad_symbol="<s>", 
                                    pad_right=True, right_pad_symbol="</s>", n=2))
        padded_seqs.append(padded_seq)
        
    return padded_seqs

def pad_for_fivegram(seqs):
    ''' returns padded list of list of sequences for fivegrams

        Args:
            list(str): list of AA sequences

        Returns:
            list(list(char)): list of list of AA sequences, each padded left and right by 2 <s>, </s>
    '''
    # we use adjusted_n = 2 for trigram, adjusted_n = 3 for 5 gram
    padded_seqs = []
    for seq in seqs:
        padded_seq = list(pad_sequence(seq, pad_left=True, left_pad_symbol="<s>", 
                                    pad_right=True, right_pad_symbol="</s>", n=3))
        padded_seqs.append(padded_seq)
        
    return padded_seqs

def unigram_dictionary(seq_list):
    ''' returns dictionary that maps unigram to most frequent codon

        Args:
            list(str): list of NT sequences

        Returns:
            dict(unigram:codon)
    '''
    seqs_AA = NTlist_to_AA(seq_list)
    seqs_codon = seqlist_to_triplets(seq_list)
    
    unigram_dict = {}
    
    for i in range(len(seqs_AA)):
        # this loop loops over the list of sequences
        seq_codon = seqs_codon[i]
        seq_AA_unigrams = list(ngrams(seqs_AA[i], n=1))
        for j in range(len(seq_AA_unigrams)):
            unigram = "".join(seq_AA_unigrams[j])
            if unigram in unigram_dict:
                unigram_dict[unigram].append(seq_codon[j])
            else:
                unigram_dict[unigram] = [seq_codon[j]]
    
    unigram_dict_frequency = {}
    
    for key in unigram_dict.keys():
        codon_counter = Counter(unigram_dict[key])
        total_count = 0
        for codon_key in codon_counter:
            total_count += codon_counter[codon_key]
        
        for codon_key in codon_counter:
            codon_counter[codon_key] = codon_counter[codon_key] / total_count

        unigram_dict_frequency[key] = codon_counter
    
    return unigram_dict_frequency

def trigram_dictionary(seq_list):
    seqs_AA = NTlist_to_AA(seq_list)
    seqs_codon = seqlist_to_triplets(seq_list)
    seqs_AA_padded = pad_for_trigram(seqs_AA)
    
    trigram_dict = {}
    
    for i in range(len(seqs_AA_padded)):
        # this loop loops over the list of sequences
        seq_codon = seqs_codon[i]
        seq_AA_trigrams = list(ngrams(seqs_AA_padded[i], n=3))
        for j in range(len(seq_AA_trigrams)):
            trigram = "".join(seq_AA_trigrams[j])
            if trigram in trigram_dict:
                trigram_dict[trigram].append(seq_codon[j])
            else:
                trigram_dict[trigram] = [seq_codon[j]]
    
    trigram_dict_frequency = {}
    
    for key in trigram_dict.keys():
        codon_counter = Counter(trigram_dict[key])
        total_count = 0
        for codon_key in codon_counter:
            total_count += codon_counter[codon_key]
        
        for codon_key in codon_counter:
            codon_counter[codon_key] = codon_counter[codon_key] / total_count

        trigram_dict_frequency[key] = codon_counter
    
    return trigram_dict_frequency


def fivegram_dictionary(seq_list):
    seqs_AA = NTlist_to_AA(seq_list)
    seqs_codon = seqlist_to_triplets(seq_list)
    seqs_AA_padded = pad_for_fivegram(seqs_AA)
    
    fivegram_dict = {}
    
    for i in range(len(seqs_AA_padded)):
        # this loop loops over the list of sequences
        seq_codon = seqs_codon[i]
        seq_AA_fivegrams = list(ngrams(seqs_AA_padded[i], n=5))
        for j in range(len(seq_AA_fivegrams)):
            fivegram = "".join(seq_AA_fivegrams[j])
            if fivegram in fivegram_dict:
                fivegram_dict[fivegram].append(seq_codon[j])
            else:
                fivegram_dict[fivegram] = [seq_codon[j]]
    
    fivegram_dict_frequency = {}
    
    for key in fivegram_dict.keys():
        codon_counter = Counter(fivegram_dict[key])
        total_count = 0
        for codon_key in codon_counter:
            total_count += codon_counter[codon_key]
        
        for codon_key in codon_counter:
            codon_counter[codon_key] = codon_counter[codon_key] / total_count

        fivegram_dict_frequency[key] = codon_counter
    
    return fivegram_dict_frequency

In [280]:
# split train into train/val to figure out the weights
ecoli_train_train, ecoli_train_test = train_test_split(ecoli_train, test_size=0.3)

In [310]:
# train frequency tables solely from the train set of train set
unigram_frequency_dict = unigram_dictionary(ecoli_train_train)
trigram_frequency_dict = trigram_dictionary(ecoli_train_train)
fivegram_frequency_dict = fivegram_dictionary(ecoli_train_train)

In [412]:
def predict_codons(AA_seq, backward_codon_dict, unigram_frequency_dict, trigram_frequency_dict, fivegram_frequency_dict, a=1.0, b=1.0, c=1.0):
    trigram_padded = list(pad_sequence(AA_seq, pad_left=True, left_pad_symbol="<s>", 
                                    pad_right=True, right_pad_symbol="</s>", n=2))
    
    fivegram_padded = list(pad_sequence(AA_seq, pad_left=True, left_pad_symbol="<s>", 
                                    pad_right=True, right_pad_symbol="</s>", n=3))
    
    unigrams = ["".join(ngram) for ngram in list(ngrams(AA_seq, n=1))]
    trigrams = ["".join(ngram) for ngram in list(ngrams(trigram_padded, n=3))]
    fivegrams = ["".join(ngram) for ngram in list(ngrams(fivegram_padded, n=5))]
    
    prediction_list = []
    
    for i in range(len(unigrams)): #loop over each AA in the sequence, generate a dictionary for each AA
        
        unigram = unigrams[i] #single amino acid at this position
        trigram = trigrams[i] #3 amino acids concatenated at this position
        fivegram = fivegrams[i] # 5 amino acids concatenated at this position
        prediction = {} #dict that maps possible codon (not all 20) to frequency
        
        for codon in backward_codon_dict[unigram]: #loop over possible codons given AA
            unigram_freq = 0.0
            trigram_freq = 0.0
            fivegram_freq = 0.0
            
            if unigram in unigram_frequency_dict and codon in unigram_frequency_dict[unigram]:
                unigram_freq = unigram_frequency_dict[unigram][codon]
                
            if trigram in trigram_frequency_dict and codon in trigram_frequency_dict[trigram]:
                trigram_freq = trigram_frequency_dict[trigram][codon]
            
            if fivegram in fivegram_frequency_dict and codon in fivegram_frequency_dict[fivegram]:
                fivegram_freq = fivegram_frequency_dict[fivegram][codon]
                
                
            prediction[codon] = a * unigram_freq + b * trigram_freq + c * fivegram_freq
            
        #softmax over possible codons    
        prediction_list.append(softmax(list(prediction.values())))
    
    return prediction_list

In [434]:
def calculate_loss(NT_seqs, backward_codon_dict, unigram_frequency_dict, trigram_frequency_dict, fivegram_frequency_dict, a=1.0, b=1.0, c=1.0):
    AA_seqs = NTlist_to_AA(NT_seqs)
    prediction_list = []
    correct_list = []
    loss = 0.0
    
    for AA_seq in AA_seqs:
        for prediction in predict_codons(AA_seq, backward_codon_dict, unigram_frequency_dict, trigram_frequency_dict, fivegram_frequency_dict, a, b, c):
            prediction_list.append(prediction)
            
    for NT_seq in NT_seqs:
        codon_seq = seq_to_triplet(NT_seq)
        for codon in codon_seq:
            AA = str(Seq(codon).translate())

            possible_codons = backward_codon_dict[AA]
            target_codon_dict = {}
            for possible_codon in possible_codons:
                target_codon_dict[possible_codon] = 0.0
                
            target_codon_dict[codon] = 1.0
            correct_list.append(list(target_codon_dict.values()))
    
#     print (prediction_list[:3])
    
    
    for i in range(len(correct_list)):
        print (correct_list[i])
        print (prediction_list[i])
        loss += log_loss(correct_list[i], prediction_list[i])
    
    return loss

In [435]:
loss = calculate_loss(ecoli_train_test, backward_codon_dict, unigram_frequency_dict, trigram_frequency_dict, fivegram_frequency_dict, a=1.0, b=1.0, c=1.0)

[1.0]
[1.]


ValueError: y_true contains only one label (1.0). Please provide the true labels explicitly through the labels argument.

In [None]:
min_loss = 10000000
curr_w1 = 0
curr_w2 = 0
curr_w3 = 0

for w1 in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    for w2 in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        loss = calculate_loss(ecoli_train_test, backward_codon_dict, unigram_frequency_dict, trigram_frequency_dict, fivegram_frequency_dict, a=w1, b=w2, c=1.0-w1-w2)
        if loss < min_loss:
            min_loss = loss
            curr_w1 = w1
            curr_w2 = w2
            curr_w3 = 1-w1-w2
        

In [438]:
trigram_frequency_dict['MEQ'].most_common()

[('GAG', 0.6666666666666666), ('GAA', 0.3333333333333333)]

In [441]:
def get_accuracy_trigram(ecoli_train, ecoli_test):
    unigram_frequency_dict = unigram_dictionary(ecoli_train)
    trigram_frequency_dict = trigram_dictionary(ecoli_train)
    
    AA_seqs = NTlist_to_AA(ecoli_test)
    AA_seqs_padded = pad_for_trigram(AA_seqs)
    codon_seqs = seqlist_to_triplets(ecoli_test)
    
    correct = 0
    total = 0
    
    for i in range(len(AA_seqs_padded)):
        AA_seq_trigrams = list(ngrams(AA_seqs_padded[i], n=3))
        for j in range(len(AA_seq_trigrams)):
            ngram_concat = "".join(AA_seq_trigrams[j])
            total += 1
            if ngram_concat in trigram_frequency_dict:
                pred = trigram_frequency_dict[ngram_concat].most_common()[0][0]
                if codon_seqs[i][j] == pred:
                    correct += 1
            else:
                pred = unigram_frequency_dict[AA_seqs_padded[i][j+1]].most_common()[0][0]
                if codon_seqs[i][j] == pred:
                    correct += 1
    
    return correct/total

In [447]:
def get_accuracy_fivegram(ecoli_train, ecoli_test):
    unigram_frequency_dict = unigram_dictionary(ecoli_train)
    fivegram_frequency_dict = fivegram_dictionary(ecoli_train)
    
    AA_seqs = NTlist_to_AA(ecoli_test)
    AA_seqs_padded = pad_for_fivegram(AA_seqs)
    codon_seqs = seqlist_to_triplets(ecoli_test)
    
    correct = 0
    total = 0
    
    for i in range(len(AA_seqs_padded)):
        AA_seq_fivegrams = list(ngrams(AA_seqs_padded[i], n=5))
        for j in range(len(AA_seq_fivegrams)):
            ngram_concat = "".join(AA_seq_fivegrams[j])
            total += 1
            if ngram_concat in fivegram_frequency_dict:
                pred = fivegram_frequency_dict[ngram_concat].most_common()[0][0]
                if codon_seqs[i][j] == pred:
                    correct += 1
            else:
                pred = unigram_frequency_dict[AA_seqs_padded[i][j+2]].most_common()[0][0]
                if codon_seqs[i][j] == pred:
                    correct += 1
    
    return correct/total

In [442]:
get_accuracy_trigram(ecoli_train, ecoli_test)

0.6347405216456037

In [448]:
get_accuracy_fivegram(ecoli_train, ecoli_test)

0.6232858295240656