In [2]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils import CodonUsage
from Bio.SeqUtils import IUPACData
import torch
import torchtext
from namedtensor import ntorch
from namedtensor.text import NamedField
from sklearn.ensemble import RandomForestClassifier
from nltk.util import ngrams
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re
# from google.colab import files
import pandas as pd
import math
import random
# Use the CPU for the baseline. To use the first CUDA GPU when one is
# available, replace the right-hand side with:
#   torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")
from collections import defaultdict

In [3]:
# Our input $x$: a nucleotide string, split into fixed-width codon tokens.
def tokenize(x, codon_size=3):
    """Split a sequence string into codon tokens, prefixed with a start token.

    Args:
        x: Sequence string to split.
        codon_size: Number of characters per token (default 3, i.e. one codon).

    Returns:
        A list whose first element is "<START>", followed by consecutive,
        non-overlapping chunks of exactly ``codon_size`` characters taken
        from left to right. A trailing chunk shorter than ``codon_size`` is
        dropped. Newline characters never become part of a token because the
        regex "." does not match them.

    Raises:
        ValueError: If ``codon_size`` is smaller than 1.
    """
    if codon_size < 1:
        raise ValueError("codon_size must be a positive integer")
    # Prepend with a start token
    return ["<START>"] + re.findall('.{%d}' % codon_size, x)
# Input field: sequences are tokenized with `tokenize`, lower-cased, and laid
# out along a single named dimension, "seqlen".
TEXT = NamedField(
    names=("seqlen",),
    sequential=True,
    lower=True,
    tokenize=tokenize,
)

# Amino-acid label field: same named dimension, lower-cased, default tokenizer.
AA_LABEL = NamedField(
    names=("seqlen",),
    lower=True,
)

In [13]:
# Path (relative to the notebook's working directory) of the FASTA file that
# the following cells parse with SeqIO.
file_name = "data/ecoli.heg.fasta"

In [14]:
# Reading-frame check: length of every record modulo 3 (0 means the record is
# a whole number of codons).
# NOTE: the name `sequences` is rebound to the sequence strings in the next cell.
sequences = [
    len(str(record.seq)) % 3
    for record in SeqIO.parse(file_name, "fasta")
]

In [15]:
# Load every record as a plain string, shuffle the rows once with a fixed
# seed, and write them to a single-column CSV with no header and no index.
sequences = [str(record.seq) for record in SeqIO.parse(file_name, "fasta")]
df = pd.DataFrame(sequences, columns=["sequence"]).sample(frac=1, random_state=1)
# df = df[:500]
df.to_csv("cds.csv", index=False, header=False)

In [16]:
# All records as plain strings, in the order they appear in the FASTA file.
seqs = [
    str(record.seq)
    for record in SeqIO.parse(file_name, "fasta")
]

In [17]:
# Dataset size summary: number of sequences, total number of bases, and the
# total divided by 3.
print(len(df))
# Iterate over the column directly; df.iterrows() would build a full Series
# object for every row only to read this one field.
num_bases = sum(len(sequence) for sequence in df["sequence"])
print("Bases, codons:" , num_bases, num_bases / 3)

253
Bases, codons: 264243 88081.0


In [18]:
# Keep only records whose length is a multiple of 3, then preview the
# tokenization of the first 21 codons (21 * 3 bases) of the first record.
sequences_ = [
    record.seq
    for record in SeqIO.parse(file_name, "fasta")
    if len(record.seq) % 3 == 0
]
preview = sequences_[0][:21 * 3]
tokenize(str(preview))

['<START>',
 'ATG',
 'AAA',
 'CGC',
 'ATT',
 'AGC',
 'ACC',
 'ACC',
 'ATT',
 'ACC',
 'ACC',
 'ACC',
 'ATC',
 'ACC',
 'ATT',
 'ACC',
 'ACA',
 'GGT',
 'AAC',
 'GGT',
 'GCG',
 'GGC']

In [19]:
# Enumerate all 4**3 = 64 codons over the four bases, first position varying
# slowest.
bases = "tcag"
codons = [
    first + second + third
    for first in bases
    for second in bases
    for third in bases
]

# Translate each codon to its one-letter symbol (stop codons come out as "*").
codon_to_aa = {codon: str(Seq(codon).translate()) for codon in codons}
aa = list(codon_to_aa.values())

# Build the amino-acid label vocabulary from the translated symbols.
AA_LABEL.build_vocab(aa)

In [20]:
# Display the codon -> amino-acid lookup table built in the previous cell.
codon_to_aa

{'ttt': 'F',
 'ttc': 'F',
 'tta': 'L',
 'ttg': 'L',
 'tct': 'S',
 'tcc': 'S',
 'tca': 'S',
 'tcg': 'S',
 'tat': 'Y',
 'tac': 'Y',
 'taa': '*',
 'tag': '*',
 'tgt': 'C',
 'tgc': 'C',
 'tga': '*',
 'tgg': 'W',
 'ctt': 'L',
 'ctc': 'L',
 'cta': 'L',
 'ctg': 'L',
 'cct': 'P',
 'ccc': 'P',
 'cca': 'P',
 'ccg': 'P',
 'cat': 'H',
 'cac': 'H',
 'caa': 'Q',
 'cag': 'Q',
 'cgt': 'R',
 'cgc': 'R',
 'cga': 'R',
 'cgg': 'R',
 'att': 'I',
 'atc': 'I',
 'ata': 'I',
 'atg': 'M',
 'act': 'T',
 'acc': 'T',
 'aca': 'T',
 'acg': 'T',
 'aat': 'N',
 'aac': 'N',
 'aaa': 'K',
 'aag': 'K',
 'agt': 'S',
 'agc': 'S',
 'aga': 'R',
 'agg': 'R',
 'gtt': 'V',
 'gtc': 'V',
 'gta': 'V',
 'gtg': 'V',
 'gct': 'A',
 'gcc': 'A',
 'gca': 'A',
 'gcg': 'A',
 'gat': 'D',
 'gac': 'D',
 'gaa': 'E',
 'gag': 'E',
 'ggt': 'G',
 'ggc': 'G',
 'gga': 'G',
 'ggg': 'G'}

In [21]:
# Read the exported CSV back as a torchtext dataset with a single column,
# "sequence", processed by the TEXT field.
my_data = torchtext.data.TabularDataset(
    "cds.csv",
    format="CSV",
    fields=[("sequence", TEXT)],
)

In [22]:
# The rows were already shuffled before being written to the CSV. Seeding
# Python's RNG and handing its state to split() keeps the 80/20 train/test
# partition the same on every run.
import random

random.seed(0)
split_state = random.getstate()
train, test = my_data.split(split_ratio=0.8, random_state=split_state)

In [50]:
# Show the dimension names attached to the "sequence" field (the `names`
# argument given to NamedField when TEXT was created).
train.fields['sequence'].names

('seqlen',)

In [53]:
# Build the TEXT vocabulary from the training split only; the test split is
# not passed in.
TEXT.build_vocab(train)

In [56]:
from torchtext.data import Iterator, BucketIterator

# we don't split genes
# Both splits are served in batches of 10 examples; the sort key is the number
# of tokens in each example's sequence.
train_iter_bucket, test_iter_bucket = BucketIterator.splits(
    (train, test),
    batch_sizes=(10, 10),
    sort_within_batch=False,
    sort_key=lambda example: len(example.sequence),
    device=torch.device(device),
)

In [57]:
def output_list_of_res(res, outfile = "test.txt"):
    """Write decoded nucleotide sequences to ``outfile``, one gene per line.

    ``res`` is an iterable of batches. Each batch must expose
    ``shape["batch"]`` and support indexing with ``{"batch": i}``; every
    selected row is decoded with ``translate_to_seq`` and written followed by
    a newline. The file is opened in write mode, so existing content is
    replaced.
    """
    with open(outfile, "w") as out_handle:
        for batch in res:
            batch_size = batch.shape["batch"]
            rows = (batch[{"batch": row}] for row in range(batch_size))
            out_handle.writelines(translate_to_seq(item) + "\n" for item in rows)

In [62]:
def translate_to_seq(x, vocab=None):
    """Decode one row of vocabulary indices into an upper-case sequence string.

    Args:
        x: A single tensor with the one named dimension "seqlen"; its
            ``values`` attribute is iterated to obtain vocabulary indices.
        vocab: Object with an ``itos`` index-to-token sequence. Defaults to
            ``TEXT.vocab`` so existing callers are unaffected.

    Returns:
        The concatenated tokens, cut at the first "<pad>" and then at the
        first "<unk>", with everything up to and including the first
        "<start>" removed (if present), converted to upper case.
    """
    if vocab is None:
        # Fall back to the notebook-level field's vocabulary.
        vocab = TEXT.vocab
    itos = vocab.itos
    decoded = "".join(itos[index] for index in x.values)

    # Anything from the first padding or unknown token onward is discarded.
    for stop_token in ("<pad>", "<unk>"):
        decoded = decoded.split(stop_token)[0]

    # Keep only the text that follows the (first) start token.
    if "<start>" in decoded:
        decoded = decoded.split("<start>")[1]

    return decoded.upper()

def output_iterator_to_file(iter_, outfile="iterator_output.txt"): 
    """Write every sequence yielded by a batch iterator to ``outfile``.

    For each batch from ``iter_``, each row of ``batch.sequence`` (selected
    along the "batch" dimension) is decoded with ``translate_to_seq`` and
    written on its own line. The file is opened in write mode, so existing
    content is replaced.
    """
    with open(outfile, "w") as out_handle:
        for batch in iter_:
            sequence_batch = batch.sequence
            for row in range(sequence_batch.shape["batch"]):
                decoded = translate_to_seq(sequence_batch[{"batch": row}])
                out_handle.write(decoded + "\n")

In [63]:
# Decode every batch from the training iterator and write the sequences, one
# per line, to ecoli_heg_train.txt.
output_iterator_to_file(train_iter_bucket, "ecoli_heg_train.txt")

In [64]:
# Decode every batch from the test iterator and write the sequences, one per
# line, to ecoli_heg_test.txt.
output_iterator_to_file(test_iter_bucket, "ecoli_heg_test.txt")