# Sequence to one-hot vectors
This notebook converts each FASTA sequence listed in the input file into a one-hot matrix and saves it as a `.npy` file.

In [61]:
import numpy as np
import pandas as pd
from Bio import SeqIO

# Directory prefix for the input FASTA files; '<name>.fasta' is appended to it.
sequence_path = '../processing/input_sequences/'
# Directory prefix for the output arrays; '<name>.sequence_one_hot.npy' is appended to it.
out_path = '../processing/sequences_one_hot/'
# Text file that is read line by line; each line supplies one input name.
input_list_file = '../processing/input_list.txt'

def seq_to_one_hot(sequence):
    """Encode an amino-acid sequence as a one-hot matrix.

    Args:
        sequence: A sized iterable of single-character residues (for example
            a ``str``). Residues are compared case-sensitively against the
            upper-case one-letter codes.

    Returns:
        An integer ``numpy.ndarray`` of shape ``(len(sequence), 20)``. Row
        ``i`` has a single 1 in the column of residue ``i`` and 0 elsewhere.
        An empty sequence yields an array of shape ``(0, 20)``.

    Raises:
        ValueError: If a residue is not one of the 20 supported amino acids.
    """
    # the aa order is the same used in psiblast pssm
    aa_order = 'ARNDCQEGHILKMFPSTWYV'
    aa_index = {aa: col for col, aa in enumerate(aa_order)}

    # Pre-allocating with an explicit 2-D shape keeps the result well-formed
    # even for an empty sequence.
    one_hot_vec = np.zeros((len(sequence), len(aa_order)), dtype=int)
    for pos, char in enumerate(sequence):
        col = aa_index.get(char)
        if col is None:
            # Explicit validation instead of `assert`, which is stripped
            # when Python runs with -O.
            raise ValueError(
                'unsupported residue {!r} at position {}; expected one of {}'
                .format(char, pos, aa_order)
            )
        one_hot_vec[pos, col] = 1
    return one_hot_vec

def fasta_to_seq(fasta_filein):
    """Read the single sequence stored in a FASTA file, upper-cased.

    Args:
        fasta_filein: Path or handle of a FASTA file, passed to Biopython.

    Returns:
        The upper-cased sequence (``record.seq.upper()``) of the only record.

    Raises:
        ValueError: If the file does not contain exactly one record.
    """
    # SeqIO.read enforces "exactly one record": it raises ValueError for an
    # empty file as well as for a multi-record file, so the check survives
    # `python -O` and an empty file no longer ends in an UnboundLocalError.
    record = SeqIO.read(fasta_filein, "fasta")
    return record.seq.upper()


# Convert every sequence named in the input list to a one-hot matrix and
# save it as '<out_path><name>.sequence_one_hot.npy'.
with open(input_list_file) as handle:
    for line in handle:
        input_name = line.rstrip()
        # A blank line (e.g. a trailing newline at the end of the list) would
        # otherwise become the bogus path '<sequence_path>.fasta'.
        if not input_name:
            continue
        sequence = fasta_to_seq(sequence_path + input_name + '.fasta')
        one_hot_vec = seq_to_one_hot(sequence)
        np.save(out_path + input_name + '.sequence_one_hot.npy', one_hot_vec)