# Transform sequence to numpy array

Each sequence arrives as a string over the letters A, C, G and T, and must be transformed into an array of shape [n, 4], where n is the length of the sequence and 4 is the number of possible nucleotides (columns ordered A, C, G, T).

In [1]:
import pandas as pd
import numpy as np

# Progress bar
from ipywidgets import FloatProgress
from IPython.display import display

In [2]:
def OneLetterToMatrix(matrix, letter):
    """
    Write the encoding of a single nucleotide letter into `matrix` in place.

    `matrix` is indexed by base position (A=0, C=1, G=2, T=3). For one of
    the upper-case letters 'A', 'C', 'G' or 'T' the matching entry is set
    to 1; the other entries are left untouched, so the caller is expected
    to pass in a zeroed row. For any other letter every entry is set to
    0.25.

    Returns the same `matrix` object that was passed in.
    """
    known_bases = ('A', 'C', 'G', 'T')

    if letter not in known_bases:
        # Unknown base: spread the weight evenly over all four columns.
        matrix[:] = 0.25
        return matrix

    # Position inside the tuple doubles as the column index.
    matrix[known_bases.index(letter)] = 1
    return matrix


def DNASeqToMatrix(sequence):
    """
    Encode a DNA sequence as a float array of shape (len(sequence), 4).

    Columns are ordered A, C, G, T. Each upper-case 'A', 'C', 'G' or 'T'
    becomes a one-hot row; any other character (including lower-case
    letters) becomes a row with 0.25 in every column. An empty sequence
    yields an array of shape (0, 4).
    """
    one_hot_rows = {
        'A': (1.0, 0.0, 0.0, 0.0),
        'C': (0.0, 1.0, 0.0, 0.0),
        'G': (0.0, 0.0, 1.0, 0.0),
        'T': (0.0, 0.0, 0.0, 1.0),
    }
    unknown_row = (0.25, 0.25, 0.25, 0.25)  # Unknown base

    rows = [one_hot_rows.get(letter, unknown_row) for letter in sequence]

    # Explicit reshape keeps the second axis at 4 even when `rows` is empty.
    return np.array(rows, dtype=float).reshape(len(rows), 4)


def SeriesSeqToMatrix(series):
    """
    Encode a pandas Series of DNA sequences as a single numpy array.

    Missing entries (NaN/None) are skipped. With X remaining sequences and
    Y the length of the longest one, the result has shape (X, Y, 4).
    Sequences shorter than Y keep all-zero rows after their last base.
    An empty series yields an array of shape (0, 0, 4).

    Each position is encoded by DNASeqToMatrix; the columns are:
    A - 0
    C - 1
    G - 2
    T - 3

    A notebook progress bar is displayed and advanced once per sequence.
    """

    # count() ignores missing values while positional access does not, so
    # drop them first; otherwise the loop would call len() on NaN and would
    # stop before reaching the last sequences.
    series = series.dropna()
    X = len(series)

    f = FloatProgress(min=0, max=X)
    display(f)

    # Plain-int maximum length (np.zeros rejects float/NaN dimensions);
    # fall back to 0 when there is nothing to encode.
    length = max(len(seq) for seq in series) if X else 0

    arr = np.zeros((X, length, 4))

    for i, seq in enumerate(series):
        arr[i, :len(seq), :] = DNASeqToMatrix(seq)
        f.value += 1

    return arr

In [3]:
# Load the stored table under the 'data' key. The context manager closes
# the HDF5 file even if reading the key raises.
with pd.HDFStore('100000_random_50nt_sequences.h5') as store:
    df = store['data']

In [4]:
# Replace the existing index with a fresh 0..n-1 range and discard the old
# one. drop=True avoids materialising the old index as a column, so this
# also works when the index is named or an 'index' column already exists.
df = df.reset_index(drop=True)

In [5]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Bin,canonical
0,4,CATTCCGGAGAACAGACAGTCAACACGTTTACGGCCCTACGAATGG...
1,3,CTAGTCATCTTCTGGATTACTACGAATAAGGCAGGAACAGATACGG...
2,15,CTACAGTGTTCGCGAGCGCGTATAGAGATCGCTCAAACAAAGACCA...
3,9,ATCGAACCACATTTTCGAGCAAGTTAGTCGAATCGCCAAGTCAAAA...
4,13,GTACGGACTAATGTACTGGAGTAGATATAAGGATCCCGGGGCAAAA...


In [6]:
# Encode every sequence in the 'canonical' column into one array.
arr = SeriesSeqToMatrix(df.canonical)

In [7]:
# Display the encoded array.
arr

array([[[ 0.,  1.,  0.,  0.],
        [ 1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.],
        ..., 
        [ 0.,  0.,  0.,  1.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  1.,  0.]],

       [[ 0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  1.],
        [ 1.,  0.,  0.,  0.],
        ..., 
        [ 0.,  1.,  0.,  0.],
        [ 1.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.]],

       [[ 0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  1.],
        [ 1.,  0.,  0.,  0.],
        ..., 
        [ 1.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.]],

       ..., 
       [[ 1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  1.],
        ..., 
        [ 0.,  0.,  1.,  0.],
        [ 0.,  1.,  0.,  0.],
        [ 0.,  1.,  0.,  0.]],

       [[ 0.,  0.,  0.,  1.],
        [ 0.,  1.,  0.,  0.],
        [ 0.,  1.,  0.,  0.],
        ..., 
        [ 0.,  1.,  0.,  0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  1.,  0.,  0.]],

       