# Transforming a pd.Series of sequences into a SciPy sparse matrix of the k-mers found in each sequence

## Generating dictionary of k-mers

See [Working with sequences: k-mers](Working_With_Sequences_K_Mers) for how the dictionary is built.

In [2]:
import itertools

In [3]:
def ProduceListOfAllPossibleKMers(length):
    """Return every string of the given length over the alphabet A, C, G, T.

    The strings are listed in the order in which itertools.product
    enumerates the letter tuples of 'ACGT'.
    """
    letter_tuples = itertools.product('ACGT', repeat=length)
    return list(map(''.join, letter_tuples))

def KMerListToDict(k_mer_list):
    """Map every k-mer of the list to its 0-based position in that list.

    If a k-mer occurs more than once, the position of its last occurrence wins.
    """
    positions = range(len(k_mer_list))
    return {kmer: position for kmer, position in zip(k_mer_list, positions)}

def ProduceDictOfAllPossibleKMers(length):
    """Build the k-mer -> index dictionary for all k-mers of the given length.

    Combines ProduceListOfAllPossibleKMers (enumeration) with
    KMerListToDict (index assignment).
    """
    return KMerListToDict(ProduceListOfAllPossibleKMers(length))

In [4]:
# Producing dictionary of 6-mers, as an example:
# it maps each of the 4**6 = 4096 possible hexamers to an integer index 0..4095.
hexamer_dict = ProduceDictOfAllPossibleKMers(6)

## Loading sequences

In [11]:
import pandas as pd
import numpy as np

In [96]:
# Opening sequences previously stored in h5 format.
# mode='r' makes a wrong path fail immediately instead of silently creating
# an empty file, and the context manager closes the store even if the read fails.
with pd.HDFStore('100000_random_50nt_sequences.h5', mode='r') as store:
    df = store['data']

In [97]:
df.head()

Unnamed: 0,Bin,canonical
290826,14,TACTGATATACCTCTCCGTTAAGGATACGCCGTAAGCTCCATTGGT...
597807,10,CGAGCACGAATACTGGTTCACGAAGTAGGATTCCCATTTTAATCAG...
914453,11,AAATTAAGGCATACAGTAAACGACTTAAATCGAACACCGGTCTTGC...
377378,5,AAAGGGCAACTGTGGAAGCGGAGGGTAGACTGGTAAATAACATCGT...
1470839,9,CGCTAGATTTCACTGTGCTTTGGGTACGAAACCCAGCATATTCGCC...


In [98]:
# The current index is meaningless, so it is replaced by a fresh 0..n-1 range.
# drop=True discards the old index directly, so this also works when the
# index has a name (the old two-step version assumed a column called 'index').
df = df.reset_index(drop=True)

In [99]:
df.head()

Unnamed: 0,Bin,canonical
0,14,TACTGATATACCTCTCCGTTAAGGATACGCCGTAAGCTCCATTGGT...
1,10,CGAGCACGAATACTGGTTCACGAAGTAGGATTCCCATTTTAATCAG...
2,11,AAATTAAGGCATACAGTAAACGACTTAAATCGAACACCGGTCTTGC...
3,5,AAAGGGCAACTGTGGAAGCGGAGGGTAGACTGGTAAATAACATCGT...
4,9,CGCTAGATTTCACTGTGCTTTGGGTACGAAACCCAGCATATTCGCC...


In [100]:
df.shape

(100000, 2)

There are 100 000 random 50 nt sequences in the column 'canonical'. The column 'Bin' is a parameter measured in the experiment. Here 'canonical' is our X, and 'Bin' is our Y.

## Functions to transform single sequence to arrays of indices

Each k-mer occurrence is characterized by 3 values:
* k-mer index i - the index of the k-mer in the dictionary of k-mers obtained above
* position p - the 0-based start position of the k-mer in the sequence. For example, in sequence '290826' the 6-mer TGATAT starts at position 3 (i.e. at the 4th nucleotide)
* value v - always 1; it is a flag indicating that the k-mer is present.

In [41]:
def SeqToKMers(seq, k, kmer_dict):
    """Locate every known k-mer in a single sequence.

    Parameters
    ----------
    seq : str
        Sequence made from A, C, G, T.
    k : int
        Desired k-mer length (expected to be a positive integer).
    kmer_dict : dict
        Dictionary of all k-mers and their integer indices.

    Returns
    -------
    i_arr, p_arr, v_arr : numpy.ndarray
        Three integer arrays of equal length, one entry per k-mer found:
        the k-mer index from ``kmer_dict``, the 0-based start position
        in ``seq``, and the value (always 1).

    Notes
    -----
    Windows containing a k-mer that is not a key of ``kmer_dict``
    (e.g. because of an unexpected character) are skipped. Only windows
    of full length ``k`` are examined, so a sequence shorter than ``k``
    yields three empty integer arrays.
    """
    # Initializing lists of k-mer indices i and positions p
    # They will be filled as we cycle through sequence positions
    i = []
    p = []

    # Cycling through the start positions that leave room for a complete k-mer.
    # Stopping at len(seq) - k avoids looking up truncated slices at the
    # end of the sequence (the range is empty when seq is shorter than k).
    for p_curr in range(len(seq) - k + 1):
        # Getting a k-mer
        kmer = seq[p_curr:p_curr + k]
        # Unknown k-mers are skipped explicitly; no exception is swallowed,
        # so genuine errors (and KeyboardInterrupt) still propagate.
        if kmer in kmer_dict:
            i.append(kmer_dict[kmer])
            p.append(p_curr)

    # Transforming resulting lists to numpy arrays.
    # The explicit integer dtype keeps empty results integer as well
    # (np.array([]) would otherwise default to float64).
    i_arr = np.array(i, dtype=int)
    p_arr = np.array(p, dtype=int)
    v_arr = np.ones(len(i), dtype=int)

    return i_arr, p_arr, v_arr

In [58]:
# Testing the function on the first sequence of the data frame:
# i - k-mer indices, p - k-mer positions, v - values
i, p, v = SeqToKMers(df.iloc[0].canonical, 6, hexamer_dict)

In [59]:
i

array([3192,  483, 1932, 3635, 2252,  817, 3269,  791, 3165,  375, 1501,
       1909, 3542, 1883, 3439, 1468, 1776, 3010, 3850, 3112,  163,  652,
       2609, 2246,  793, 3173,  406, 1627, 2412, 1456, 1730, 2825, 3111,
        157,  629, 2516, 1875, 3407, 1342, 1274, 1003, 4013, 3764, 2769,
       2884])

In [38]:
df.iloc[0].canonical

'TACTGATATACCTCTCCGTTAAGGATACGCCGTAAGCTCCATTGGTCACA'

In [39]:
hexamer_dict['TACTGA'], hexamer_dict['ACTGAT']

(3192, 483)

In [40]:
%timeit SeqToKMers(df.iloc[0].canonical, 6, hexamer_dict)

10000 loops, best of 3: 180 µs per loop


## Transform Series of sequences to SciPy sparse matrices

In [43]:
from scipy.sparse import csr_matrix

In [47]:
# Progress bar
from ipywidgets import FloatProgress
from IPython.display import display

In [85]:
def SeqSeriesToKMersIndices(ss, k, kmer_dict):
    """Turn a series of sequences into a sparse sequence-by-k-mer count matrix.

    Parameters
    ----------
    ss : pandas.Series
        Sequence series. Its index labels are used directly as matrix row
        numbers, so they must be non-negative integers (reset the index
        beforehand to get rows 0..n-1).
    k : int
        k-mer size.
    kmer_dict : dict
        Dictionary of k-mer indices; the indices are used as column numbers.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix of shape ``(max(ss.index) + 1, max(kmer_dict.values()) + 1)``.
        Entry ``[r, c]`` is the number of occurrences of the k-mer with
        index ``c`` in the sequence labelled ``r`` (repeated occurrences
        are summed when the matrix is built). An empty series gives a
        matrix with zero rows.

    Notes
    -----
    Sequence position information returned by SeqToKMers is not used here.
    A progress bar widget is displayed while the sequences are processed.
    """
    # One column per k-mer index in the dictionary
    n_kmers = max(kmer_dict.values()) + 1

    # Nothing to process: return an empty matrix instead of failing on
    # max() of an empty index further down
    if len(ss) == 0:
        return csr_matrix((0, n_kmers), dtype=int)

    # Row numbers are the index labels, so the largest label fixes the height
    n_rows = max(ss.index) + 1

    # Initializing progress bar
    progress = FloatProgress(min=0, max=len(ss))
    display(progress)

    # Initializing lists
    kmer_idx_list = []  # k-mer index (matrix column)
    value_list = []     # value (always 1 per occurrence)
    row_idx_list = []   # index of a sequence (matrix row)

    # Iterating through (index label, sequence) pairs; items() avoids a
    # label lookup per sequence and also copes with duplicate labels
    for seq_index, seq in ss.items():
        # Getting k-mer occurrence arrays (positions are not needed)
        i, _, v = SeqToKMers(seq, k, kmer_dict)
        kmer_idx_list.append(i)
        value_list.append(v)
        # Integer array repeating the sequence index once per occurrence
        row_idx_list.append(np.full(i.shape, seq_index, dtype=int))
        # Updating progress bar
        progress.value += 1

    # Stacking all array lists into single arrays. The explicit cast keeps
    # the result integer even if some sequence produced an empty
    # (float-typed) array.
    cols = np.concatenate(kmer_idx_list).astype(int)
    vals = np.concatenate(value_list).astype(int)
    rows = np.concatenate(row_idx_list)

    # Transforming to sparse matrix. The shape is passed explicitly, so no
    # dummy zero entry is needed to keep the correct dimensions.
    sparse = csr_matrix((vals, (rows, cols)), shape=(n_rows, n_kmers))

    return sparse

In [101]:
# Building the sequence-by-6-mer sparse matrix from the 'canonical' column
sparse_matrix = SeqSeriesToKMersIndices(df.canonical, 6, hexamer_dict)

In [102]:
sparse_matrix.shape

(100000, 4096)

In [103]:
df.shape

(100000, 2)

In [105]:
sparse_matrix.min()

0

In [106]:
sparse_matrix.max()

7