In [84]:
############################################################################################

import math
import re
from itertools import product

import numpy as np
from sklearn.model_selection import train_test_split
def df_strat_split(
    df_input, stratify_colname = 'Species', 
    frac_train = 0.6, frac_val = 0.2, frac_test = 0.2, 
    random_state = None):
    '''
    Splits a Pandas dataframe into three subsets 
    (train, val, and test) following fractional ratios provided by the user, 
    where each subset is stratified by the values in a specific column 
    (each subset has the same relative frequency of the values in the column). 
    It performs this splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. 
        Usually this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into 
        train, val, and test data. (float fractions and sum = 1.0)
    random_state : int, None, or RandomState instance
        Value to be passed to both train_test_split() calls. 
        (simply sets a seed to the random generator)

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If the three fractions do not sum to 1.0 (within floating-point 
        tolerance), or if stratify_colname is not a column of df_input.
    '''
    # -----------------------------------------------------------------------------------
    # Compare with a tolerance: an exact `!= 1.0` test rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0):
        raise ValueError(
            'fractions {}, {}, {} do not add up to 1.0'.format(
                frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError(
            '{} is not a column in the dataframe'.format(
                stratify_colname))
    # -----------------------------------------------------------------------------------
    X = df_input                     # contains all columns
    y = df_input[[stratify_colname]] # dataframe of just the column on which to stratify
    # split original dataframe into train and temp dataframes
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify = y, 
        test_size = (1.0 - frac_train), 
        random_state = random_state)
    # split the temp dataframe into val and test dataframes;
    # the test share is expressed relative to the size of temp, not of the input
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp, y_temp, stratify = y_temp, 
        test_size = relative_frac_test, 
        random_state = random_state)
    # -----------------------------------------------------------------------------------
    # sanity check: every input row ended up in exactly one of the three splits
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test
    
############################################################################################

# function to generate all possible k-mers
def all_kmers(k, alphabet = "ACGT"):
    """Return every length-k string over `alphabet`, in itertools.product order.

    Parameters
    ----------
    k : int
        Length of each k-mer.
    alphabet : str
        Symbols to combine; each position of a k-mer draws from this string.

    Returns
    -------
    list of str
        All len(alphabet)**k combinations, with the last position varying fastest.
    """
    # one copy of the alphabet per k-mer position
    pools = (alphabet,) * k
    kmers = []
    for letters in product(*pools):
        kmers.append(''.join(letters))
    return kmers
# Pre-computed k-mer vocabularies for k = 2, 4, 6 and 8.
two_mers = all_kmers(2)
for_mers = all_kmers(4)
six_mers = all_kmers(6)
eig_mers = all_kmers(8)

def allowed_k(segment, search=re.compile(r'^[AGTC]+\Z').search):
    """Return True when `segment` is a non-empty string made only of A, G, T and C.

    Parameters
    ----------
    segment : str
        Candidate k-mer.
    search : callable
        Matcher applied to `segment`; its result is converted to bool.
        The default is a pre-compiled regex bound once at definition time.

    Returns
    -------
    bool
        False for the empty string and for any segment containing another
        character (ambiguity codes, lowercase letters, gaps, newlines).
    """
    # The pattern ends in \Z rather than $: `$` also matches just before a
    # trailing newline, which would let a segment such as 'AC\n' through.
    return bool(search(segment))

def k_mer_encode(seq, k_mers, max_len=1600):
    """Encode a sequence as rows of [scaled k-mer index, scaled k-mer count].

    Parameters
    ----------
    seq : str
        Sequence to encode.
    k_mers : list of str
        Vocabulary of k-mers; k is taken from the length of its first entry.
        The rows only line up if this list is in sorted order (as all_kmers
        produces for the alphabet "ACGT"), because np.unique returns the
        observed k-mers sorted while the indices are found in list order.
    max_len : int
        Upper bound used when computing the padded number of rows.

    Returns
    -------
    numpy.ndarray of shape (n, 2)
        One row per distinct valid k-mer found in `seq`: its position in
        `k_mers` divided by 4**k, and its count divided by the largest count.
        Rows of zeros are appended until there are min(4**k, max_len) rows.
        Encodings that already have more rows than that are returned as-is.
        A sequence with no valid k-mer yields an all-zero array.
    """
    k = len(k_mers[0])
    tot_len = min(4**k, max_len)
    # cut sequence in k-mers (sliding window, step 1), dropping windows rejected by allowed_k
    chunks = [seq[i:i+k] for i in range(0, len(seq) - k+1) if allowed_k(seq[i:i+k])]
    if not chunks:
        # Nothing to count (sequence shorter than k, or no window passed allowed_k):
        # np.amax would raise on the empty count array, so return pure padding.
        return np.zeros((tot_len, 2))
    # counting and regularizing all unique k-mers in the sequence
    segments, counts = np.unique(chunks, return_counts = True)
    counts = counts / np.amax(counts)
    # find and regularize the index of each k-mer
    indeces = np.where(np.isin(k_mers, segments))[0] / 4**k
    # returning 2D array with [k_mer index, count] rows
    arr = np.dstack((indeces, counts))[0]

    # pad with zero rows up to the target number of rows
    current_len = len(arr)
    if current_len < tot_len:
        arr = np.append(arr, np.zeros((tot_len - current_len, 2)), axis = 0)

    return arr

############################################################################################

# Dictionary without consideration of mutation rate
# Each value is a 4-element weight vector in the channel order [A, G, T, C].
one_hot_dict = {
    # single bases (U uses the same channel as T)
    'A': [1.,0.,0.,0.],
    'G': [0.,1.,0.,0.],
    'T': [0.,0.,1.,0.],
    'U': [0.,0.,1.,0.],
    'C': [0.,0.,0.,1.],
    # two-base ambiguity codes: weight split evenly over both channels
    'Y': [0.,0.,0.5,0.5],
    'R': [0.5,0.5,0.,0.],
    'W': [0.5,0.,0.5,0.],
    'S': [0.,0.5,0.,0.5],
    'K': [0.,0.5,0.5,0.],
    'M': [0.5,0.,0.,0.5],
    # three-base ambiguity codes: 0.33 on each of the three channels
    'D': [0.33,0.33,0.33,0.],
    'V': [0.33,0.33,0.,0.33],
    'H': [0.33,0.,0.33,0.33],
    'B': [0.,0.33,0.33,0.33],
    # any base: uniform over all four channels
    'X': [0.25,0.25,0.25,0.25],
    'N': [0.25,0.25,0.25,0.25],
    # gap / padding symbol: all zeros
    '-': [0.,0.,0.,0.],
    }

def one_hot_seq(sequence, one_hot_dict=one_hot_dict, max_len=1600):
    """Encode a nucleotide string as a 2D array with one row per position.

    Parameters
    ----------
    sequence : str
        Sequence to encode; every character must be a key of `one_hot_dict`.
    one_hot_dict : dict
        Maps each symbol to its row of weights.
    max_len : int
        Length the sequence is right-padded to with '-' before encoding.
        Sequences already at least this long are encoded at their own length.

    Returns
    -------
    numpy.ndarray
        Array built from the per-character rows looked up in `one_hot_dict`.
    """
    # pad on the right with the gap symbol up to max_len characters
    padded = sequence.ljust(max_len, '-')
    # look up the weight row of every symbol, in order
    rows = []
    for nucleotide in padded:
        rows.append(one_hot_dict[nucleotide])
    # stack the rows into a single numpy array
    return np.array(rows)

In [29]:
import matplotlib.pyplot as plt
from itertools import product
import statistics
import numpy as np
import pandas as pd
import re

import tensorflow as tf
import pathlib
import os
import matplotlib.pyplot as plt

np.set_printoptions(precision=4)

In [100]:
df = pd.read_csv('data/df_noRC.csv')
# Seed the draw so the sampled rows, the saved CSV and every downstream
# output stay the same across kernel restarts.
df_sample = df.sample(1000, random_state = 42)
df_sample.to_csv('test_df.csv', index = False)

In [101]:
# Encode every sampled sequence once per k-mer vocabulary (k = 2, 4, 6, 8).
# The generator is consumed in order by the unpacking, so each Series is
# computed with the vocabulary current at that step.
two, fur, six, eig = (
    df_sample['Sequence'].apply(lambda seq: k_mer_encode(seq, vocab))
    for vocab in (two_mers, for_mers, six_mers, eig_mers)
)

In [98]:
two

14694    [[0.0, 0.5620915032679739], [0.0625, 0.5098039...
56       [[0.0, 0.43452380952380953], [0.0625, 0.440476...
17408    [[0.0, 0.7655172413793103], [0.0625, 0.5931034...
13821    [[0.0, 0.8983050847457628], [0.0625, 0.7033898...
732      [[0.0, 0.4148936170212766], [0.0625, 0.3776595...
                               ...                        
8294     [[0.0, 0.6530612244897959], [0.0625, 0.5646258...
14224    [[0.0, 0.7333333333333333], [0.0625, 0.6074074...
21507    [[0.0, 0.6712328767123288], [0.0625, 0.5958904...
18069    [[0.0, 0.7482517482517482], [0.0625, 0.5734265...
864      [[0.0, 0.6174496644295302], [0.0625, 0.4899328...
Name: Sequence, Length: 100, dtype: object

In [99]:
one_hot_seq('ATCGATCGGGG')

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [66]:
4**6

4096

In [113]:
two.to_numpy()[0].tolist()[-500:]

[[0.0, 0.4968152866242038],
 [0.0625, 0.4267515923566879],
 [0.125, 0.5477707006369427],
 [0.1875, 0.267515923566879],
 [0.25, 0.4012738853503185],
 [0.3125, 0.535031847133758],
 [0.375, 0.732484076433121],
 [0.4375, 0.3821656050955414],
 [0.5, 0.5668789808917197],
 [0.5625, 0.732484076433121],
 [0.625, 1.0],
 [0.6875, 0.5414012738853503],
 [0.75, 0.27388535031847133],
 [0.8125, 0.3630573248407643],
 [0.875, 0.554140127388535],
 [0.9375, 0.25477707006369427]]

In [108]:
# Number of [index, count] rows in each 8-mer encoding. Iterating the series
# directly avoids hard-coding the sample size and rebuilding the numpy array
# on every loop step.
lens = [len(encoded) for encoded in eig]

In [109]:
print(min(lens), max(lens), statistics.mean(lens))

1600 1600 1600


In [56]:
1424 / 8

178.0

In [55]:
print(min(lens), max(lens), statistics.mean(lens))

708 1188 1084.67


In [57]:
1188 /6

198.0

In [59]:
print(min(lens), max(lens), statistics.mean(lens))

181 256 247.55


In [60]:
256 / 4

64.0

In [24]:
np.array(eig.to_list())

  np.array(ten.to_list())


array([array([[1.8820e+03, 5.0000e-01],
              [3.7110e+03, 5.0000e-01],
              [3.9910e+03, 5.0000e-01],
              ...,
              [1.0447e+06, 5.0000e-01],
              [1.0452e+06, 5.0000e-01],
              [1.0453e+06, 5.0000e-01]]),
       array([[6.0200e+02, 5.0000e-01],
              [1.3780e+03, 5.0000e-01],
              [1.8560e+03, 5.0000e-01],
              ...,
              [1.0426e+06, 5.0000e-01],
              [1.0444e+06, 5.0000e-01],
              [1.0459e+06, 5.0000e-01]]),
       array([[1.1600e+02, 5.0000e-01],
              [4.6400e+02, 5.0000e-01],
              [1.8560e+03, 5.0000e-01],
              ...,
              [1.0423e+06, 5.0000e-01],
              [1.0467e+06, 5.0000e-01],
              [1.0467e+06, 5.0000e-01]]),
       array([[6.0200e+02, 5.0000e-01],
              [1.4740e+03, 5.0000e-01],
              [1.8560e+03, 5.0000e-01],
              ...,
              [1.0357e+06, 5.0000e-01],
              [1.0369e+06, 5.0000e-01]