In [None]:
############################################################################################

from sklearn.model_selection import train_test_split
def df_strat_split(
    df_input, stratify_colname = 'Species',
    frac_train = 0.6, frac_val = 0.2, frac_test = 0.2,
    random_state = None):
    '''
    Splits a Pandas dataframe into three subsets
    (train, val, and test) following fractional ratios provided by the user,
    where each subset is stratified by the values in a specific column
    (each subset has the same relative frequency of the values in the column).
    It performs this splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification.
        Usually this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into
        train, val, and test data. (float fractions and sum = 1.0,
        checked with a small tolerance for floating-point rounding)
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.
        (simply sets a seed to the random generator)

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If the three fractions do not add up to 1.0, or if
        stratify_colname is not a column of df_input.
    '''
    # -----------------------------------------------------------------------------------
    # Compare with a tolerance instead of `!=`: in binary floating point a valid
    # request such as 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999, not 1.0.
    frac_tolerance = 1e-9
    if abs((frac_train + frac_val + frac_test) - 1.0) > frac_tolerance:
        raise ValueError(
            'fractions {}, {}, {} do not add up to 1.0'.format(
                frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError(
            '{} is not a column in the dataframe'.format(
                stratify_colname))
    # -----------------------------------------------------------------------------------
    X = df_input                     # contains all columns
    y = df_input[[stratify_colname]] # dataframe of just the column on which to stratify
    # split original dataframe into train and temp dataframes
    df_train, df_temp, _, y_temp = train_test_split(
        X, y, stratify = y,
        test_size = (1.0 - frac_train),
        random_state = random_state)
    # split the temp dataframe into val and test dataframes;
    # the test share is re-expressed relative to the size of the temp dataframe
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, _, _ = train_test_split(
        df_temp, y_temp, stratify = y_temp,
        test_size = relative_frac_test,
        random_state = random_state)
    # -----------------------------------------------------------------------------------
    # sanity check: every input row ended up in exactly one of the three splits
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test
    
############################################################################################

In [None]:
from itertools import product
import numpy as np

# function to generate all possible k-mers
def all_kmers(k, alphabet = "ACGT"):
    '''
    Lists every string of length k that can be formed from `alphabet`.

    Parameters
    ----------
    k : int
        Length of each generated string.
    alphabet : str
        Characters to combine (defaults to "ACGT").

    Returns
    -------
    list of str
        The strings in the order produced by itertools.product
        (the last position varies fastest).
    '''
    # one copy of the alphabet per position of the k-mer
    pools = [alphabet] * k
    return list(map(''.join, product(*pools)))
    
# Pre-build the k-mer vocabularies for k = 1..10 using all_kmers' default alphabet.
one_mers, two_mers, tri_mers, for_mers, fiv_mers = (all_kmers(k) for k in range(1, 6))
six_mers, sev_mers, eig_mers, nin_mers, ten_mers = (all_kmers(k) for k in range(6, 11))

In [None]:
import re

def allowed_k(segment, search=re.compile(r'^[AGTC]+\Z').search):
    '''
    Tells whether `segment` is a non-empty string made up only of
    the characters A, C, G and T.

    Parameters
    ----------
    segment : str
        Candidate k-mer.
    search : callable
        Matcher applied to `segment`; its result is converted to bool.
        The default is the bound search method of a regex compiled once,
        when the function is defined.

    Returns
    -------
    bool
    '''
    # The default pattern is anchored with \Z rather than $: $ also matches just
    # before a trailing newline, which would accept a segment ending in a newline.
    return bool(search(segment))

def k_mer_encode(seq, k_mers):
    '''
    Encodes a sequence as the normalized counts of the k-mers it contains.

    Parameters
    ----------
    seq : str
        Sequence to encode.
    k_mers : sequence of str
        Vocabulary of k-mers; k is taken from the length of its first entry.

    Returns
    -------
    numpy.ndarray of shape (n, 2)
        One row [index into k_mers, count / highest count] for every
        vocabulary entry that occurs in seq, ordered by index.
        An empty (0, 2) array when seq holds no window accepted by allowed_k.
    '''
    k = len(k_mers[0])
    # cut sequence in k-mers: slide a window of width k, keep windows accepted by allowed_k
    windows = (seq[start:start + k] for start in range(len(seq) - k + 1))
    chunks = [window for window in windows if allowed_k(window)]
    if not chunks:
        # np.amax raises on an empty array, so return an empty encoding instead
        return np.empty((0, 2))
    # counting all unique k-mers in the sequence and normalizing the count
    segments, counts = np.unique(chunks, return_counts = True)
    counts = counts / np.amax(counts)
    vocabulary = np.asarray(k_mers)
    indices = np.flatnonzero(np.isin(vocabulary, segments))
    # np.unique returns `segments` sorted, which need not be the order of k_mers;
    # look each matched k-mer up so that its count lines up with its index
    positions = np.searchsorted(segments, vocabulary[indices])
    # returning 2D array with [k_mer index, count]
    return np.column_stack((indices, counts[positions]))