In [2]:

import dask
import dask.dataframe as dd
import config_readwrite as crw
import gzip
from itertools import product, combinations
import numpy as np

import os, sys

In [3]:
NULLOMER, MER, NMUTS, config_tag = "AACGATACGCGC","11", "2", "config.hs1"

config, cfn = crw.read_config(os.path.join(os.getcwd(), config_tag))
NULLS = config[f"{MER}mer"]["nullomers"]
KMERS = config[f"{MER}mer"]["kmers"]
RE= config[f"{MER}mer"]["results"]
PATH=config[f"{MER}mer"]["path"]
MER, NMUTS, = int(MER), int(NMUTS)

In [4]:
#df  = dd.read_csv(os.path.join(PATH, "ALL.*.11mers.csv"), header=None)
name = (os.path.join(PATH, "11mer.parquet"))
df = dd.read_parquet(name)
df.head()

Unnamed: 0,kmers,counts
0,AAAAGTAGTTT,6616
1,AAAAGTACGCA,351
2,AAAAGGGTAAG,2659
3,AAAATGTATGA,7354
4,AAAAAAATATC,7234


# functions 

In [5]:
@dask.delayed
def genKmers(length):
    """
    return all sequence permutations, including repeats (AAAAA, GGGGG, CCCCC etc.)
    
    require
        itertools.product
    
    input
        length (int)
    
    return
        mers_list (list) - list of all nucleotide permutations 
    """
    print("generating kmer space length", length)
    
    mers = product("ACTG", repeat=length)
    
    mers_list = list("".join(i) for i in mers)
    
    return mers_list

@dask.delayed
def getPosLetterCombos(nmuts, sequence):
    """
    return combinations of (1) indices (2) mutated bases for mutating a sequence
    
    require 
        itertools
        
    inputs 
        nmuts (int) - number mutations to make
        sequence (str) - sequence to be mutated
        
    method
        1. get index combinations based on sequence length and number of mutations to make. This makes a map of all possible combinations of sequences to mutate
            1.1 combinations requires that each index is unique and non-redundant. 
                Order does not matter - 
                    e.g. (2,4,5) is the same as (5,2,4) because indexes 2, 4, and 5, will all be mutated.
                    
        2. get sequence product to mutate at indexes
            2.1 - product allows for repeats of the same base in different positions
        
    return
        mut_pos (list) - list of positional combinations to mutate
        mut_bases (list) - list of letter combinations to mutate
        
    
    """
    #print("making index combinations, nucletide permutations of length", nmuts)
    
    #1 index combinations
    mut_pos = list(combinations(np.arange(len(sequence)), nmuts))
    
    #2 nucleotide permutations per index combo. 
    mut_bases = list(product("ACGT", repeat=nmuts))
    
    return mut_pos, mut_bases

@dask.delayed
def buildSeqMut(sequence, mut_pos, mut_bases):
    
    """
    mutate sequence at position with letter
    multiple positions and letters can be inserted into the sequence simultaneously. 

    input
        sequence (str) - original sequence
        mut_pos (set) - Sets of single, tuple, threeple positional index(es) to mutate
        mut_bases (tuple) - Sets of single, tuple, threeple nucleotide combinations to mutate sequence to

    method
        1. instantiate seqs set to collect mutated sequences, add identity to seq set
        2. per positions to mutate in set
            2.1 per base combinations to mutate at these positions
        3. zip and iterate through positions and bases, mutating input sequence
        4. IF mut_seq != input sequence, then return. Else, skip

            
    return 
        seqs (set) - set of sequences with mutations 
    
    """

    
    #1
    seqs, mut_seq = set(), ""
    
    
    #2
    for pos in mut_pos:
        
        #2.1
        for letters in mut_bases:
            
            #3
            for p, l in zip(pos, letters):

                if mut_seq =="":
                    mut_seq = sequence[:p] + l + sequence[p + 1:]
                    
                else:
                    mut_seq = mut_seq[:p] + l + mut_seq[p + 1:]
        
            #4
            if mut_seq != sequence:  
                seqs.add(mut_seq)
                mut_seq = ""


            else:
                #print('no mut', sequence, mut_seq
                mut_seq = ""
                pass
            
    seqs.add(sequence)
    return seqs

In [6]:
def generateMismatchSpectra(seq, nmuts, kmer_spectra):
    
    """
    input 
        seq (str) - sequence to mutate and match with kmer-spectra
        nmuts (int) - max number of mismatches to mutate each kmer sequence by
        kmer_spectra (dict) - dictionary of kmer keys and their frequency count (value)

    require
        getPosLetterCombos function
        buildSeqMut function
        prettifySeq function
        
    method
        1. instantiate kmer_match, null_match list.
        
        2. per nullomer seq in dictionary

        3. get all combinations of indexes and mutated bases to try. 
            mutate bases will be a single, tuple, or threeple depending on the number of mutations desired.
            
        4. mutate all positions of the nullomer with that base. 

            4.1 - get mutated sequences using buildSeqMut function
                NOTE - mutBase function will remove identity sequence. See function method step 4.3.
        5. Collect kmer distribution for each base mismatch. 
        6. per mutated sequence 
            6.1 look up kmer count of the mutated sequence. Test whether it is a prime. 
            6.2 add any resurfaced nullomer sequences to the resurface dict 
                - that is, resurfaced nullomers are sequences where mutations create nullomers
                - return this dictionary
            6.3 add kmer count to the dictionary as pretty key
        7. Add seq kmer dictionary back into the collection dictionary. 

    
    return
        kmer_match (list) - all mis-match sequences w/ n muts that are kmers 
        null_match (list) - all mis-match sequences w/ n muts that are NOT kmers (therefor nullomers) 
            every mismatch combination for each sequence 
            and per mismatch combo, the kmer counts for each mismatch

    """
    print("making kmer mismatch spectra w/ N mutatiosn =", nmuts)
    #1
    kmer_match, null_match = set(), set()


    #3
    mut_pos, mut_bases = getPosLetterCombos(nmuts, seq)

    #4
    seqs = buildSeqMut(seq, mut_pos, mut_bases)

    #5

    #6
    for mut_seq in seqs:

        # 6.1 - kmer frequency
        if (kmer_spectra.kmers == mut_seq).any():
            kmer_match.add(mut_seq)

        # 6.2 - nullomer after mutations, not in kmer-verse
        else:  
            null_match.add(mut_seq)  # append to list
            
    len(kmer_match), len(null_match), len(seqs)

    return  kmer_match, null_match, seqs

# main 

In [7]:
null = []
with gzip.open(NULLS, "rt") as reader:
    for l in reader:
        null.append(l.strip("\n").split(",")[0])

In [8]:
def match_nullomers(nullomer_list, nmuts):
    for seq in nullomer_list:
        kmer_match, null_match = set(), set()

        print(nmuts, seq)
        #3
        mut_pos, mut_bases = getPosLetterCombos(nmuts, seq)

        
        #4
        seqs = buildSeqMut(seq, mut_pos, mut_bases)


        #6
        for mut_seq in seqs:

            # 6.1 - kmer frequency
            if (kmer_spectra.kmers == mut_seq).any():
                kmer_match.add(mut_seq)

            # 6.2 - nullomer after mutations, not in kmer-verse
            else:  
                null_match.add(mut_seq)  # append to list

        print(len(kmer_match), len(null_match), len(seqs))
        
    return kmer_match


In [9]:
dask.compute(match_nullomers(null, NMUTS))

2 ACCGATACGCG


TypeError: Delayed objects of unspecified length are not iterable

In [None]:
generateMismatchSpectra = dask.delayed(generateMismatchSpectra)

In [None]:
for n in null:
    out = generateMismatchSpectra(n, int(NMUTS), df)

In [None]:
print(out)

In [None]:
z.visualize()

In [None]:
f.head()

In [None]:
## import dependencies
from time import sleep
## calculate square of a number

def calculate_square(x):
    sleep(1)
    x= x**2
    return x

## calculate sum of two numbers
def get_sum(a,b):
    sleep(1)
    return a+b

In [None]:
%%time
## call functions sequentially, one after the other

## calculate square of first number
x = calculate_square(10)

## calculate square of second number
y = calculate_square(20)

## calculate sum of two numbers
z = get_sum(x,y)
print(z)

In [None]:
## import dask dependencies
import dask
from dask import delayed

In [None]:
%%time
## Wrapping the function calls using dask.delayed
x = delayed(calculate_square)(10)
y = delayed(calculate_square)(20)
z = delayed(get_sum)(x, y)
print(z)