In [2]:
import argparse
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.Seq import Seq
import glob
import gzip
from itertools import product
import numpy as np
import os, sys

from timeit import default_timer as timer
from functools import partial

In [3]:
arg_parser = argparse.ArgumentParser()

arg_parser.add_argument("directory", type=str, help='directory where chromosom.fa lives')
arg_parser.add_argument("length", type=int, help='kmer length')
arg_parser.add_argument("keysize", type=int, help='kmer key length for storing info')

args = arg_parser.parse_args()

PATH, WINDOW_SIZE, KEYSIZE = args.directory, args.length, args.keysize

usage: ipykernel_launcher.py [-h] directory length keysize
ipykernel_launcher.py: error: the following arguments are required: length, keysize


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
PATH, WINDOW_SIZE, KEYSIZE  =  "/wynton/home/ahituv/fongsl/dna/hg38/", 11, 4
path, window_size, keysize  = PATH, WINDOW_SIZE, KEYSIZE
OUTDIR = os.path.join(PATH, "kmers", f"{WINDOW_SIZE}mers")

In [5]:
OUTDIR = os.path.join(PATH, "kmers", f"{WINDOW_SIZE}mers")

print(OUTDIR)
# FUNCTIONS


def makeChrList():
    """
    return list of chromosome numbers
    """
    chr_list = list(np.arange(1,23))
    chr_list.extend(["X", "Y"])
    
    return chr_list


def makeKeys(keysize):
    """
    input
        keysize (int) - length of kmer-keys to create
    
    require
        itertools.product
        python list comprehension

    method
        itertools.product kmers -> join kmers into str -> make list


    return
        key_set (set) - list of strings. 
    """ 
    key_set = set()
    for item in product("ACGT", repeat=keysize):
        key_set.add("".join(item))
        
    return key_set

/wynton/home/ahituv/fongsl/dna/hg38/kmers/11mers


In [8]:
def countKeymerSpace(windowsize, path, key):
    """
    input
        universe_kmer - dictionary with the kmer universe    
        windowsize(int) - size of the kmer universe
        path(str) - path to dataframes
        key (str) - sequence key that kmers are split on

    method
        1. per key, 
            1.1 calculate the length of the value sequences 
            1.2 get set of all possible value sequences 
            1.3 make a new dictionary of value sequences associated with that key

        2. get all files with that key across chromosomes
        3. per file, read sub-kmer (i.e. kmer = key + sub-kmer) counts (val) associated with that key

        4. if key is in keymer universe, than sum add the new count to the sum. 

        6. write universe_key dictionary to file. 
        7. write the key to the chrlog
        8. delete chr based files


    """

    #1.1
    value_seq_len = windowsize - len(key) # get length of value sequence (wo key)
    
    #1.2
    value_seqs = makeKeys(value_seq_len)  # get set of all value sequence combos (len = windowsize-key) 
    
    #1.3
    key_universe = dict.fromkeys(value_seqs, 0)  # dict to sum sequence results across chromosomes

    #2
    query = os.path.join(path, f"*.{windowsize}mers.{key}.csv")
    FS = glob.glob(query)
    
    #3
    for F in FS:
        start = timer()
        
        with open(F, "r") as reader:
            for line in reader.readlines():
                start = timer()
                val_seq, count = line.strip("\n").split(",")
                
                #4
                key_universe[val_seq] +=int(count)
               
    
    #6
    writeDictionary(f"ALL", key, path, windowsize, key_universe)
    
    #7
    writeChrLog(key, path)
    
    #8 
    #os.system(f"rm {query}")
    
    
def writeDictionary(name, key, path, windowsize, result_dict):
    
    """
    write kmer universe dictionary to outfile, write nullomers
    
    input
        name (str) - chr number
        path (str) - path to write outfile
        windowsize (int) - size of window used for kmers
        results_dict (dictionary) - kmer universe w/ counts of kmer occurrences 
    
    method
        1. create the outfile
        2. write key, value as comma-separated str to EITHER nullomer or kmer outfile
        3. close the outfiles
        4. delete empty files
    
    return 
        out_file (str) - written file name
    
    """
    
    #1
    kmer_file = os.path.join(path, f"{name}.{key}.{windowsize}mers.csv")
    null_file = os.path.join(path, f"{name}.{key}.{windowsize}mers-nullomers.csv")
    
    
    #2
    nullomer = open(null_file, "w")
    with open(kmer_file, "w") as kmer:
        
        for value_seq, count in result_dict.items():
            seq = key+value_seq # put the full sequence back together
            
            if int(count) > 0: # if nullomer
                kmer.write(f"{seq},{count}\n")
            else:
                nullomer.write(f"{seq},{count}\n")
    #3
    kmer.close(), nullomer.close()
    
    #4

    #print("\n\nwrote", kmer_file)
    
    return kmer_file, null_file



    

In [7]:
def readChrLog(path):
    """
    read chr.log and determine whether chromosome has already been summed into file dataset
    
   input
        chr_num (str) - chromosome number
        path (str) - path to directory to write log
        
    method
        1. make the log file
            1.1 if log file does not exist, return False (none of the chromosomes have been run) 
        2. open existing log file. 
        3. append chr_num to list
        4. check whether input chr_num is in list:
            4.1 if yes, return True (chromosome has been run)
            4.2 if no, return False (chromos dome has not been run)
    
    return
        empty set
        chr_runlist (set) - if key has been added to final nullomer count
    """
    
    #1
    out = os.path.join(path, "chr.log")

    #2
    runlist=set()
    if os.path.exists(out) is True:
    
        with open(out, "r") as chrlog:
        #3
            for line in chrlog.readlines():
                runlist.add(str(line.split("\n")[0]))
        
        
    return runlist
       
            
            
def writeChrLog(chr_num, path):
    """
    write log of chrs written to the all mer
    
    input
        chr_num (str) - chromosome number
        path (str) - path to directory to write log
        
    method
        1. make the log file
        2. open the log file
        3. append the chr_num to the log file
        4. close the file
    """
    #1
    out = os.path.join(path, "chr.log")
    
    #2
    with open(out, "a") as chrlog:
        
        #3
        chrlog.write(f"{chr_num}\n")
    
    #4
    chrlog.close()

In [None]:
    
# MAIN

def main(argv):
    """
    count number of kmer occurrences per chromosome, write output to file
    
    method
 
        1. read log, check if you've run any keys already
        2. get set of keys to read. 
        3. if key has been run (set), take difference of key_set from run_already set. 
            Else, take difference from empty set (i.e. difference is nothing and full kmer_key set gets run) 
        4. for each kmer key, 
            4.1 Use partial function to run kmer keys separately in parallel as a pool
            4.2 Update kmer-key universe dict values w/ sum the number of instances across chromosomes
            4.3 Write the universe dictionary for every kmer key. 
            4.4 Write kmer key to log file (this line is inside the countKeymerSpace function). 
                Thus, only when kmer key is complete will universe be written and chrlog be updated. 
        5. concatenate nullomer, kmer files
    """
    

In [9]:
    #1
    run_already = readChrLog(OUTDIR) # check whether key has been run already

    #2 get set of kmer keyes based on keysize 
    key_set = makeKeys(KEYSIZE)

    #3
    key_set = key_set.difference(run_already)  # remove run already set. 

In [10]:
key_set

{'AAAA',
 'AAAC',
 'AAAG',
 'AAAT',
 'AACA',
 'AACC',
 'AACG',
 'AACT',
 'AAGA',
 'AAGC',
 'AAGG',
 'AAGT',
 'AATA',
 'AATC',
 'AATG',
 'AATT',
 'ACAA',
 'ACAC',
 'ACAG',
 'ACAT',
 'ACCA',
 'ACCC',
 'ACCG',
 'ACCT',
 'ACGA',
 'ACGC',
 'ACGG',
 'ACGT',
 'ACTA',
 'ACTC',
 'ACTG',
 'ACTT',
 'AGAA',
 'AGAC',
 'AGAG',
 'AGAT',
 'AGCA',
 'AGCC',
 'AGCG',
 'AGCT',
 'AGGA',
 'AGGC',
 'AGGG',
 'AGGT',
 'AGTA',
 'AGTC',
 'AGTG',
 'AGTT',
 'ATAA',
 'ATAC',
 'ATAG',
 'ATAT',
 'ATCA',
 'ATCC',
 'ATCG',
 'ATCT',
 'ATGA',
 'ATGC',
 'ATGG',
 'ATGT',
 'ATTA',
 'ATTC',
 'ATTG',
 'ATTT',
 'CAAA',
 'CAAC',
 'CAAG',
 'CAAT',
 'CACA',
 'CACC',
 'CACG',
 'CACT',
 'CAGA',
 'CAGC',
 'CAGG',
 'CAGT',
 'CATA',
 'CATC',
 'CATG',
 'CATT',
 'CCAA',
 'CCAC',
 'CCAG',
 'CCAT',
 'CCCA',
 'CCCC',
 'CCCG',
 'CCCT',
 'CCGA',
 'CCGC',
 'CCGG',
 'CCGT',
 'CCTA',
 'CCTC',
 'CCTG',
 'CCTT',
 'CGAA',
 'CGAC',
 'CGAG',
 'CGAT',
 'CGCA',
 'CGCC',
 'CGCG',
 'CGCT',
 'CGGA',
 'CGGC',
 'CGGG',
 'CGGT',
 'CGTA',
 'CGTC',
 'CGTG',
 

In [11]:
    partial_keymerSpace = partial(countKeymerSpace, WINDOW_SIZE, OUTDIR)

    #4 run partial to write kmer dictionaries summed across sequences. 
    [partial_keymerSpace(key) for key in key_set]
    

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [13]:
    os.chdir(OUTDIR)

    # concat KMERS
    os.system(f"cat ALL.*.{WINDOW_SIZE}mers.csv > ALL.{WINDOW_SIZE}mers-kmers.csv && gzip ALL.{WINDOW_SIZE}mers-kmers.csv")

    # concat Nullomers
    os.system(f"cat ALL.*.{WINDOW_SIZE}mers-nullomers.csv > ALL.{WINDOW_SIZE}mers-nullomers.csv && gzip ALL.{WINDOW_SIZE}mers-nullomers.csv")
    
    # delete supporting files. 
    #os.system(f"gzip ALL.*.{WINDOW_SIZE}mers.csv")
    #os.system(f"gzip ALL.*.{WINDOW_SIZE}mers-nullomers.csv")

0

In [12]:
if __name__ == '__main__':
    main(sys.argv[1:])

IndentationError: expected an indented block after 'if' statement on line 15 (2814680114.py, line 16)