In [None]:
import ast
import numpy as np
import time
import sys
import pandas as pd
import subprocess
import pickle
import math
from collections import Counter

### Loading preliminary data

In [None]:
# Load the precomputed FLT3 k-mers from their pickle file.
# pickle executes code while loading, so this file must come from a trusted source.
with open('data/flt3_kmers', 'rb') as flt3_file:
    flt3_kmers = pickle.load(flt3_file)

In [None]:
# Calculate list of inv16 kmers from MYH11 and CBFB genes
def _read_exon_sequence(fasta_path):
    """Return every non-header line of a FASTA file, stripped and joined into one string."""
    with open(fasta_path) as f:
        # Lines starting with '>' are FASTA record headers, not sequence.
        return ''.join(line.strip() for line in f if not line.startswith('>'))


def _count_kmers(seq, k):
    """Count each length-k substring of seq; keys are in order of first occurrence.

    A sequence shorter than k yields an empty Counter.
    """
    # len(seq) - k + 1 windows: derived from k rather than a hard-coded 30.
    return Counter(seq[i:i+k] for i in range(len(seq) - k + 1))


k = 31

# MYH11 transcript
seq = _read_exon_sequence('data/NM_022844.2.exons.fa')
print(len(seq))
kmer_count = _count_kmers(seq, k)
myh11_kmers = list(kmer_count)

# CBFB transcript
seq = _read_exon_sequence('data/NM_001755.3.exons.fa')
print(len(seq))
kmer_count = _count_kmers(seq, k)
cbfb_kmers = list(kmer_count)

# Plain concatenation: MYH11 k-mers first, then CBFB k-mers (no de-duplication across genes).
inv16_kmers = myh11_kmers + cbfb_kmers

In [None]:
# Calculate list of npm1 kmers from sequence
with open('data/NM_001355006.1.exons.fa') as f:
    # Join all sequence lines; lines starting with '>' are FASTA headers.
    seq = ''.join(line.strip() for line in f if not line.startswith('>'))
print(len(seq))
k = 31
# Slide a window of length k over the sequence. The number of windows is
# len(seq) - k + 1, derived from k so the two cannot drift apart.
kmer_count = Counter(seq[i:i+k] for i in range(len(seq) - k + 1))
# Distinct k-mers in order of first occurrence.
npm1_kmers = list(kmer_count)

In [None]:
# Read every patient ID from the list file.
with open("data/patient_list.txt") as patient_file:
    # IDs are whitespace-separated tokens, so flatten the tokens of each line.
    patient_list = [patient_id for row in patient_file for patient_id in row.split()]

### Kmer table preparation

In [None]:
def kmc_to_str(kmc):
    """Render a collection of k-mers as a single space-separated string.

    Takes ``str(kmc)``, removes every comma and single quote, then drops the
    first and last characters (the enclosing brackets of the repr).
    """
    rendered = str(kmc)
    for unwanted in (",", "'"):
        rendered = rendered.replace(unwanted, "")
    return rendered[1:-1]

In [None]:
# Query all patients for specific kmers, and store in DataFrame
def query_np(patients, kms):
    """Query each patient's jellyfish database for `kms` and fill the global kmer_table.

    Reads the globals `start_i` (index of the first patient still to query),
    `num_kmers` (number of count columns) and `kmer_table` (one row per patient,
    `num_kmers` count columns plus a trailing flag column that is set to -1
    once the row has been queried).

    Rows are collected in a small buffer, written into `kmer_table` every
    BUFFER_SIZE patients (and after the last one), and the table is pickled to
    'data/temp_kmer_table.py' every 2*BUFFER_SIZE patients (and after the last one).

    Parameters:
        patients: sequence of patient IDs; querying starts at patients[start_i].
        kms: sequence of k-mers, queried CHUNK_SIZE at a time.

    Raises:
        RuntimeError: if jellyfish exits with a non-zero status, or if the number
            of counts returned for a patient differs from `num_kmers`.
    """
    global kmer_table
    global start_i
    global num_kmers
    i = start_i
    if i != 0:
        print(f"Restarting at patient #{i+1}")
    BUFFER_SIZE = 5
    CHUNK_SIZE = 100  # k-mers passed to a single jellyfish invocation
    buffer = np.zeros((BUFFER_SIZE, num_kmers+1))
    for p in patients[start_i:]:
        jf = f"/u/leucegene/data/sample/{p}/transcriptome/jellyfish/2.2.3/kmers.k31.CL2.jf"
        counts = []
        for n in range(math.ceil(num_kmers/CHUNK_SIZE)):
            kms_subset = kmc_to_str(kms[n*CHUNK_SIZE:(n+1)*CHUNK_SIZE]).split()
            # Argument list instead of a shell string: no shell parsing, and stderr
            # is kept apart from the "<kmer> <count>" lines parsed below.
            result = subprocess.run(
                ["jellyfish", "query", jf] + kms_subset,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )
            if result.returncode != 0:
                raise RuntimeError(
                    f"jellyfish query failed for patient {p} "
                    f"(exit code {result.returncode}): {result.stderr.strip()}"
                )
            for line in result.stdout.splitlines():
                km, cnt = line.split()
                counts.append(int(cnt))
        # Columns are matched to k-mers purely by position, so a short or long
        # result would silently misalign the row; fail loudly instead.
        if len(counts) != num_kmers:
            raise RuntimeError(
                f"Expected {num_kmers} counts for patient {p}, got {len(counts)}"
            )
        buffer[i%BUFFER_SIZE,:num_kmers] = counts
        buffer[i%BUFFER_SIZE,-1] = -1 # Indicates that this line has been queried
        i += 1
        timestamp = time.strftime("%H:%M:%S", time.localtime())
        print(f"\r{timestamp}: Queried patient #{i} ({p})...", end='')
        if i % BUFFER_SIZE == 0 or i == kmer_table.shape[0]:
            # Dump buffer: drop rows never filled (flag still 0), then write the
            # remaining rows into the table slice that ends at patient i.
            buffer = np.delete(buffer, np.where(buffer[:,-1] == 0), axis=0)
            kmer_table.iloc[i-buffer.shape[0]:i] = buffer
            buffer = np.zeros((BUFFER_SIZE, num_kmers+1))
            print(f"Dumped buffer after patient #{i} ({p})!")
            start_i = i
        if i % (BUFFER_SIZE*2) == 0 or i == kmer_table.shape[0]:
            # Backup to disk (a pickle, despite the '.py' suffix)
            with open('data/temp_kmer_table.py', 'wb') as f:
                pickle.dump(kmer_table, f)
            print(f"Backed up to disk after patient #{i} ({p})...")

In [None]:
# If data fetching was paused, resume by loading the saved kmer_table
def resume():
    """Restore the query state from the on-disk backup.

    Loads the pickled table from 'data/temp_kmer_table.py' and sets the globals:
      * kmer_table: the restored table;
      * num_kmers:  number of count columns (table width minus the flag column);
      * start_i:    index of the first row not yet queried, or the number of
                    rows when every row has already been queried.

    The last column is the "queried" flag: -1 for queried rows, 0 otherwise.
    """
    # Without these declarations the assignments below would only create locals
    # and the call would have no effect.
    global kmer_table
    global start_i
    global num_kmers

    # Only load a backup this notebook wrote itself: unpickling executes code.
    with open('data/temp_kmer_table.py', 'rb') as f:
        kmer_table = pickle.load(f)
    num_kmers = kmer_table.shape[1] - 1
    # The flag is -1 or 0, so idxmax returns the first row whose flag is still 0.
    start_i = int(kmer_table.iloc[:,-1].idxmax())
    # If every row is flagged, idxmax returns 0; detect that via the last row.
    if int(kmer_table.iloc[-1,-1]) == -1:
        start_i = kmer_table.shape[0]

In [None]:
def fetch_data(kmers, output_name):
    """Query all patients in patient_list for `kmers` and pickle the count table.

    Resets the globals `start_i`, `num_kmers` and `kmer_table`, runs query_np,
    names the columns km_0..km_{n-1} plus "queried", and writes the table
    without the "queried" column to data/<output_name> as a pickle.
    """
    global kmer_table
    global start_i
    global num_kmers

    # Fresh state: one zero-filled row per patient, one extra flag column.
    start_i = 0
    num_kmers = len(kmers)
    blank_counts = np.zeros((len(patient_list), num_kmers+1))
    kmer_table = pd.DataFrame(blank_counts)

    query_np(patient_list, kmers)
    last_row_flag = int(kmer_table.iloc[-1,-1])
    if last_row_flag == -1:
        print("Done querying!")

    # Label the columns, then export everything except the bookkeeping flag.
    column_names = [f"km_{idx}" for idx in range(num_kmers)]
    column_names.append("queried")
    kmer_table.columns = column_names
    counts_only = kmer_table.drop(columns="queried")
    with open(f"data/{output_name}", 'wb') as out_file:
        pickle.dump(counts_only, out_file)

### Fetching data into kmer tables

In [None]:
# Build and export one k-mer count table per k-mer set. Each call queries every
# patient in patient_list from scratch and pickles the result to data/<output_name>.
# NOTE: the output files are pickles, despite the '.py' suffix.
fetch_data(flt3_kmers, 'km_table_flt3.py')
fetch_data(inv16_kmers, 'km_table_inv16.py')
fetch_data(npm1_kmers, 'km_table_npm1.py')