In [1]:
import ast
import numpy as np
import time
import sys
import pandas as pd
import subprocess
import pickle
import math

In [None]:
with open('data/flt3_kmers', 'rb') as f:
    flt3_kmers = pickle.load(f)
print(flt3_kmers)

In [None]:
# Read files for list of all patient ids and list of kmers
with open("data/patient_list.txt") as f:
    patient_list = f.read().split()

with open('data/flt3_kmers', 'rb') as f:
    flt3_kmers = pickle.load(f)

In [3]:
def kmc_to_str(kmc):
    # Converts kmer count 2d array into string of just the kmers, separated by spaces
    return str(kmc).replace(',', "").replace("'", '')[1:-1]
kms_str = kmc_to_str(flt3_kmers)

In [4]:
start_i = 0
num_kmers = len(flt3_kmers)
kmer_table = pd.DataFrame(np.zeros((len(patient_list), num_kmers+1)))

In [11]:
# Loading saved kmer_table
with open('data/temp_kmer_table.py', 'rb') as f:
    kmer_table = pickle.load(f)
start_i = int(kmer_table.iloc[:,-1].idxmax())
if int(kmer_table.iloc[-1,-1]) == -1:
    start_i = kmer_table.shape[0]

In [6]:
# Query all patients for specific kmers, but store in DataFrame
def query_np(patients, kms):
    global kmer_table
    global start_i
    global num_kmers
    i = start_i
    if i != 0:
        print(f"Restarting at patient #{i+1}")
    BUFFER_SIZE = 5
    buffer = np.zeros((BUFFER_SIZE, num_kmers+1))
    for p in patients[start_i:]:
        jf = f"/u/leucegene/data/sample/{p}/transcriptome/jellyfish/2.2.3/kmers.k31.CL2.jf"
        n = 0
        output = ""
        for n in range(math.ceil(num_kmers/100)):
            kms_subset = kmc_to_str(flt3_kmers[n*100:(n+1)*100])
            cmd = f"jellyfish query {jf} {kms_subset}"
            output = output + subprocess.getoutput(cmd) + '\n'
        j = 0
        for line in output[:-1].splitlines():
            km, cnt = line.split()
            buffer[i%BUFFER_SIZE,j] = int(cnt)
            j += 1
        buffer[i%BUFFER_SIZE,-1] = -1 # Indicates that this line has been queried
        i += 1
        timestamp = time.strftime("%H:%M:%S", time.localtime())
        print(f"{timestamp}: Queried patient #{i} ({p})...")
        if i % BUFFER_SIZE == 0 or i == kmer_table.shape[0]:
            # Dump buffer
            buffer = np.delete(buffer, np.where(buffer[:,-1] == 0), axis=0)
            kmer_table.iloc[i-buffer.shape[0]:i] = buffer
            buffer = np.zeros((BUFFER_SIZE, num_kmers+1))
            print(f"Dumped buffer after patient #{i} ({p})!")
            start_i = i
        if i % (BUFFER_SIZE*2) == 0 or i == kmer_table.shape[0]:
            # Backup to disk
            with open('data/temp_kmer_table.py', 'wb') as f:
                pickle.dump(kmer_table, f)
            print(f"Backed up to disk after patient #{i} ({p})...")

In [13]:
# For each patient, extract kmer counts and put it in a DataFrame
query_np(patient_list, kms_str)
if int(kmer_table.iloc[-1,-1]) == -1:
    print("Done querying!")

Restarting at patient #692
Done querying!


In [16]:
# Processing & export finished data
kmer_table.columns = [f"km_{i}" for i in range(num_kmers)] + ["queried"]
export = kmer_table.drop("queried", axis=1)
with open('data/km_table_1.py', 'wb') as f:
    pickle.dump(export, f)

In [18]:
kmer_table.shape

(691, 3797)