In [8]:
import ast
import numpy as np
import time
import sys
import pandas as pd
import subprocess
import pickle
import math
from collections import Counter

In [9]:
# Build the list of distinct NPM1 k-mers from the exon FASTA file.
# NOTE(review): k presumably has to match the k of the jellyfish indexes
# queried later (their file name contains "k31") - confirm before changing.
k = 31
with open('data/NM_001355006.1.exons.fa') as f:
    # Drop FASTA header lines ('>') and concatenate everything else into one
    # sequence. All records are joined, so windows may span record boundaries.
    seq = ''.join(line.strip() for line in f if not line.startswith('>'))
print(len(seq))

# Slide a window of length k over the sequence. The upper bound is derived
# from k (there are len(seq) - k + 1 windows) instead of a hard-coded 30, so
# changing k no longer yields truncated k-mers at the end of the sequence.
kmer_count = Counter(seq[i:i+k] for i in range(len(seq) - k + 1))

# Distinct k-mers, in order of first occurrence in the sequence.
npm1_kmers = list(kmer_count)

1339


In [11]:
# Number of distinct k-mers collected from the sequence
len(npm1_kmers)

1309

In [12]:
# Load every patient id; the file is split on any whitespace, so ids may be
# separated by spaces or newlines.
with open("data/patient_list.txt") as patient_file:
    raw_ids = patient_file.read()
patient_list = raw_ids.split()

In [13]:
def kmc_to_str(kmc):
    """Join k-mers into a single space-separated string.

    Parameters
    ----------
    kmc : iterable
        K-mers to join (list, tuple, dict keys, generator, ...). Each item is
        converted with ``str`` before joining.

    Returns
    -------
    str
        The items separated by single spaces; ``''`` for an empty iterable.
    """
    # Join the items directly rather than stripping brackets, commas and
    # quotes out of str(list): that trick only works for list/tuple inputs.
    return ' '.join(str(km) for km in kmc)
# Space-separated string of all NPM1 k-mers.
# NOTE(review): this is later passed to query_np as `kms`, but the body of
# query_np never reads that parameter.
kms_str = kmc_to_str(npm1_kmers)

In [14]:
# Fresh run: begin with the first patient and an all-zero table holding one
# row per patient and one column per k-mer, plus one extra trailing column.
num_kmers = len(npm1_kmers)
start_i = 0
table_shape = (len(patient_list), num_kmers + 1)
kmer_table = pd.DataFrame(np.zeros(table_shape))

In [None]:
# Resume from the on-disk checkpoint.
# pickle can execute arbitrary code on load: only open checkpoints written by
# this notebook.
with open('data/temp_kmer_table.py', 'rb') as checkpoint:
    kmer_table = pickle.load(checkpoint)

# The last column holds -1 for rows that were already queried and 0 otherwise,
# so idxmax() yields the first row that is still 0.
queried_flags = kmer_table.iloc[:, -1]
start_i = int(queried_flags.idxmax())
# If even the final row is flagged, every row is -1 and idxmax() returns the
# first row; point past the end instead so nothing is re-queried.
if int(queried_flags.iloc[-1]) == -1:
    start_i = len(kmer_table)

In [15]:
# Query all patients for specific kmers, but store in DataFrame
def query_np(patients, kms):
    """Fill the global ``kmer_table`` with per-patient jellyfish counts.

    For each patient from index ``start_i`` onward, the k-mers in the global
    ``npm1_kmers`` list are looked up with ``jellyfish query`` and the counts
    are stored in that patient's row; the row's last column is set to -1 to
    mark it as queried. Finished rows are written into ``kmer_table`` every
    ``BUFFER_SIZE`` patients, and the table is pickled to
    ``data/temp_kmer_table.py`` every ``2 * BUFFER_SIZE`` patients (and when
    the last table row is reached) so an interrupted run can be resumed.

    Parameters
    ----------
    patients : sequence of str
        Patient ids; position ``n`` is written to row ``n`` of ``kmer_table``.
    kms : str
        Unused. Kept so existing calls keep working; the k-mers are taken
        from the global ``npm1_kmers``.

    Raises
    ------
    subprocess.CalledProcessError
        If a ``jellyfish query`` call exits with a non-zero status.
    RuntimeError
        If jellyfish returns a different number of lines than k-mers queried.
    """
    global kmer_table
    global start_i
    BUFFER_SIZE = 5    # patients accumulated before writing into kmer_table
    QUERY_CHUNK = 100  # k-mers passed to a single jellyfish invocation
    total_rows = kmer_table.shape[0]
    i = start_i
    if i != 0:
        print(f"Restarting at patient #{i+1}")
    pending = []  # rows queried since the last dump
    for p in patients[start_i:]:
        jf = f"/u/leucegene/data/sample/{p}/transcriptome/jellyfish/2.2.3/kmers.k31.CL2.jf"
        row = np.zeros(num_kmers + 1)
        for lo in range(0, num_kmers, QUERY_CHUNK):
            chunk = list(npm1_kmers[lo:lo + QUERY_CHUNK])
            # Run without a shell and keep stderr separate, so error text is
            # never parsed as counts; check=True turns a failed query into an
            # exception instead of silently storing wrong values.
            result = subprocess.run(
                ["jellyfish", "query", jf] + chunk,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                check=True,
            )
            lines = result.stdout.splitlines()
            # Counts are stored by position, so the output must contain
            # exactly one line per queried k-mer.
            if len(lines) != len(chunk):
                raise RuntimeError(
                    f"jellyfish returned {len(lines)} lines for {len(chunk)} "
                    f"k-mers (patient {p})"
                )
            for offset, line in enumerate(lines):
                km, cnt = line.split()
                row[lo + offset] = int(cnt)
        row[-1] = -1 # Indicates that this line has been queried
        pending.append(row)
        i += 1
        timestamp = time.strftime("%H:%M:%S", time.localtime())
        print(f"\r{timestamp}: Queried patient #{i} ({p})...", end='')
        if i % BUFFER_SIZE == 0 or i == total_rows:
            # Dump buffer: the pending rows are the ones ending at row i
            kmer_table.iloc[i - len(pending):i] = np.array(pending)
            pending = []
            print(f"Dumped buffer after patient #{i} ({p})!")
            start_i = i
        if i % (BUFFER_SIZE*2) == 0 or i == total_rows:
            # Backup to disk
            with open('data/temp_kmer_table.py', 'wb') as f:
                pickle.dump(kmer_table, f)
            print(f"Backed up to disk after patient #{i} ({p})...")

In [18]:
# Run the per-patient k-mer queries; results accumulate in kmer_table.
query_np(patient_list, kms_str)

# The last column of a row is -1 once that row has been queried, so a flagged
# final row means the run reached the end of the table.
last_row_flag = int(kmer_table.iloc[-1, -1])
if last_row_flag == -1:
    print("Done querying!")

01:50:38: Queried patient #5 (02H017)...Dumped buffer after patient #5 (02H017)!
01:51:32: Queried patient #10 (02H043)...Dumped buffer after patient #10 (02H043)!
Backed up to disk after patient #10 (02H043)...
01:52:31: Queried patient #15 (02H080)...Dumped buffer after patient #15 (02H080)!
01:53:30: Queried patient #20 (03H028)...Dumped buffer after patient #20 (03H028)!
Backed up to disk after patient #20 (03H028)...
01:54:32: Queried patient #25 (03H049)...Dumped buffer after patient #25 (03H049)!
01:55:30: Queried patient #30 (03H070)...Dumped buffer after patient #30 (03H070)!
Backed up to disk after patient #30 (03H070)...
01:56:30: Queried patient #35 (03H094)...Dumped buffer after patient #35 (03H094)!
01:57:34: Queried patient #40 (03H116)...Dumped buffer after patient #40 (03H116)!
Backed up to disk after patient #40 (03H116)...
01:58:34: Queried patient #45 (04H017)...Dumped buffer after patient #45 (04H017)!
01:59:34: Queried patient #50 (04H039)...Dumped buffer after pa

In [19]:
# Label the columns (km_0 ... km_{n-1}, then the bookkeeping "queried" flag)
# and pickle the table without that flag column.
column_names = [f"km_{idx}" for idx in range(num_kmers)]
column_names.append("queried")
kmer_table.columns = column_names
export = kmer_table.drop(columns="queried")
with open('data/km_table_npm1.py', 'wb') as out_file:
    pickle.dump(export, out_file)

In [20]:
# Table dimensions as (rows, columns)
kmer_table.shape

(691, 1310)