In [1]:
import ast
import numpy as np
import time
import sys
import pandas as pd
import subprocess
import pickle
import math
from collections import Counter

In [2]:
# Calculate list of kmers from MYH11 and CBFB genes
k = 31


def read_fasta_sequence(path):
    """Concatenate every non-header line of a FASTA file into one string.

    Lines starting with '>' are skipped; all other lines are stripped of
    surrounding whitespace and joined in file order.
    """
    with open(path) as f:
        return ''.join(line.strip() for line in f if not line.startswith('>'))


def unique_kmers(sequence, k):
    """Return the distinct length-k substrings of sequence, in order of first occurrence."""
    # len(sequence) - k + 1 windows fit; range() is empty when the sequence is shorter than k
    return list(dict.fromkeys(sequence[i:i+k] for i in range(len(sequence) - k + 1)))


# MYH11 (NM_022844.2)
seq = read_fasta_sequence('data/NM_022844.2.exons.fa')
print(len(seq))
myh11_kmers = unique_kmers(seq, k)

# CBFB (NM_001755.3)
seq = read_fasta_sequence('data/NM_001755.3.exons.fa')
print(len(seq))
cbfb_kmers = unique_kmers(seq, k)

6921
3134


In [3]:
# k-mers present in both genes (an empty set means the two lists are disjoint)
set(myh11_kmers) & set(cbfb_kmers)

set()

In [4]:
# Combined query list: MYH11 k-mers followed by CBFB k-mers
# NOTE(review): the name says "npm1" although the list holds MYH11/CBFB k-mers —
# presumably carried over from another analysis; confirm before renaming.
npm1_kmers = [*myh11_kmers, *cbfb_kmers]

In [5]:
# Total number of k-mers in the combined MYH11 + CBFB list
len(npm1_kmers)

9995

In [6]:
# Collect every whitespace-separated patient id listed in the file
with open("data/patient_list.txt") as f:
    patient_list = [patient_id for line in f for patient_id in line.split()]

In [7]:
def kmc_to_str(kmc):
    """Join an iterable of k-mers into a single space-separated string.

    Parameters
    ----------
    kmc : iterable
        The k-mers to join (any iterable; each item is passed through str()).

    Returns
    -------
    str
        The items separated by single spaces; '' for an empty iterable.
    """
    # Join directly instead of stripping punctuation out of repr(list), which only
    # worked for list/tuple input.
    return ' '.join(map(str, kmc))
# Space-separated string of every k-mer in npm1_kmers; later passed to query_np as `kms`
kms_str = kmc_to_str(npm1_kmers)

In [8]:
# Empty result table: one row per patient, one column per k-mer plus one trailing
# column that stays 0 until query_np marks the row with -1.
num_kmers = len(npm1_kmers)
start_i = 0  # index of the first patient that still has to be queried
kmer_table = pd.DataFrame(np.zeros(shape=(len(patient_list), num_kmers + 1)))

In [9]:
# Loading saved kmer_table
def first_unqueried_row(table):
    """Return the position of the first row whose last-column flag is not -1.

    Rows that have been queried carry -1 in the last column. If every row is
    flagged, the number of rows is returned, so slicing from that position
    yields nothing left to do.
    """
    flags = table.iloc[:, -1].values
    pending = np.flatnonzero(flags != -1)
    return int(pending[0]) if pending.size else table.shape[0]


try:
    # Despite the .py suffix this file is a pickle (it is written with pickle.dump).
    # pickle.load can execute arbitrary code: only load a checkpoint you produced yourself.
    with open('data/temp_kmer_table.py', 'rb') as f:
        kmer_table = pickle.load(f)
except FileNotFoundError:
    # Fresh run without a checkpoint: keep the kmer_table / start_i already defined
    print("No saved kmer_table found, starting from the first patient")
else:
    start_i = first_unqueried_row(kmer_table)

In [10]:
# Query all patients for specific kmers, but store in DataFrame
def query_np(patients, kms):
    """Query each remaining patient's jellyfish database and fill kmer_table.

    Starts at the module-level start_i and, for every patient in
    patients[start_i:], runs `jellyfish query` on the k-mers of the module-level
    npm1_kmers list in chunks. Counts are collected in a small buffer that is
    written into the module-level kmer_table every BUFFER_SIZE patients; the
    table is pickled to 'data/temp_kmer_table.py' every 2*BUFFER_SIZE patients.
    Both also happen once the patient counter reaches the number of table rows.
    The last column of a row is set to -1 once that patient has been queried.

    Parameters
    ----------
    patients : sequence of str
        Patient ids; each is inserted into the jellyfish database path.
    kms : str
        Not read by this function (the queried k-mers always come from
        npm1_kmers); kept so existing calls keep working.

    Raises
    ------
    subprocess.CalledProcessError
        If a jellyfish call exits with a non-zero status.
    ValueError
        If jellyfish does not return exactly one "kmer count" line per
        requested k-mer.
    """
    global kmer_table
    global start_i
    BUFFER_SIZE = 5
    CHUNK_SIZE = 100  # number of k-mers passed to one jellyfish call
    i = start_i
    if i != 0:
        print(f"Restarting at patient #{i+1}")
    buffer = np.zeros((BUFFER_SIZE, num_kmers+1))
    for p in patients[start_i:]:
        # NOTE(review): absolute, site-specific path — consider moving it to a config constant
        jf = f"/u/leucegene/data/sample/{p}/transcriptome/jellyfish/2.2.3/kmers.k31.CL2.jf"
        counts = []
        for lo in range(0, num_kmers, CHUNK_SIZE):
            chunk = npm1_kmers[lo:lo+CHUNK_SIZE]
            # Argument list instead of a shell string: no quoting issues with the patient id,
            # stderr is kept out of the parsed output, and a failed call raises immediately.
            result = subprocess.run(
                ["jellyfish", "query", jf, *chunk],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                check=True,
            )
            lines = result.stdout.splitlines()
            if len(lines) != len(chunk):
                # A short or long answer would shift every following count into the wrong column
                raise ValueError(
                    f"jellyfish returned {len(lines)} lines for {len(chunk)} k-mers (patient {p})"
                )
            for line in lines:
                km, cnt = line.split()
                counts.append(int(cnt))
        buffer[i%BUFFER_SIZE, :len(counts)] = counts
        buffer[i%BUFFER_SIZE,-1] = -1 # Indicates that this line has been queried
        i += 1
        timestamp = time.strftime("%H:%M:%S", time.localtime())
        print(f"\r{timestamp}: Queried patient #{i} ({p})...", end='')
        if i % BUFFER_SIZE == 0 or i == kmer_table.shape[0]:
            # Dump buffer: keep only the rows filled since the last dump (flag == -1);
            # after a restart in the middle of a block the leading buffer rows are still empty.
            filled = buffer[buffer[:,-1] == -1]
            kmer_table.iloc[i-filled.shape[0]:i] = filled
            buffer = np.zeros((BUFFER_SIZE, num_kmers+1))
            print(f"Dumped buffer after patient #{i} ({p})!")
            start_i = i
        if i % (BUFFER_SIZE*2) == 0 or i == kmer_table.shape[0]:
            # Backup to disk (the file is a pickle despite its .py suffix)
            with open('data/temp_kmer_table.py', 'wb') as f:
                pickle.dump(kmer_table, f)
            print(f"Backed up to disk after patient #{i} ({p})...")

In [11]:
# Fill kmer_table for every patient that still has to be queried, then report
# completion when the last row carries the "queried" flag (-1).
query_np(patient_list, kms_str)
if int(kmer_table.iloc[-1].iloc[-1]) == -1:
    print("Done querying!")

Restarting at patient #221
10:51:29: Queried patient #225 (08H011)...Dumped buffer after patient #225 (08H011)!
10:55:48: Queried patient #230 (08H033)...Dumped buffer after patient #230 (08H033)!
Backed up to disk after patient #230 (08H033)...
11:00:10: Queried patient #235 (08H049)...Dumped buffer after patient #235 (08H049)!
11:04:10: Queried patient #240 (08H056)...Dumped buffer after patient #240 (08H056)!
Backed up to disk after patient #240 (08H056)...
11:08:53: Queried patient #245 (08H072)...Dumped buffer after patient #245 (08H072)!
11:14:05: Queried patient #250 (08H087)...Dumped buffer after patient #250 (08H087)!
Backed up to disk after patient #250 (08H087)...
11:18:44: Queried patient #255 (08H104)...Dumped buffer after patient #255 (08H104)!
11:23:32: Queried patient #260 (08H118)...Dumped buffer after patient #260 (08H118)!
Backed up to disk after patient #260 (08H118)...
11:27:56: Queried patient #265 (09H002)...Dumped buffer after patient #265 (09H002)!
11:32:12: Qu

In [18]:
# Processing & export finished data
kmer_table.columns = [f"km_{i}" for i in range(num_kmers)] + ["queried"]
# The "queried" flag is dropped below, after which a never-queried row cannot be told
# apart from a patient whose counts are all zero — so warn before exporting such rows.
n_unqueried = int((kmer_table["queried"] != -1).sum())
if n_unqueried:
    print(f"WARNING: exporting {n_unqueried} row(s) that were never queried (all counts are 0)")
export = kmer_table.drop("queried", axis=1)
# The output file is a pickle despite its .py suffix
with open('data/km_table_inv16.py', 'wb') as f:
    pickle.dump(export, f)

In [12]:
# (rows, columns) of the table: one row per patient, one column per k-mer plus the trailing flag column
kmer_table.shape

(691, 9996)