In [1]:
# Only keep shingles that do not contain a stop codon.

In [2]:
import os
import pickle
import numpy as np
import multiprocessing as mp
from functools import partial
from itertools import tee
from itertools import islice
# import matplotlib.pyplot as plt
# import lightgbm as lgb

from sklearn.model_selection import train_test_split

In [3]:
# Directory that holds the pickled input data and receives the outputs.
data_dir = r'/ccb/salz4-4/markus/shimmer/data'

# Other dataset variants that can be swapped in for the two file names below:
#   'CDS_3600.pkl'         / 'nonprot_3600.pkl'
#   'CDS_singlestrain.pkl' / 'nonprot_singlestrain.pkl'
CDS_path, nonprot_path = (
    os.path.join(data_dir, filename)
    for filename in ('CDS_singlestrain_threeprime.pkl',
                     'nonprot_singlestrain_threeprime.pkl')
)

In [4]:
# Read the two preprocessed sequence collections back from disk.
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted source.
with open(CDS_path, mode='rb') as cds_file:
    X_CDS = pickle.load(cds_file)

with open(nonprot_path, mode='rb') as nonprot_file:
    X_nonprot = pickle.load(nonprot_file)

In [5]:
# Length of every sequence in each collection, as NumPy arrays.
gene_length_CDS = np.asarray(list(map(len, X_CDS)))
gene_length_nonprot = np.asarray(list(map(len, X_nonprot)))

# Disabled alternative for the upper length bound:
# max_gene_length = np.max(gene_length_CDS)

In [6]:
# Sequence-length bounds (inclusive) for what is kept.
# The minimum gene length must be greater than the shingle size.
min_gene_length = 11
max_gene_length = 10000  # may be good to avoid ultra long genes

X_CDS_filtered = [
    seq for seq in X_CDS
    if min_gene_length <= len(seq) <= max_gene_length
]
X_nonprot_filtered = [
    seq for seq in X_nonprot
    if min_gene_length <= len(seq) <= max_gene_length
]

In [7]:
# Number of sequences that survived the length filter (CDS, then non-protein).
print(len(X_CDS_filtered), len(X_nonprot_filtered), sep='\n')

9304271
46521351


In [8]:
# Encode amino acids as ints.
# (Author's note: this step seems heavily IO limited, room for improvement.)
n_threads = 36

# Residue letters receive the codes 1..24 in exactly this order. Code 0 is not
# assigned to any letter (zero is the fill value written by `pad`), and the
# stop symbol '*' is mapped to 31.
aa_to_int = dict(zip('FLIMVSPTAYHQNKDECWRGXBJZ', range(1, 25)))
aa_to_int['*'] = 31


def makeint(seq):
    """Translate a sequence of residue letters into a list of integer codes.

    Each element of `seq` is looked up in `aa_to_int`; a symbol missing from
    that table raises KeyError.
    """
    return list(map(aa_to_int.__getitem__, seq))

# For a quick trial run, define n_genes (e.g. n_genes = 1000) and pass
# X_CDS_filtered[:n_genes] / X_nonprot_filtered[:n_genes] to the maps below.

# A single pool serves both data sets, so the workers are forked only once.
# The `with` block shuts the pool down even when a map call raises (for
# example a KeyError from makeint), so no worker processes are left behind
# in the notebook kernel.
with mp.Pool(n_threads) as p:
    X_CDS_int = p.map(makeint, X_CDS_filtered)
    X_nonprot_int = p.map(makeint, X_nonprot_filtered)

In [9]:
# Turn every encoded sequence (a list of ints) into an integer NumPy array.
# (Author's note: this is slow too -- is the later speed-up really worth it?
# It may be better to just stay with lists.)
X_CDS_int_array = list(map(partial(np.asarray, dtype=int), X_CDS_int))
X_nonprot_int_array = list(map(partial(np.asarray, dtype=int), X_nonprot_int))

In [10]:
def pad(a, length):
    """Return a new integer array of `length` entries: `a` followed by zeros.

    `a` is copied into the front of a zero-filled array; it is expected to be
    no longer than `length`.
    """
    n_values = len(a)
    padded = np.zeros(length, dtype=int)
    padded[:n_values] = a
    return padded

def window(it, size):
    """Yield every run of `size` consecutive items of `it` as a tuple.

    Windows advance one item at a time; nothing is yielded when `it` has
    fewer than `size` items.
    """
    # http://justanr.blogspot.com/2014/08/implementing-sliding-windows-in-python.html
    shifted = []
    for offset, branch in enumerate(tee(it, size)):
        # The k-th copy of the iterator skips its first k items, so zipping
        # the copies lines up k consecutive items per output tuple.
        shifted.append(islice(branch, offset, None))
    yield from zip(*shifted)

def shingle(window_length, step_size, int_array, stop_code=31):
    """Cut an encoded sequence into fixed-size windows and drop those with a stop.

    The sequence is zero-padded at the end so that windows of `window_length`
    taken every `step_size` positions fit exactly, then every window that
    contains `stop_code` is discarded.

    Parameters
    ----------
    window_length : int
        Number of positions per window (>= 1).
    step_size : int
        Distance between the starts of consecutive windows (>= 1).
    int_array : sequence of int
        Integer-encoded sequence (1-D).
    stop_code : int, optional
        Code whose presence disqualifies a window. Defaults to 31, the value
        assigned to '*' in `aa_to_int`.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n_kept, window_length) holding the surviving
        windows in sequence order. The shape is (0, window_length) when no
        window survives or the input is too short to form one.

    Raises
    ------
    ValueError
        If `window_length` or `step_size` is smaller than 1.
    """
    if window_length < 1 or step_size < 1:
        raise ValueError("window_length and step_size must both be >= 1")

    int_array = np.asarray(int_array, dtype=int)
    n_values = len(int_array)

    # Padding rule kept exactly as before: when (n_values - window_length) is
    # already a multiple of step_size, a full extra step of zeros is appended,
    # which yields one additional trailing window.
    remainder = (n_values - window_length) % step_size
    divisible_length = n_values + step_size - remainder

    padded = np.zeros(divisible_length, dtype=int)
    padded[:n_values] = int_array

    # Build only the windows that are kept (one per step) by fancy indexing,
    # instead of materialising every stride-1 window and subsampling it.
    # For inputs too short to hold a single window `starts` is empty and the
    # result naturally has shape (0, window_length).
    starts = np.arange(0, divisible_length - window_length + 1, step_size)
    shingled = padded[starts[:, None] + np.arange(window_length)]

    # A boolean row mask keeps the windows in their original order.
    has_stop = (shingled == stop_code).any(axis=1)
    return shingled[~has_stop]

In [14]:
# Number of encoded sequences per class (CDS, then non-protein).
print(len(X_CDS_int_array), len(X_nonprot_int_array), sep='\n')

9304271
46521351


In [18]:
# Write both collections of integer-encoded sequences to .npy files.
# NOTE(review): each collection is a Python list of arrays (presumably of
# differing lengths); np.save has to convert the list into one array first --
# confirm this behaves as intended with the installed NumPy version.
X_CDS_int_array_path, X_nonprot_int_array_path = (
    os.path.join(data_dir, filename)
    for filename in ('X_CDS_int_array.npy', 'X_nonprot_int_array.npy')
)

np.save(X_CDS_int_array_path, X_CDS_int_array)
np.save(X_nonprot_int_array_path, X_nonprot_int_array)

KeyboardInterrupt: 

In [21]:
# Drop the names of the non-protein intermediates that are no longer needed,
# so their memory can be reclaimed.
del X_nonprot_int, X_nonprot, X_nonprot_filtered

In [22]:
# Write the integer-encoded non-protein sequences to disk.
# NOTE(review): this repeats the second half of the earlier save cell, which
# shows a KeyboardInterrupt -- presumably a retry; confirm.
X_nonprot_int_array_path = os.path.join(data_dir, 'X_nonprot_int_array.npy')
np.save(file=X_nonprot_int_array_path, arr=X_nonprot_int_array)

In [None]:
# NOTE(review): looks like a leftover scratch cell -- the global `a` is not
# read by any other visible cell; confirm before removing.
a = [1]

In [16]:
# Appears to be a probe for whether a worker process can still be started at
# the current memory footprint (the recorded output is an ENOMEM OSError).
# The pool is shut down right away so the probe does not leave a worker
# process behind in the kernel.
with mp.Pool(1) as p:
    pass


OSError: [Errno 12] Cannot allocate memory

In [12]:
# Break genes into overlapping windows of fixed size, padding the end with 0s.
window_length = 10
overlap = 1
step_size = window_length - overlap

# Bind the window geometry so the pool only has to pass each sequence.
func = partial(shingle, window_length, step_size)

# The `with` block shuts the pool down even if one of the map calls raises,
# so no worker processes are left behind in the notebook kernel.
with mp.Pool(n_threads) as p:
    X_CDS_shingled = p.map(func, X_CDS_int_array)
    X_nonprot_shingled = p.map(func, X_nonprot_int_array)

OSError: [Errno 12] Cannot allocate memory

In [None]:
# Stack the per-sequence window arrays of each class along the first axis.
X_CDS_shingled_array = np.concatenate(X_CDS_shingled, axis=0)
X_nonprot_shingled_array = np.concatenate(X_nonprot_shingled, axis=0)

In [None]:
# Label every CDS window 1 and every non-protein window 0, then stack
# features and labels in the same order (CDS first).
y_CDS = np.ones(len(X_CDS_shingled_array), dtype=int)
y_nonprot = np.zeros(len(X_nonprot_shingled_array), dtype=int)

X = np.concatenate((X_CDS_shingled_array, X_nonprot_shingled_array))
y = np.concatenate((y_CDS, y_nonprot))

In [None]:
# Show the dataset dimensions, then hold out 20% of the rows for testing.
# The fixed random_state makes the split repeatable.
print(X.shape, y.shape, sep='\n')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2019,
)

In [None]:
# Write the four train/test arrays to .npy files inside data_dir.
# NOTE(review): the file names say "nooverlap", while the shingling cell sets
# overlap = 1 -- confirm that the naming is intended.
X_train_path = os.path.join(data_dir, 'X_train_singlestrain_nooverlap_len10_threeprime.npy')
X_test_path = os.path.join(data_dir, 'X_test_singlestrain_nooverlap_len10_threeprime.npy')
y_train_path = os.path.join(data_dir, 'y_train_singlestrain_nooverlap_len10_threeprime.npy')
y_test_path = os.path.join(data_dir, 'y_test_singlestrain_nooverlap_len10_threeprime.npy')

for _path, _split in (
    (X_train_path, X_train),
    (X_test_path, X_test),
    (y_train_path, y_train),
    (y_test_path, y_test),
):
    np.save(_path, _split)