In [1]:
import sys
import os
import subprocess
import tarfile
import shutil
import random
from functools import partial
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (random_split, DataLoader, TensorDataset, ConcatDataset)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits import mplot3d
from Bio import motifs
import pickle
from datetime import datetime
import scipy.stats as stats
import math

import boda
from boda.common import constants, utils

# Put the `src` directory that sits two levels above the current working
# directory at the front of the import path (presumably where the `main` and
# `pymeme` modules imported just below live -- confirm against the repo layout).
boda_src = os.path.join( os.path.dirname( os.path.dirname( os.getcwd() ) ), 'src' )
sys.path.insert(0, boda_src)

from main import unpack_artifact, model_fn
from pymeme import streme, parse_streme_output

from torch.distributions.categorical import Categorical
from boda.generator.plot_tools import matrix_to_dms, ppm_to_IC, ppm_to_pwm, counts_to_ppm

from IPython.core.display import display, HTML
# Inject a CSS rule that stretches the notebook `.container` element to 98% width.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
# Directory the model is loaded from; any copy left over from a previous run is
# removed first so stale files cannot be picked up.
model_dir = './artifacts'
hpo_rec = 'gs://syrgoth/aip_ui_test/model_artifacts__20211113_021200__287348.tar.gz'

if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)
unpack_artifact(hpo_rec)

# Build the model from the unpacked artifacts and put it in inference mode.
model = model_fn(model_dir)
#model.cuda()
model.eval()
# Trailing print keeps the cell from echoing the repr returned by `model.eval()`.
print('')

Loaded model from 20211113_021200 in eval mode



archive unpacked in ./


In [3]:
class mpra_predictor(nn.Module):
    """Pad input sequences with fixed MPRA flanks and return one output column of a model.

    Args:
        model: callable applied to the padded batch; its output is indexed as
            ``output[:, pred_idx]``.
        pred_idx: column of the model output returned by ``forward``.
        ini_in_len: length of the sequences that will be passed to ``forward``.
        model_in_len: total length handed to ``model`` once flanks are attached.
        cat_axis: axis along which left flank, input and right flank are joined.
        dual_pred: when True, average the prediction for the padded input with
            the prediction for its reverse complement.

    Raises:
        ValueError: if ``ini_in_len`` is larger than ``model_in_len``.
    """

    def __init__(self,
                 model,
                 pred_idx=0,
                 ini_in_len=200,
                 model_in_len=600,
                 cat_axis=-1,
                 dual_pred=False):
        super().__init__()
        self.model = model
        self.pred_idx = pred_idx
        self.ini_in_len = ini_in_len
        self.model_in_len = model_in_len
        self.cat_axis = cat_axis
        self.dual_pred = dual_pred

        # Best effort: `model` may be a plain callable without `.eval()`.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        try:
            self.model.eval()
        except Exception:
            pass

        self.register_flanks()

    def forward(self, x):
        """Attach the flanks to every item of ``x`` and return the selected prediction."""
        batch = x.shape[0]
        pieces = [self.left_flank.repeat(batch, 1, 1), x, self.right_flank.repeat(batch, 1, 1)]
        in_tensor = torch.cat(pieces, dim=self.cat_axis)
        if self.dual_pred:
            dual_tensor = utils.reverse_complement_onehot(in_tensor)
            out_tensor = self.model(in_tensor)[:, self.pred_idx] + self.model(dual_tensor)[:, self.pred_idx]
            out_tensor = out_tensor / 2.0
        else:
            out_tensor = self.model(in_tensor)[:, self.pred_idx]
        return out_tensor

    def register_flanks(self):
        """Register the ``left_flank`` / ``right_flank`` buffers.

        The gap ``model_in_len - ini_in_len`` is split so the left flank takes
        ``gap // 2`` bases (the tail of ``constants.MPRA_UPSTREAM``) and the
        right flank takes the remainder (the head of
        ``constants.MPRA_DOWNSTREAM``); an odd gap puts the extra base on the
        right. A gap of 0 or 1 yields an empty left flank.
        """
        missing_len = self.model_in_len - self.ini_in_len
        if missing_len < 0:
            raise ValueError(
                f'ini_in_len ({self.ini_in_len}) must not exceed model_in_len ({self.model_in_len})'
            )
        left_len = missing_len // 2
        right_len = missing_len - left_len

        upstream = constants.MPRA_UPSTREAM
        # Slice from an explicit non-negative start: `upstream[-0:]` would be the
        # whole sequence rather than an empty flank.
        left_start = max(0, len(upstream) - left_len)
        left_flank = utils.dna2tensor(upstream[left_start:]).unsqueeze(0)
        right_flank = utils.dna2tensor(constants.MPRA_DOWNSTREAM[:right_len]).unsqueeze(0)
        self.register_buffer('left_flank', left_flank)
        self.register_buffer('right_flank', right_flank)

        
def df_to_onehot_tensor(in_df, seq_column='nt_sequence'):
    """Encode every sequence in ``in_df[seq_column]`` and stack the results.

    Each entry is converted with ``utils.dna2tensor`` (with a progress bar) and
    the per-sequence tensors are stacked along a new leading dimension, so all
    sequences must encode to the same shape.
    """
    encoded = []
    for subsequence in tqdm(in_df[seq_column]):
        encoded.append(utils.dna2tensor(subsequence))
    return torch.stack(encoded)

#for variable-length sequences
def get_onehots(in_df, seq_column='nt_sequence', extra_str='', padded_seq_len=600):
    """Pad every row's sequence to a common length and one-hot encode the result.

    Args:
        in_df: DataFrame holding the sequences; ``progress_apply`` must be
            available on it (registered by ``tqdm.pandas()``).
        seq_column: name of the column that holds the sequence strings.
        extra_str: text appended to the two progress messages.
        padded_seq_len: target length forwarded to ``utils.row_pad_sequence``.
            Defaults to 600, the value that used to be hard-coded.

    Returns:
        The per-row tensors from ``utils.dna2tensor`` stacked along a new
        leading dimension.
    """
    padding_fn = partial(utils.row_pad_sequence,
                         in_column_name=seq_column,
                         padded_seq_len=padded_seq_len)
    print('Padding sequences' + extra_str)
    sequence_list = list(in_df.progress_apply(padding_fn, axis=1))
    print('Tokenizing sequences' + extra_str)
    onehot_sequences = torch.stack([utils.dna2tensor(subsequence) for subsequence in tqdm(sequence_list)])
    return onehot_sequences

def fasta_to_tensor(file_name):
    """Read a FASTA file and return its sequences as one stacked tensor.

    Every line starting with ``>`` opens a new record; the following lines, up
    to the next header, are concatenated into that record's sequence.
    Surrounding whitespace is stripped and blank lines are ignored. Records are
    kept in file order, and records that share a header are all kept.

    Args:
        file_name: path of the FASTA file.

    Returns:
        The ``utils.dna2tensor`` encoding of each record stacked along a new
        leading dimension, so all records must encode to the same shape.

    Raises:
        ValueError: if sequence text appears before the first header, or the
            file contains no records.
    """
    records = []  # one list of sequence fragments per header, in file order
    with open(file_name, 'r') as f:
        for line in f:
            text = line.strip()
            if not text:
                continue
            if text.startswith('>'):
                records.append([])
            elif not records:
                raise ValueError(f'{file_name}: sequence data found before the first ">" header')
            else:
                records[-1].append(text)
    if not records:
        raise ValueError(f'{file_name}: no FASTA records found')
    seq_tensors = [utils.dna2tensor(''.join(fragments)) for fragments in records]
    return torch.stack(seq_tensors, dim=0)

def dna2tensor_approx(sequence_str, vocab_list=constants.STANDARD_NT, N_value=0.25):
    """One-hot encode a sequence, filling unknown symbols with a constant.

    Args:
        sequence_str: sequence to encode, one symbol per position.
        vocab_list: ordered vocabulary; a symbol's index in it selects the row
            that is set to 1.
        N_value: value written to every row of a column whose symbol is not in
            ``vocab_list``.

    Returns:
        A ``torch.Tensor`` with ``len(vocab_list)`` rows and
        ``len(sequence_str)`` columns.
    """
    seq_tensor = np.zeros((len(vocab_list), len(sequence_str)))
    for letterIdx, letter in enumerate(sequence_str):
        try:
            seq_tensor[vocab_list.index(letter), letterIdx] = 1
        except ValueError:
            # `.index` raises ValueError for a symbol outside the vocabulary.
            # Catch only that, so unrelated errors and Ctrl-C are not swallowed.
            seq_tensor[:, letterIdx] = N_value
    seq_tensor = torch.Tensor(seq_tensor)
    return seq_tensor

def frame_print(string, marker='*', left_space=25):
    """Print ``string`` upper-cased inside an indented banner.

    The title line is ``marker``, a space, the upper-cased text, a space and
    ``marker`` again. It sits between two border lines made of ``marker``
    repeated once per character of the title line. All three lines are indented
    by ``left_space`` spaces, with two blank lines before and after.
    """
    indent = left_space * ' '
    title = ' '.join((marker, string.upper(), marker))
    border = indent + len(title) * marker
    for row in ('', '', border, indent + title, border, '', ''):
        print(row, flush=True)
    
def decor_print(string):
    """Print ``string`` between two 15-dash rules, with a blank line above and below."""
    rule = '-' * 15
    emit = partial(print, flush=True)
    emit('')
    emit(' '.join((rule, string, rule)))
    emit('')

def isg_contributions(sequences,
                      predictor,
                      num_steps=50,
                      num_samples=20,
                      eval_batch_size=1024,
                      theta_factor=15):
    """Average sampled gradients along a straight path in logit space.

    For each batch, the target logits are ``theta_factor * sequences``. The
    path from all-zero logits to the target is visited at ``num_steps + 1``
    evenly spaced points. At every point ``num_samples`` one-hot sequences are
    drawn from the softmax of the logits (over dim -2), passed through
    ``predictor`` with a straight-through estimator, and the gradient of the
    mean prediction with respect to the point's logits is recorded.

    Args:
        sequences: tensor of input sequences; softmax and sampling assume 4
            classes on dim -2 (assumes shape (N, 4, L) -- confirm with caller).
        predictor: callable mapping a batch of sequences to one value per item.
        num_steps: number of intervals the path is divided into.
        num_samples: sequences sampled per input at every path point.
        eval_batch_size: approximate number of sampled sequences per predictor
            call; the number of inputs per batch is
            ``eval_batch_size // num_samples``, but never less than 1.
        theta_factor: scale applied to the inputs to form the target logits.

    Returns:
        A pair of CPU tensors shaped like ``sequences``: the path-averaged
        gradients multiplied by the target logits, and the path-averaged
        gradients multiplied by ``theta_factor``.
    """
    # Run on the device the predictor lives on, not unconditionally on CUDA.
    device = _predictor_device(predictor)
    # Guard against a zero batch size when eval_batch_size < num_samples.
    batch_size = max(1, eval_batch_size // num_samples)
    temp_dataset = TensorDataset(sequences)
    # The data is already in memory, so worker processes would add overhead
    # without changing the batches that are produced.
    temp_dataloader = DataLoader(temp_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    all_salient_maps = []
    all_gradients = []
    for local_batch in tqdm(temp_dataloader):
        target_thetas = (theta_factor * local_batch[0].to(device)).requires_grad_()
        #base_thetas = theta_factor / 3 * torch.ones_like(target_thetas)
        line_gradients = []
        for i in range(0, num_steps + 1):
            point_thetas = (i / num_steps * target_thetas)
            #point_thetas = base_thetas + i / num_steps * (target_thetas - base_thetas)
            point_distributions = F.softmax(point_thetas, dim=-2)

            nucleotide_probs = Categorical(torch.transpose(point_distributions, -2, -1))
            sampled_idxs = nucleotide_probs.sample((num_samples, ))
            sampled_nucleotides_T = F.one_hot(sampled_idxs, num_classes=4)
            sampled_nucleotides = torch.transpose(sampled_nucleotides_T, -2, -1)
            distribution_repeater = point_distributions.repeat(num_samples, 1, 1, 1)
            # Straight-through estimator: the forward value is the hard sample,
            # the gradient flows through the soft distribution.
            sampled_nucleotides = sampled_nucleotides - distribution_repeater.detach() + distribution_repeater
            samples = sampled_nucleotides.flatten(0, 1)

            preds = predictor(samples)
            point_predictions = preds.unflatten(0, (num_samples, target_thetas.shape[0])).mean(dim=0)
            point_gradients = torch.autograd.grad(point_predictions.sum(), inputs=point_thetas, retain_graph=True)[0]
            line_gradients.append(point_gradients)

        gradients = torch.stack(line_gradients).mean(dim=0).detach()
        all_salient_maps.append(gradients * target_thetas.detach())
        all_gradients.append(gradients)
    return torch.cat(all_salient_maps).cpu(), theta_factor * torch.cat(all_gradients).cpu()


def _predictor_device(predictor):
    """Return the device of the predictor's first parameter or buffer.

    Falls back to CUDA when available (else CPU) for predictors that expose
    neither, e.g. plain functions.
    """
    for accessor in ('parameters', 'buffers'):
        getter = getattr(predictor, accessor, None)
        if callable(getter):
            for tensor in getter():
                return tensor.device
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
#df = pd.read_csv('gs://syrgoth/data/MPRA_ALL_no_cutoffs_v2_pred.txt', sep=' ', low_memory=False)
#df = pd.read_csv('boda_test_no_cutoffs_pred_contributions.txt', sep=' ', low_memory=False)
df = pd.read_csv('gs://syrgoth/data/boda_dataset_contributions/boda2_test__contributions.txt', sep=' ', low_memory=False)

# Length of each sequence. `Series.map(len)` gives the same values as the
# previous row-wise `df.apply(..., axis=1)` (and still fails loudly on a missing
# sequence) without building a row object for every record.
df['seq_len'] = df['nt_sequence'].map(len)
# Force chromosome labels to strings so they all share one type.
df['chr'] = df['chr'].astype(str)

In [5]:
# Show the loaded table; the recorded output is pandas' truncated rendering.
df

Unnamed: 0,HepG2_mean,HepG2_std,ID_count,IDs,K562_mean,K562_std,OL,OL_count,SKNSH_mean,SKNSH_std,...,K562_pred_rc,HepG2_pred_rc,SKNSH_pred_rc,K562_pred_aggreg,HepG2_pred_aggreg,SKNSH_pred_aggreg,seq_len,K562_contrib,HepG2_contrib,SKNSH_contrib
0,0.233601,,1,7:70038969:G:T:A:wC,0.060779,,29,1.0,0.047194,,...,0.041816,0.540251,0.484300,0.022539,0.491981,0.470595,200,"-0.012306637,-0.050187174,-0.01816756,-0.01053...","-0.028826864,-0.075913034,-0.061179936,0.00187...","-0.036723685,-0.054105897,-0.05668138,0.006164..."
1,0.154811,0.078127,1,7:4682252:T:TTTG:A:wC,0.251481,0.10549,2732,2.0,0.198428,0.049655,...,0.391024,0.402064,0.268580,0.379942,0.362180,0.274369,200,"-0.0067605977,0.03572863,0.0013879802,0.006542...","-0.016802695,0.0385122,-0.018025868,0.01784802...","-0.04530336,0.034262598,-0.04083056,0.01107957..."
2,0.264553,0.184777,1,7:4682252:T:TTTG:R:wC,0.207406,0.12414,2732,2.0,0.194314,0.158508,...,0.199648,0.156504,0.071717,0.310800,0.285539,0.167666,197,"0.0057299053,0.030307656,0.0020277216,0.001448...","0.0032582,0.03015598,-0.02216963,0.0108725,0.0...","-0.03317056,0.015516815,-0.04512479,0.00156917..."
3,-0.159498,,1,7:26792481:T:G:A:wC,0.569506,,32,1.0,0.244310,,...,-0.192111,-0.150861,-0.211731,-0.160767,-0.162738,-0.116001,200,"0.0039080554,0.044442225,-0.011113109,0.004275...","-0.0032633895,0.038231984,-0.031737227,0.00506...","-0.024353493,0.039599378,-0.046023354,-0.00230..."
4,-0.382856,,1,7:26792481:T:G:R:wC,-0.125233,,32,1.0,-0.415536,,...,-0.184702,-0.141543,-0.216644,-0.160147,-0.156699,-0.150761,200,"0.012926111,0.03672994,0.005198022,0.007878459...","-0.0021027706,0.04600862,-0.030852437,-0.00031...","-0.03761421,0.039541256,-0.051351454,0.0063628..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66707,0.262422,,1,7:91906182:NA:NA,-0.331317,,,,-0.074696,,...,-0.019915,-0.057829,-0.211446,-0.097384,-0.077588,-0.216747,200,"0.033496596,0.03806614,0.035805956,0.010916623...","0.045250285,0.07283683,0.020448843,0.03349431,...","0.015901385,0.04505912,0.025725035,0.022753198..."
66708,0.677389,,1,7:128762416:NA:NA,0.877790,,,,0.870688,,...,0.420494,0.702932,0.772400,0.726636,0.960361,0.902817,200,"0.028213603,0.047467977,-0.0043974635,-0.02029...","0.040253557,0.051293284,0.02820068,-0.00336073...","0.015063139,0.03302522,0.02900612,-0.014296194..."
66709,-0.180785,,1,7:91906782:NA:NA,-0.248728,,,,0.293823,,...,-0.208400,-0.190216,-0.129537,-0.211949,-0.203565,-0.167556,200,"0.0137723535,0.021268066,0.0045164423,0.006391...","0.032113675,0.043639675,0.022883032,0.01750791...","-0.011138435,0.018981276,0.037225533,0.0027572..."
66710,-0.743857,,1,13:98991732:NA:NA,-0.502905,,,,-0.773449,,...,-0.123884,-0.111984,-0.291187,-0.156639,-0.115783,-0.268759,200,"0.013244069,0.05363087,0.010580196,0.013944145...","0.024629086,0.044879362,0.03751364,0.011698565...","-0.019024605,0.028306656,0.030200318,-0.006420..."


In [6]:
%%time

eval_batch_size = 1040
column_names = ['K562_contrib', 'HepG2_contrib', 'SKNSH_contrib']
cell_names = ['K562', 'HepG2', 'SKNSH']
activity_columns = ['K562_mean', 'HepG2_mean', 'SKNSH_mean']
prediction_columns = ['K562_pred_aggreg', 'HepG2_pred_aggreg', 'SKNSH_pred_aggreg']
checkpoint_path = 'temp_ext_contributions_dict.pt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Sequences are grouped by length so each group stacks into one tensor and gets
# a predictor whose flanks match that length. One dict per length is collected
# in `out_dict` under the key 'batch_length_<length>'.
out_dict = {}
for length in sorted(df['seq_len'].unique()):#, reverse=True): 
    #batch_contributions = []
    length_filter = df['seq_len'] == length
    temp_df = df[length_filter]
    sequences = temp_df['nt_sequence'].tolist()
    IDs = temp_df['IDs'].tolist()
    idxs = temp_df.index.tolist()
    num_sequences = len(sequences)
    print(f'---------- Processing {num_sequences} sequences of length {length} ----------')
    print(f'Tokenizing sequences:')
    onehot_sequences = df_to_onehot_tensor(temp_df)
    # Take activities and predictions from the length-filtered frame so their
    # rows line up with `idxs`, `IDs`, `sequences` and `batch` in the same dict.
    # They were previously built from the full `df`.
    mpra_lfc = torch.FloatTensor(temp_df[activity_columns].to_numpy())
    pred_lfc = torch.FloatTensor(temp_df[prediction_columns].to_numpy())
    batch_dict = {'batch_size':num_sequences, 'idxs':idxs, 'IDs': IDs, 'sequences': sequences, 'batch': onehot_sequences,
                 'mpra_lfc':mpra_lfc, 'pred_lfc':pred_lfc}
    # One predictor per model output column (one per cell type).
    predictors = [mpra_predictor(model=model, pred_idx=cell_type, ini_in_len=length).to(device) for cell_type in range(3)]
    for cell_idx, predictor in enumerate(predictors):
        column_name = column_names[cell_idx]
        ext_column_name = column_name + '_ext'
        print(f'Getting {column_name}:')
        contributions, extended_contributions = isg_contributions(onehot_sequences, predictor, eval_batch_size=eval_batch_size)
        #flat_contributions = contributions.numpy().sum(axis=-2)
        #df.loc[length_filter, column_name] = [','.join([str(score) for score in flat_contributions[seq_idx,:]]) for seq_idx in range(num_sequences)]
        batch_dict[column_name] = contributions
        batch_dict[ext_column_name] = extended_contributions
        # Checkpoint the length-200 group after every cell type; in the recorded
        # run it is by far the largest group.
        if length == 200:
            torch.save(batch_dict, checkpoint_path)
    batch_key_name = f'batch_length_{length}'
    out_dict[batch_key_name] = batch_dict

    #torch.save(out_dict, 'boda2_test__extended_contributions_dict.pt')

---------- Processing 1 sequences of length 144 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 155 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 156 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 157 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 2 sequences of length 160 ----------
Tokenizing sequences:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 161 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 162 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 2 sequences of length 163 ----------
Tokenizing sequences:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 165 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 166 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 2 sequences of length 168 ----------
Tokenizing sequences:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 169 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 170 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 2 sequences of length 171 ----------
Tokenizing sequences:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 172 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 173 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 1 sequences of length 174 ----------
Tokenizing sequences:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 4 sequences of length 175 ----------
Tokenizing sequences:


  0%|          | 0/4 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 3 sequences of length 176 ----------
Tokenizing sequences:


  0%|          | 0/3 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 3 sequences of length 177 ----------
Tokenizing sequences:


  0%|          | 0/3 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 6 sequences of length 178 ----------
Tokenizing sequences:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 6 sequences of length 179 ----------
Tokenizing sequences:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 6 sequences of length 180 ----------
Tokenizing sequences:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 5 sequences of length 181 ----------
Tokenizing sequences:


  0%|          | 0/5 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 6 sequences of length 182 ----------
Tokenizing sequences:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 9 sequences of length 183 ----------
Tokenizing sequences:


  0%|          | 0/9 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 18 sequences of length 184 ----------
Tokenizing sequences:


  0%|          | 0/18 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 6 sequences of length 185 ----------
Tokenizing sequences:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 8 sequences of length 186 ----------
Tokenizing sequences:


  0%|          | 0/8 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 9 sequences of length 187 ----------
Tokenizing sequences:


  0%|          | 0/9 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 18 sequences of length 188 ----------
Tokenizing sequences:


  0%|          | 0/18 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 29 sequences of length 189 ----------
Tokenizing sequences:


  0%|          | 0/29 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 30 sequences of length 190 ----------
Tokenizing sequences:


  0%|          | 0/30 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 15 sequences of length 191 ----------
Tokenizing sequences:


  0%|          | 0/15 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 29 sequences of length 192 ----------
Tokenizing sequences:


  0%|          | 0/29 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 40 sequences of length 193 ----------
Tokenizing sequences:


  0%|          | 0/40 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1 [00:00<?, ?it/s]

---------- Processing 78 sequences of length 194 ----------
Tokenizing sequences:


  0%|          | 0/78 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/2 [00:00<?, ?it/s]

---------- Processing 70 sequences of length 195 ----------
Tokenizing sequences:


  0%|          | 0/70 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/2 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/2 [00:00<?, ?it/s]

---------- Processing 280 sequences of length 196 ----------
Tokenizing sequences:


  0%|          | 0/280 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/6 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/6 [00:00<?, ?it/s]

---------- Processing 229 sequences of length 197 ----------
Tokenizing sequences:


  0%|          | 0/229 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/5 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/5 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/5 [00:00<?, ?it/s]

---------- Processing 418 sequences of length 198 ----------
Tokenizing sequences:


  0%|          | 0/418 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/9 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/9 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/9 [00:00<?, ?it/s]

---------- Processing 1408 sequences of length 199 ----------
Tokenizing sequences:


  0%|          | 0/1408 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/28 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/28 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/28 [00:00<?, ?it/s]

---------- Processing 63958 sequences of length 200 ----------
Tokenizing sequences:


  0%|          | 0/63958 [00:00<?, ?it/s]

Getting K562_contrib:


  0%|          | 0/1230 [00:00<?, ?it/s]

Getting HepG2_contrib:


  0%|          | 0/1230 [00:00<?, ?it/s]

Getting SKNSH_contrib:


  0%|          | 0/1230 [00:00<?, ?it/s]

CPU times: user 5h 34min 8s, sys: 49.9 s, total: 5h 34min 58s
Wall time: 5h 35min 21s


In [51]:
%%time

# Timing experiment: encode all sequences with a single `dna2tensor` call on the
# concatenated string, then cut the result into 200-column pieces.
# NOTE(review): `temp_df` is whatever the per-length loop assigned last (the
# length-200 group in the recorded run), so this cell depends on leftover kernel
# state, and the 200-wide slicing only lines up if every sequence is 200 long.
big_string = ''.join(temp_df['nt_sequence'].tolist())
big_tensor = utils.dna2tensor(big_string)
batch_tensor = torch.stack([big_tensor[:, i:i+200] for i in range(0, big_tensor.shape[-1], 200)], axis=0)

CPU times: user 5.52 s, sys: 276 ms, total: 5.8 s
Wall time: 5.7 s


In [23]:
# Repeats the string cast already applied to `chr` when the CSV was loaded.
df['chr'] = df['chr'].astype(str)

In [26]:
#df.to_csv('boda_test_no_cutoffs_pred_contributions.txt', index=None, sep=' ', float_format='%.15f')