In [1]:
import shutil
import os

import torch
import torch.nn as nn

import boda

import pandas as pd
import numpy as np
import csv
from scipy.stats import pearsonr
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt

In [2]:
def load_model(artifact_path):
    
    USE_CUDA = torch.cuda.device_count() >= 1
    if os.path.isdir('./artifacts'):
        shutil.rmtree('./artifacts')

    boda.common.utils.unpack_artifact(artifact_path)

    model_dir = './artifacts'

    my_model = boda.common.utils.model_fn(model_dir)
    my_model.eval()
    if USE_CUDA:
        my_model.cuda()
    
    return my_model

In [3]:
malinois_path = 'gs://tewhey-public-data/CODA_resources/malinois_model__20211113_021200__287348.tar.gz'
my_model = load_model(malinois_path)

Copying gs://tewhey-public-data/CODA_resources/malinois_model__20211113_021200__287348.tar.gz...
/ [1 files][ 49.3 MiB/ 49.3 MiB]                                                
Operation completed over 1 objects/49.3 MiB.                                     
archive unpacked in ./


Loaded model from 20211113_021200 in eval mode


In [4]:
left_flank = boda.common.utils.dna2tensor( 
    boda.common.constants.MPRA_UPSTREAM[-200:] 
).unsqueeze(0)
print(f'left flank shape: {left_flank.shape}')

right_flank= boda.common.utils.dna2tensor( 
    boda.common.constants.MPRA_DOWNSTREAM[:200] 
).unsqueeze(0)
right_flank.shape
print(f'right flank shape: {right_flank.shape}')

flank_builder = boda.common.utils.FlankBuilder(
    left_flank=left_flank,
    right_flank=right_flank,
)

flank_builder.cuda()

left flank shape: torch.Size([1, 4, 200])
right flank shape: torch.Size([1, 4, 200])


FlankBuilder()

In [5]:
fn_in = '/home/ubuntu/boda2/analysis/AR001__rotation/dummy_test.tsv'
data_tsv = pd.read_csv(fn_in,sep='\t')

In [6]:
with open(fn_in,'r') as f:
    seq_tensor = torch.stack([ boda.common.utils.dna2tensor(line.split()[0]) for line in f ])

ValueError: 'S' is not in list

In [None]:
seq_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(seq_tensor),
    batch_size=8
)

In [None]:
results = []

with torch.no_grad():
    for i, batch in enumerate(tqdm.tqdm(seq_loader)):
        prepped_seq = flank_builder( batch[0].cuda() )
        predictions = my_model( prepped_seq ) + \
                      my_model( prepped_seq.flip(dims=[1,2]) ) # Also
        predictions = predictions.div(2.)
        results.append(predictions.detach().cpu())
                
predictions = torch.cat(results, dim=0)

In [None]:
pred_df = pd.DataFrame( predictions.numpy(), columns=['K562_preds', 'HepG2_preds', 'SKNSH_preds'] )
pred_df

In [None]:
all_results = pd.concat([data_tsv, pred_df], axis=1)
all_results