In [1]:
import shutil
import os

import torch
import torch.nn as nn

import boda

# Helpers

1. `load_model` checks GPUs, clears a spot for the model to be downloaded, downloads, and loads the model in `eval` mode.

2. `FlankBuilder` (from `boda.common.utils`) is used to pad inputs with MPRA vector backbone sequence. For technical reasons, Malinois reads 600 nt sequences (i.e., n x 4 x 600 inputs), but each input should consist of 200 nt of variable sequence flanked on both sides by MPRA backbone.

In [2]:
def load_model(artifact_path):
    """Fetch a packaged model artifact and return the model ready for inference.

    Any existing ``./artifacts`` directory is deleted first. The artifact at
    ``artifact_path`` is then unpacked with
    ``boda.common.utils.unpack_artifact`` and the model is built from
    ``./artifacts`` with ``boda.common.utils.model_fn``.

    Args:
        artifact_path: Location of the model archive, handed unchanged to
            ``boda.common.utils.unpack_artifact``.

    Returns:
        The loaded model, switched to ``eval`` mode and moved to CUDA when at
        least one CUDA device is visible.
    """
    model_dir = './artifacts'
    has_gpu = torch.cuda.device_count() >= 1

    # Remove leftovers from a previous run before unpacking a fresh copy.
    if os.path.isdir(model_dir):
        shutil.rmtree(model_dir)
    boda.common.utils.unpack_artifact(artifact_path)

    loaded = boda.common.utils.model_fn(model_dir)
    loaded.eval()
    if has_gpu:
        loaded.cuda()
    return loaded

# Get Malinois

The model artifact can be downloaded directly from a Google Storage bucket that you have access to.

In [3]:
# Google Storage URI of the packaged Malinois model archive.
malinois_path = 'gs://tewhey-public-data/CODA_resources/malinois_model__20211113_021200__287348.tar.gz'
# Unpacks the archive and loads the model in eval mode (on the GPU when one is visible).
my_model = load_model(malinois_path)

Copying gs://tewhey-public-data/CODA_resources/malinois_model__20211113_021200__287348.tar.gz...
\ [1 files][ 49.3 MiB/ 49.3 MiB]                                                
Operation completed over 1 objects/49.3 MiB.                                     
archive unpacked in ./


Loaded model from 20211113_021200 in eval mode


## Set flanks

MPRA flanks are saved as constants in the `boda` repo. These need to be sized to (1, 4, 200) each and used to init `FlankBuilder`.

In [4]:
# Last 200 nt of the upstream MPRA backbone, converted with dna2tensor and
# given a leading batch axis so the shape is (1, 4, 200).
left_flank = boda.common.utils.dna2tensor(
    boda.common.constants.MPRA_UPSTREAM[-200:]
).unsqueeze(0)
print(f'left flank shape: {left_flank.shape}')

# First 200 nt of the downstream MPRA backbone, prepared the same way.
right_flank = boda.common.utils.dna2tensor(
    boda.common.constants.MPRA_DOWNSTREAM[:200]
).unsqueeze(0)
print(f'right flank shape: {right_flank.shape}')

flank_builder = boda.common.utils.FlankBuilder(
    left_flank=left_flank,
    right_flank=right_flank,
)

# Use the same GPU check as `load_model`, so this cell does not crash on a
# CPU-only machine and the flank builder ends up on the same device as the model.
if torch.cuda.device_count() >= 1:
    flank_builder.cuda()

flank_builder

left flank shape: torch.Size([1, 4, 200])
right flank shape: torch.Size([1, 4, 200])


FlankBuilder()

# Example call

Using `torch.no_grad()` so the computation graph isn't saved to memory. Since sequences are passed to the model as onehots in `torch.float32` format, we can use `torch.randn` to validate the model setup. Here a batch of 10 variable 200 nt (fake) sequences are being padded to 600 nt, then being passed to the model. Note, `my_model` and `flank_builder` have been set on the GPU using `.cuda()` calls. Therefore, the fake sequence also needs to be sent to `cuda`.

In [5]:
# Send the fake batch to whatever device the model actually lives on instead of
# hard-coding CUDA, so the smoke test also runs when the model was loaded on CPU.
# Assumes `my_model` is an nn.Module with at least one parameter — TODO confirm.
model_device = next(my_model.parameters()).device

with torch.no_grad():
    # Simulate a batch_size x 4 nucleotide x 200 nt long sequence
    fake_batch = torch.randn((10, 4, 200)).to(model_device)
    # Need to add MPRA flanks before calling the model
    print(my_model(flank_builder(fake_batch)))

tensor([[-2.8178, -1.3312,  1.3796],
        [-1.4956, -1.1950,  4.2369],
        [ 0.0625, -0.8346,  4.8093],
        [-3.2229, -0.7095,  7.3412],
        [ 6.0309,  1.0885, 12.4849],
        [-1.0273, -0.7155,  9.6839],
        [-1.8505, -2.0176,  5.2422],
        [ 0.2424, -0.3303,  6.9106],
        [-0.0159, -0.0228, 10.6849],
        [-2.4961, -1.4846,  4.9746]], device='cuda:0')


# Run on MPRA data set

We're focusing on sequences that are 200 nt long for simplicity. 

In [6]:
import pandas as pd
import numpy as np
import csv
from scipy.stats import pearsonr
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): the file has a .tsv extension but is parsed with the default
# comma separator; if it holds more than one tab-separated column, pass
# sep='\t' here. The absolute path is also machine-specific — consider a
# configurable data directory.
pass_seq = pd.read_csv("/home/ubuntu/boda2/analysis/AR001__rotation/randseqs.tsv")

# The sequence is read from the first column. Select it positionally with
# .iloc: integer keys on a label-indexed row Series (the old `x[0]`) rely on a
# deprecated positional fallback, and iterating one column avoids building a
# Series per row with iterrows().
sequences = pass_seq.iloc[:, 0]
seq_tensor = torch.stack(
    [boda.common.utils.dna2tensor(seq) for seq in tqdm.tqdm(sequences, total=pass_seq.shape[0])],
    dim=0,
)
seq_dataset = torch.utils.data.TensorDataset(seq_tensor)
seq_loader = torch.utils.data.DataLoader(seq_dataset, batch_size=128)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [None]:
results = []

# Run each batch on the device the model lives on.
# Assumes `my_model` is an nn.Module with at least one parameter — TODO confirm.
model_device = next(my_model.parameters()).device

with torch.no_grad():
    for batch in tqdm.tqdm(seq_loader):
        # The loader wraps a single-tensor TensorDataset, so each batch is a
        # one-element sequence holding the sequence tensor. Feed that tensor to
        # the model, not the `pass_seq` DataFrame (which has no `.flip`).
        # Flanks are not added here (added in the diffusion code).
        # NOTE(review): confirm the inputs are already full-length for the model.
        prepped_seq = batch[0].to(model_device)
        # Also score the input flipped along the channel (dim 1) and position
        # (dim 2) axes — presumably the reverse complement, TODO confirm the
        # channel order — and average the two predictions.
        batch_preds = my_model(prepped_seq) + \
                      my_model(prepped_seq.flip(dims=[1, 2]))
        batch_preds = batch_preds.div(2.)
        results.append(batch_preds.detach().cpu())

predictions = torch.cat(results, dim=0)

In [None]:
# Wrap the stacked predictions in a DataFrame, one named column per model output.
# NOTE(review): the column order is assumed to match the model's output order — confirm.
pred_df = pd.DataFrame( predictions.numpy(), columns=['K562_preds', 'HepG2_preds', 'SKNSH_preds'] )
pred_df

In [None]:
# Column-wise concat aligns rows by index label; both frames carry a default
# 0..n-1 index here, so each prediction row sits next to its input row.
all_results = pd.concat([pass_seq, pred_df], axis=1)
all_results

# Test set performance
Check performance on chromosomes 7 and 13 (held out from training and validation).

In [None]:
# Boolean mask for rows on chromosome 7 or 13; the label is matched both as an
# integer and as a string.
chrom = all_results['chr']
on_chr7 = (chrom == 7) | (chrom == '7')
on_chr13 = (chrom == 13) | (chrom == '13')
chr_filter = on_chr7 | on_chr13


In [None]:
# Pearson correlation between the K562_mean and K562_preds columns on the filtered rows.
held_out = all_results.loc[chr_filter]
pearsonr(held_out['K562_mean'], held_out['K562_preds'])

In [None]:
# Pearson correlation between the HepG2_mean and HepG2_preds columns on the filtered rows.
held_out = all_results.loc[chr_filter]
pearsonr(held_out['HepG2_mean'], held_out['HepG2_preds'])

In [None]:
# Pearson correlation between the SKNSH_mean and SKNSH_preds columns on the filtered rows.
held_out = all_results.loc[chr_filter]
pearsonr(held_out['SKNSH_mean'], held_out['SKNSH_preds'])

In [None]:
# Write the identifier, sequence and prediction columns to a tab-separated
# file, then display the same table.
export_columns = ['IDs', 'nt_sequence', 'K562_preds', 'HepG2_preds', 'SKNSH_preds']
inference_table = all_results.loc[:, export_columns]
inference_table.to_csv(
    'inference_check.tsv',
    sep='\t',
    index=False,
    header=True,
    quoting=csv.QUOTE_NONE,
)
inference_table