In [6]:
import numpy as np, os, sys
sys.path.append("..")

import torch
from torch import nn
from torch.nn import functional as F 

from src.utils.synthetic_seqdata import download_data, load_data, sequence_string_to_one_hot
from src.models import DeepBindCNN
from src.trainer import Trainer
from src.utils.datasets import DNASequenceDataset
from sklearn.metrics import roc_auc_score, roc_curve
from src.utils import metrics
from src.explain import Explainer

import matplotlib as mpl 
from matplotlib import pyplot as plt 
%matplotlib inline

In [7]:
# get data 
savedir = "./data"
# _=download_data(savedir)
Xs, Ys = load_data(savedir=savedir)
len(Xs['train']), Ys, len(Ys['train']), len(Ys['valid']), len(Ys['test'])

(14000,
 {'train': array([1., 1., 0., ..., 1., 0., 1.], dtype=float32),
  'valid': array([0., 1., 1., ..., 0., 0., 1.], dtype=float32),
  'test': array([0., 1., 0., ..., 1., 1., 1.], dtype=float32)},
 14000,
 2000,
 4000)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [10]:
config = {
    "batch_size": 32,
    "learning_rate": 0.001,
    "architecture": "deepbind",
    "dataset": "synthetic data",
    "epochs": 35,
    "patience": 3,
    }

In [11]:
# set up datasets
datasets = {}
for k in Xs:
    datasets[k] = DNASequenceDataset(sequences=Xs[k], labels=Ys[k], alphabet="ACGT")

# set up dataloaders 
loaders = {}
for k, dataset in datasets.items():
    if k == 'train':
        loaders[k] = torch.utils.data.DataLoader(dataset, batch_size=config['batch_size'], shuffle=True)
    else:
        loaders[k] = torch.utils.data.DataLoader(dataset, batch_size=config['batch_size'])
    
# set up the model, lossfn, optimizer, trainer 
model = DeepBindCNN(input_size=4, output_size=1, kernel_size=3)
lossfn = nn.CrossEntropyLoss()

model.load_state_dict(torch.load('best_model.pt'))
print(model)


DeepBindCNN(
  (conv1): Conv1d(4, 16, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)


In [12]:
explainer = Explainer(model, 0)

# Select a random sample from the test dataset
sample_index = np.random.randint(len(datasets['test']))
input_sequence, target_label = datasets['test'][sample_index]

saliency_scores = explainer.saliency_map(input_sequence)
saliency_scores

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [14]:
for batch in loaders['test']:  # Assuming you want to calculate for the 'train' dataset
    inputs, labels = batch
    # Step 3: Move the input and labels to the device (GPU or CPU)
    inputs = inputs.to(device)
    labels = labels.to(device)

    # Step 4: Calculate the saliency map for the batch
    saliency_scores = explainer.saliency_map(inputs)

RuntimeError: Mismatch in shape: grad_output[0] has a shape of torch.Size([32, 4, 200]) and output[0] has a shape of torch.Size([32]).