In [None]:
""" Testing/Debugging File """

"""
Restart kernel after running
Only need to run once
"""
!pip install scikit-learn matplotlib seaborn

In [5]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.utils.data as td
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, Subset, Sampler

In [6]:
# Load the pickled K562 train/valid/test splits.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
filename = './data/k562_datasets.pkl'

with open(filename, 'rb') as file:
    combined_datasets = pickle.load(file)

train_data, valid_data, test_data = (
    combined_datasets[split] for split in ('train', 'valid', 'test')
)

# Show the first training row so the available columns are visible.
print(train_data.iloc[0])

# The input features are selected positionally: columns 6 through 15.
column_names = np.array(train_data.columns)
feature_names = column_names[6:16]
num_features = len(feature_names)
print(feature_names)

# Column names of the one-hot encoded nucleotide indicators.
nucleotides = ['A', 'T', 'G', 'C']
num_samples = len(train_data)

print("Number of Samples: " + str(num_samples))
print("Number of Features: " + str(num_features))

seqnames                         1
start                      1002760
end                        1002760
strand                           +
ensembl_gene_id    ENSG00000187608
score                          0.0
ctcf                      -0.07771
h4k20me1                 -0.429997
h3k79me2                   -0.2804
h3k4me1                  -0.217665
h3k9me3                  -0.333359
h3k36me3                 -0.801406
sj5                      -0.039619
sj3                      -0.059131
rpts                     -0.187111
wgbs                           0.0
lambda_alphaj             0.026377
zeta                      1.133344
A                                0
T                                0
G                                1
C                                0
dataset                      train
Name: 0, dtype: object
['ctcf' 'h4k20me1' 'h3k79me2' 'h3k4me1' 'h3k9me3' 'h3k36me3' 'sj5' 'sj3'
 'rpts' 'wgbs']
Number of Samples: 136927782
Number of Features: 10


In [7]:
# Query PyTorch for CUDA support and the number of visible GPUs, then report both.
num_gpus = torch.cuda.device_count()
cuda_available = torch.cuda.is_available()
print("CUDA (GPU support) is available:", cuda_available)
print("Number of GPUs available:", num_gpus)

CUDA (GPU support) is available: True
Number of GPUs available: 1


In [8]:
class GeneDataset(Dataset):
    """Dataset that yields one gene (all of its rows) per index.

    Rows of ``dataframe`` are grouped by ``ensembl_gene_id``. Each item is a
    dict of per-gene metadata and tensors; built items are memoised in
    ``self.cache`` so repeated access to the same gene is cheap.

    Reads the module-level ``feature_names`` and ``nucleotides`` column lists.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        # Order of first appearance in the dataframe defines the index -> gene mapping.
        self.gene_ids = dataframe['ensembl_gene_id'].unique()
        self.genes = dataframe.groupby('ensembl_gene_id')
        self.cache = {}

    def __len__(self):
        """Number of distinct genes."""
        return len(self.gene_ids)

    def __getitem__(self, idx):
        """Return the dict of metadata and tensors for the gene at position ``idx``."""
        gene_id = self.gene_ids[idx]

        # Check the cache first so a hit skips the group lookup entirely.
        cached = self.cache.get(gene_id)
        if cached is not None:
            return cached

        gene = self.genes.get_group(gene_id)

        result = {
            'GeneId': gene_id,
            'Seq_Name': gene['seqnames'].iloc[0],
            'Start': gene['start'],
            'End': gene['end'],
            'Strand': gene['strand'],

            # epigenomic features per gene j, site i
            'Y_ji': torch.tensor(gene[feature_names].values, dtype=torch.float64),

            # read counts per gene j, site i
            'X_ji': torch.tensor(gene['score'].values, dtype=torch.float64),

            # read depth * initiation rate values per gene j (first row's value is used)
            'C_j': torch.tensor(gene['lambda_alphaj'].iloc[0], dtype=torch.float64),

            # GLM elongation rate predictions per gene j, site i
            'Z_ji': torch.tensor(gene['zeta'].values, dtype=torch.float64),

            # one-hot encoded sequences
            'N_ji': torch.tensor(gene[nucleotides].values, dtype=torch.int64),
            'Length': len(gene)
        }

        self.cache[gene_id] = result
        return result

In [56]:
from torch.utils.data import Sampler

class GeneBatchSampler(Sampler):
    """Batch sampler that only batches together genes of similar length.

    Genes are assigned to length buckets of width ``bucket_size`` (starting at
    the shortest gene). Batches of up to ``batch_size`` indices are drawn from
    one bucket at a time, so a batch never mixes buckets and padding stays
    bounded by ``bucket_size``. The last batch of each bucket may be partial.

    Args:
        dataset: indexable dataset whose items expose an integer ``'Length'``.
        batch_size: maximum number of indices per batch (must be positive).
        bucket_size: width of each length bucket (must be positive).

    Raises:
        ValueError: if ``batch_size`` or ``bucket_size`` is not positive.
    """

    def __init__(self, dataset, batch_size, bucket_size):
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if bucket_size <= 0:
            raise ValueError(f"bucket_size must be positive, got {bucket_size}")

        self.dataset = dataset
        self.batch_size = batch_size
        self.bucket_size = bucket_size

        # Note: this indexes every item of the dataset once.
        lengths = torch.tensor(
            [dataset[i]['Length'] for i in range(len(dataset))], dtype=torch.int64
        )

        # An empty dataset has no min/max length; produce zero buckets instead of crashing.
        if lengths.numel() == 0:
            self.n_buckets = 0
            self.boundaries = torch.empty(0, dtype=torch.int64)
            self.bucket_indices = torch.empty(0, dtype=torch.int64)
            self.buckets = []
            return

        shortest = int(lengths.min())
        longest = int(lengths.max())

        # Calculate the number of buckets based on bucket_size (plain Python int)
        self.n_buckets = (longest - shortest) // bucket_size + 1

        # Define boundaries for buckets
        self.boundaries = torch.arange(shortest, longest + bucket_size, step=bucket_size)

        # Assign samples to buckets, returns 1-based bucket indices
        self.bucket_indices = torch.bucketize(lengths, self.boundaries, right=True)

        # Group indices by bucket
        self.buckets = [[] for _ in range(self.n_buckets)]
        for sample_idx, bucket_idx in enumerate(self.bucket_indices.tolist()):
            self.buckets[bucket_idx - 1].append(sample_idx)

    def __iter__(self):
        """Yield lists of dataset indices, bucket by bucket, in dataset order."""
        for bucket in self.buckets:
            for start in range(0, len(bucket), self.batch_size):
                yield bucket[start:start + self.batch_size]

    # calculate number of batches created
    def __len__(self):
        # Include partial batch in calculation (ceil division per bucket)
        return sum(-(-len(bucket) // self.batch_size) for bucket in self.buckets)



In [57]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    """Collate a list of per-gene item dicts into one padded batch dict.

    Per-site tensors are right-padded to the longest gene in the batch:
    ``Y_ji`` and ``X_ji`` with 0, ``Z_ji`` and ``N_ji`` with -1. ``Mask`` is a
    boolean tensor shaped like ``X_ji`` that is True on real (unpadded) sites.

    ``C_j`` is stacked into a single 1-D tensor (one value per gene) so that
    callers can move it with ``.to(device)``; ``C_j[0]`` still yields the first
    gene's value. Metadata fields are returned as tuples, one entry per gene.
    """
    GeneIds = tuple(item['GeneId'] for item in batch)
    Seq_Names = tuple(item['Seq_Name'] for item in batch)
    Start = tuple(item['Start'] for item in batch)
    End = tuple(item['End'] for item in batch)
    Strand = tuple(item['Strand'] for item in batch)
    Lengths = tuple(item['Length'] for item in batch)

    # One scalar per gene -> tensor of shape (batch,)
    C_j = torch.stack([torch.as_tensor(item['C_j']) for item in batch])

    Y_ji = pad_sequence([item['Y_ji'] for item in batch], batch_first=True, padding_value=0.0)
    X_ji = pad_sequence([item['X_ji'] for item in batch], batch_first=True, padding_value=0.0)
    Z_ji = pad_sequence([item['Z_ji'] for item in batch], batch_first=True, padding_value=-1.0)
    N_ji = pad_sequence([item['N_ji'] for item in batch], batch_first=True, padding_value=-1)

    lengths_tensor = torch.tensor(Lengths, dtype=torch.int64)

    # True where the site index is smaller than that gene's length.
    site_index = torch.arange(X_ji.shape[1]).unsqueeze(0)
    mask = site_index < lengths_tensor.unsqueeze(1)

    return {
        'GeneId': GeneIds,
        'Seq_Name': Seq_Names,
        'Start': Start,
        'End': End,
        'Strand': Strand,
        'Y_ji': Y_ji,
        'X_ji': X_ji,
        'C_j': C_j,
        'Z_ji': Z_ji,
        'N_ji': N_ji,
        'Mask': mask,
        'Length': lengths_tensor
    }

In [58]:
# Build length-bucketed, padded batches over the test split.
dataset = GeneDataset(test_data)
batch_sampler = GeneBatchSampler(
    dataset,
    batch_size=32,
    bucket_size=2000,
)
loader = DataLoader(
    dataset,
    batch_sampler=batch_sampler,
    collate_fn=collate_fn,
    num_workers=7,
)

In [None]:
def build_dataset(data, use_sliding_window=False, window_size=None,
                  batch_size=32, bucket_size=2000, num_workers=7):
    """Wrap a dataframe split in a length-bucketed, padded ``DataLoader``.

    Args:
        data: dataframe holding one split (rows grouped by gene downstream).
        use_sliding_window: accepted so existing three-argument calls keep
            working; currently ignored.
        window_size: accepted for the same reason; currently ignored.
            NOTE(review): no sliding-window dataset is visible in this
            notebook -- confirm whether one should be restored.
        batch_size: maximum number of genes per batch.
        bucket_size: width of the gene-length buckets used for batching.
        num_workers: number of ``DataLoader`` worker processes.

    Returns:
        A ``DataLoader`` yielding dicts produced by ``collate_fn``.
    """
    dataset = GeneDataset(data)
    # GeneBatchSampler takes (dataset, batch_size, bucket_size); it has no drop_last option.
    batch_sampler = GeneBatchSampler(dataset, batch_size=batch_size, bucket_size=bucket_size)
    loader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=num_workers,
                        collate_fn=collate_fn)
    return loader

In [None]:
import math 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def build_model(model_type, num_lstm_layers, bidirectional, hidden_layer_sizes, activation_func,
                filter_size, pool_type, pool_size, dropout, window_size, weight_init):
    """Construct one of the supported models, in float64, on ``device``.

    The input width is the module-level ``num_features``.

    Args:
        model_type: one of 'lstm', 'linear', 'dense', 'dc_cnn', 'cnn'.
        num_lstm_layers: number of stacked LSTM layers ('lstm' only).
        bidirectional: whether the LSTM is bidirectional ('lstm' only).
        hidden_layer_sizes: layer widths / channel counts; 'lstm' uses the
            first entry, 'dc_cnn' uses the first three.
        activation_func: 'relu', 'leakyrelu' or 'tanh' ('dense' only).
        filter_size: Conv1d kernel size ('dc_cnn', 'cnn').
        pool_type: 'max' for max pooling, anything else for average ('cnn').
        pool_size: pooling window ('dc_cnn', 'cnn').
        dropout: dropout probability ('dense', 'dc_cnn', 'cnn').
        window_size: fixed input/output length of the CNN models.
        weight_init: 'zero' to zero every parameter, otherwise PyTorch defaults.

    Returns:
        The model (an ``nn.Module`` with a ``name`` attribute).

    Raises:
        ValueError: for an unknown ``model_type`` or activation function.
    """

    class LinearModel(nn.Module):
        """Single bias-free linear map from the features to one output per site."""

        def __init__(self, input_size):
            super().__init__()
            self.name = "linear"
            self.linear = nn.Linear(input_size, 1, bias=False)

        def forward(self, x):
            return self.linear(x)

    class LSTMModel(nn.Module):
        """LSTM over the site axis followed by a per-site linear head."""

        def __init__(self, input_size, hidden_layer_sizes, output_size, num_layers, bidirectional):
            super().__init__()
            self.name = "lstm"
            self.lstm = nn.LSTM(input_size, hidden_layer_sizes[0], num_layers,
                                bidirectional=bidirectional, batch_first=True)
            # Both heads are always created so the parameter set (and therefore
            # any saved state_dict) does not depend on `bidirectional`.
            self.bidirectional_linear = nn.Linear(2 * hidden_layer_sizes[0], output_size)
            self.linear = nn.Linear(hidden_layer_sizes[0], output_size)
            self.bidirectional = bidirectional

        def forward(self, x):
            x, _ = self.lstm(x)
            head = self.bidirectional_linear if self.bidirectional else self.linear
            return head(x)

    class DenseNet(nn.Module):
        """Fully connected stack: (Linear + activation)*, Dropout, Linear."""

        def __init__(self, input_size, hidden_layer_sizes, output_size, dropout, activation='relu'):
            super().__init__()

            self.name = "dense"

            activations = {'leakyrelu': nn.LeakyReLU, 'relu': nn.ReLU, 'tanh': nn.Tanh}
            activation_cls = activations.get(activation.lower())
            if activation_cls is None:
                raise ValueError("Unsupported activation function")

            layers = []
            prev_size = input_size
            for size in hidden_layer_sizes:
                layers.append(nn.Linear(prev_size, size))
                layers.append(activation_cls())
                prev_size = size

            # Use the configured dropout probability rather than a fixed 0.5.
            layers.append(nn.Dropout(dropout))
            layers.append(nn.Linear(prev_size, output_size))

            self.layers = nn.Sequential(*layers)

        def forward(self, x):
            return self.layers(x)

    class DeepChromeCNN(nn.Module):
        """Conv1d -> ReLU -> MaxPool1d -> three linear layers, over a fixed window."""

        def __init__(self, input_size, hidden_layer_sizes, filter_size, pool_size, dropout, window_size):
            super().__init__()
            self.name = "dc_cnn"
            self.conv1d = nn.Conv1d(input_size, hidden_layer_sizes[0], filter_size)
            self.relu = nn.ReLU()
            self.maxpool1d = nn.MaxPool1d(pool_size)

            self.dropout = nn.Dropout(dropout)
            # The conv (no padding, stride 1) leaves window - filter + 1 positions,
            # and the pooling layer floors the division by pool_size.
            pooled_length = (window_size - filter_size + 1) // pool_size
            self.linear1_input_size = pooled_length * hidden_layer_sizes[0]
            self.linear1 = nn.Linear(self.linear1_input_size, hidden_layer_sizes[1])
            self.linear2 = nn.Linear(hidden_layer_sizes[1], hidden_layer_sizes[2])
            self.linear3 = nn.Linear(hidden_layer_sizes[2], window_size)

        def forward(self, x):
            # (batch, sites, features) -> (batch, features, sites) for Conv1d
            x = x.permute(0, 2, 1)
            x = self.conv1d(x)
            x = self.relu(x)
            x = self.maxpool1d(x)

            x = x.view(-1, self.linear1_input_size)
            x = self.dropout(x)
            x = self.linear1(x)
            x = self.relu(x)
            x = self.linear2(x)
            x = self.relu(x)
            x = self.linear3(x)

            return x

    class CNN(nn.Module):
        """Stack of Conv1d + ReLU + pooling layers followed by one linear layer."""

        def __init__(self, input_size, hidden_layer_sizes, filter_size, pool_type, pool_size, dropout, window_size):
            super().__init__()
            self.name = "cnn"
            self.window_size = window_size
            self.num_hidden_layers = len(hidden_layer_sizes)
            self.pool_type = pool_type

            in_channels = input_size
            self.convs = nn.ModuleList()
            for out_channels in hidden_layer_sizes:
                self.convs.append(
                    nn.Conv1d(in_channels, out_channels, filter_size)
                )
                in_channels = out_channels

            self.relu = nn.ReLU()
            self.maxpool1d = nn.MaxPool1d(pool_size)
            self.avgpool1d = nn.AvgPool1d(pool_size)
            self.dropout = nn.Dropout(dropout)
            self.linear1_input_size = self.get_linear1_input_size(hidden_layer_sizes, window_size, filter_size, pool_size)
            self.linear1 = nn.Linear(self.linear1_input_size, window_size)

        def get_linear1_input_size(self, hidden_layer_sizes, window_size, filter_size, pool_size):
            """Flattened size after all conv + pool layers for a ``window_size`` input."""
            num_hidden_layers = len(hidden_layer_sizes)
            dim1_size = hidden_layer_sizes[-1]
            dim2_size = window_size
            for _ in range(num_hidden_layers):
                # cnn layer filter shrinks dim (e.g. 100 -> 91)
                dim2_size = dim2_size - filter_size + 1
                # pooling rounds down (grouping by pool_size)
                dim2_size = math.floor(dim2_size / pool_size)
            # linear layer takes flattened [x, y] -> [x * y]
            return dim1_size * dim2_size

        def forward(self, x):
            x = x.permute(0, 2, 1)
            # Read the pooling choice from the instance, not the enclosing function's argument.
            pool = self.maxpool1d if self.pool_type == 'max' else self.avgpool1d
            for conv in self.convs:
                x = pool(self.relu(conv(x)))

            x = x.view(-1, self.linear1_input_size)
            x = self.dropout(x)
            x = self.linear1(x)

            return x

    if model_type == 'lstm':
        model = LSTMModel(num_features, hidden_layer_sizes, 1, num_lstm_layers, bidirectional)
    elif model_type == 'linear':
        model = LinearModel(num_features)
    elif model_type == 'dense':
        # dropout must be passed explicitly; otherwise activation_func lands in the dropout slot.
        model = DenseNet(num_features, hidden_layer_sizes, 1, dropout, activation_func)
    elif model_type == 'dc_cnn':
        model = DeepChromeCNN(num_features, hidden_layer_sizes, filter_size, pool_size, dropout, window_size)
    elif model_type == 'cnn':
        model = CNN(num_features, hidden_layer_sizes, filter_size, pool_type, pool_size, dropout, window_size)
    else:
        raise ValueError(f"Unsupported model type: {model_type!r}")

    model = model.to(device)

    print(model)

    first_param_device = next(model.parameters()).device
    print("Model is on device:", first_param_device)

    # expected weights are close to 0 which is why 0 initializing weights converges much quicker
    if weight_init == 'zero':
        with torch.no_grad():
            for param in model.parameters():
                param.zero_()

    # The datasets provide float64 tensors, so the model runs in double precision.
    model.double()

    return model.to(device)

In [None]:
def build_optimizer(network, optimizer, learning_rate, momentum):
    """Create the optimizer named by ``optimizer`` for ``network``'s parameters.

    Args:
        network: module whose parameters are optimized.
        optimizer: "sgd" or "adam" (case-insensitive).
        learning_rate: initial learning rate.
        momentum: momentum factor; only used by SGD.

    Returns:
        A ``torch.optim.SGD`` or ``torch.optim.Adam`` instance.

    Raises:
        ValueError: if ``optimizer`` is not a supported name (previously the
            name itself was returned, failing later at the first optimizer call).
    """
    kind = optimizer.lower() if isinstance(optimizer, str) else optimizer

    if kind == "sgd":
        return optim.SGD(network.parameters(), lr=learning_rate, momentum=momentum)

    # Adam optimizer adapts the learning rate for each parameter individually
    if kind == "adam":
        return optim.Adam(network.parameters(), lr=learning_rate)

    raise ValueError(f"Unsupported optimizer: {optimizer!r} (expected 'sgd' or 'adam')")

In [None]:
def valid_epoch(model, loader, loss_fn):
    """Evaluate ``model`` and the GLM baseline over ``loader`` without gradients.

    Args:
        model: module with a ``name`` attribute; 'cnn' / 'dc_cnn' outputs are
            used as-is, other outputs have their trailing dim 2 squeezed.
        loader: iterable of batch dicts with keys 'Y_ji', 'X_ji', 'C_j',
            'Z_ji', 'Mask' and 'Length'; must support ``len()``.
        loss_fn: callable ``(X_ji, C_j, rho_ji, lengths) -> scalar tensor``.

    Returns:
        ``(avg_neural_net_loss, avg_glm_loss, neural_net_zeta, glm_zeta)``.
        The two losses are averaged over batches. The two zeta tensors are
        1-D CPU tensors holding exp(model output) and the GLM ``Z_ji`` at every
        unpadded site of every gene, in matching order.
    """
    model.eval()
    total_neural_net_loss = 0
    total_glm_loss = 0
    neural_net_zeta = []
    glm_zeta = []

    with torch.no_grad():
        for batch in loader:
            Y_ji_batch = batch['Y_ji'].to(device)
            X_ji_batch = batch['X_ji'].to(device)
            Z_ji_batch = batch['Z_ji'].to(device)
            mask = batch['Mask'].to(device)
            lengths = batch['Length'].to(device)

            # C_j may arrive as a tuple of 0-dim tensors; stack it so it can be moved.
            C_j_batch = batch['C_j']
            if not torch.is_tensor(C_j_batch):
                C_j_batch = torch.stack([torch.as_tensor(c) for c in C_j_batch])
            C_j_batch = C_j_batch.to(device)

            outputs = model(Y_ji_batch)

            if model.name == 'dc_cnn' or model.name == 'cnn':
                rho_ji = outputs
            else:
                rho_ji = outputs.squeeze(2)

            # Padded Z_ji sites are not positive; swap in 1 there so log() stays finite.
            glm_rho_ji = torch.log(torch.where(mask, Z_ji_batch, torch.ones_like(Z_ji_batch)))

            neural_net_loss = loss_fn(X_ji_batch, C_j_batch, rho_ji, lengths)
            glm_loss = loss_fn(X_ji_batch, C_j_batch, glm_rho_ji, lengths)

            total_neural_net_loss += neural_net_loss.item()
            total_glm_loss += glm_loss.item()

            # store predictions for every real site of every gene in the batch
            # (assumes rho_ji has the same (batch, sites) shape as the mask)
            neural_net_zeta.append(torch.exp(rho_ji[mask]).cpu())
            glm_zeta.append(Z_ji_batch[mask].cpu())

    # calculate average loss across all batches
    avg_neural_net_loss = total_neural_net_loss / len(loader)
    avg_glm_loss = total_glm_loss / len(loader)

    neural_net_zeta = torch.cat(neural_net_zeta, dim=0)
    glm_zeta = torch.cat(glm_zeta, dim=0)

    return avg_neural_net_loss, avg_glm_loss, neural_net_zeta, glm_zeta

In [None]:
def train_epoch(model, loader, optimizer, loss_fn):
    """Run one optimization pass over ``loader`` and return the mean batch loss.

    Args:
        model: module with a ``name`` attribute; 'cnn' / 'dc_cnn' outputs are
            used as-is, other outputs have their trailing dim 2 squeezed.
        loader: iterable of batch dicts with keys 'Y_ji', 'X_ji', 'C_j' and
            'Length'; must support ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        loss_fn: callable ``(X_ji, C_j, rho_ji, lengths) -> scalar tensor``.

    Returns:
        The loss averaged over batches, as a Python float.
    """
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        Y_ji_batch = batch['Y_ji'].to(device)
        X_ji_batch = batch['X_ji'].to(device)
        lengths = batch['Length'].to(device)

        # C_j may arrive as a tuple of 0-dim tensors; stack it so it can be moved.
        C_j_batch = batch['C_j']
        if not torch.is_tensor(C_j_batch):
            C_j_batch = torch.stack([torch.as_tensor(c) for c in C_j_batch])
        C_j_batch = C_j_batch.to(device)

        outputs = model(Y_ji_batch)

        if model.name == 'dc_cnn' or model.name == 'cnn':
            rho_ji = outputs
        else:
            rho_ji = outputs.squeeze(2)

        loss = loss_fn(X_ji_batch, C_j_batch, rho_ji, lengths)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # calculate average loss across all batches
    avg_train_loss = total_loss / len(loader)

    return avg_train_loss

In [None]:
class CustomLoss(nn.Module):
    """Per-site loss ``X*rho + C*exp(-rho) - X*log(C)``, averaged per gene.

    Padded positions (site index >= the gene's length) are excluded, each
    gene's summed loss is divided by its length, and the result is averaged
    over the genes in the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, X_ji, C_j, rho_ji, lengths):
        """Compute the batch loss.

        Args:
            X_ji: (batch, sites) read counts, padded on the right.
            C_j: per-gene scale -- a scalar, a (batch,) tensor, or a sequence
                of per-gene scalars.
            rho_ji: (batch, sites) predicted log elongation rates.
            lengths: (batch,) number of real sites per gene.

        Returns:
            Scalar tensor: mean over genes of the length-normalized loss.
        """
        if not torch.is_tensor(C_j):
            C_j = torch.stack([torch.as_tensor(c) for c in C_j])
        C_j = C_j.to(device=rho_ji.device, dtype=rho_ji.dtype)
        if C_j.dim() == 1:
            # one scale per gene, broadcast across that gene's sites
            C_j = C_j.unsqueeze(1)

        lengths = lengths.to(rho_ji.device)
        site_index = torch.arange(rho_ji.shape[1], device=rho_ji.device)
        valid = site_index.unsqueeze(0) < lengths.unsqueeze(1)

        # Neutralize rho on padded sites first so NaN/inf there cannot leak
        # into the loss or its gradient.
        rho = torch.where(valid, rho_ji, torch.zeros_like(rho_ji))
        loss = X_ji * rho + C_j * torch.exp(-rho) - X_ji * torch.log(C_j)
        loss = torch.where(valid, loss, torch.zeros_like(loss))

        # normalize loss by sequence length (guard against zero-length genes)
        loss_sum = loss.sum(dim=1)
        normalized_loss = loss_sum / lengths.clamp(min=1).to(loss.dtype)

        # calculate average loss within each batch
        return normalized_loss.mean()

In [None]:
# --- dataset ---
window_size = 1000
use_sliding_window = True

# --- model (shared) ---
model_type = 'lstm'
hidden_layer_sizes = [9]
activation_func = 'relu'
weight_init = None

# --- LSTM ---
bidirectional = True
num_lstm_layers = 3

# --- CNN variants (convolution, pooling, dropout) ---
filter_size = 10
pool_type = 'max'
pool_size = 5
dropout = 0.5

# --- optimizer ---
optimizer_type = 'adam'
learning_rate = 1e-3
momentum = 0

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
epochs = 20

def train():
    """Train a model from the notebook-level config and return it.

    Builds the model, data loaders and optimizer from the module-level config
    variables, then runs ``epochs`` epochs of training and validation,
    printing the losses and the agreement between the network's and the GLM's
    elongation rates after each epoch.

    The per-epoch loss curves are published as the module-level lists
    ``loss_neural_net_train``, ``loss_neural_net_valid`` and
    ``loss_glm_valid`` so the plotting cell can read them after training.

    Returns:
        The trained model.
    """
    global loss_neural_net_train, loss_neural_net_valid, loss_glm_valid

    model = build_model(model_type, num_lstm_layers, bidirectional, hidden_layer_sizes,
                        activation_func, filter_size, pool_type, pool_size, dropout, window_size, weight_init)

    # build_dataset is defined with a single required argument (the data split).
    train_loader = build_dataset(train_data)
    valid_loader = build_dataset(valid_data)

    optimizer = build_optimizer(model, optimizer_type, learning_rate, momentum)

    loss_fn = CustomLoss()
    # track loss curves
    loss_neural_net_train = [0] * epochs
    loss_neural_net_valid = [0] * epochs
    loss_glm_valid = [0] * epochs

    # Halve the learning rate once the training loss stops improving. The
    # scheduler is the only mechanism that changes the rate, so it is never
    # halved twice in the same epoch.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}')

        train_loss = train_epoch(model, train_loader, optimizer, loss_fn)
        loss_neural_net_train[epoch] = train_loss
        print(f"train loss: {train_loss: .4f}")

        valid_neural_net_loss, valid_glm_loss, neural_net_zeta, glm_zeta = valid_epoch(model, valid_loader, loss_fn)
        loss_neural_net_valid[epoch] = valid_neural_net_loss
        loss_glm_valid[epoch] = valid_glm_loss
        print(f"valid neural net loss: {valid_neural_net_loss: .4f}")
        print(f"valid glm loss: {valid_glm_loss: .4f}")

        # compute metrics
        predicted_zeta = neural_net_zeta.squeeze()
        mae = F.l1_loss(predicted_zeta, glm_zeta)
        mse = F.mse_loss(predicted_zeta, glm_zeta)
        correlation_coefficient = np.corrcoef(glm_zeta.numpy(), predicted_zeta.numpy())[0, 1]
        print("Correlation Coefficient:", correlation_coefficient)
        print(f"Mean Absolute Error: {mae.item():.4f}")
        print(f"Mean Squared Error: {mse.item():.4f}")

        # Report learning-rate changes here instead of relying on the
        # scheduler's deprecated `verbose` flag.
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(train_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Reduced learning rate to {lr_after}")

    return model

In [None]:
# Run the full training loop (configured by the module-level config variables) and keep the trained model.
model = train()

In [None]:
# save model parameters

import os
from datetime import datetime

# Time of this save. It is not part of the filename because the load cell
# reads the fixed path "models/LSTM_Model.pth".
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = "models/LSTM_Model.pth"

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(filename), exist_ok=True)
torch.save(model.state_dict(), filename)

In [None]:
# load model state

model = build_model(model_type, num_lstm_layers, bidirectional, hidden_layer_sizes, activation_func,
                filter_size, pool_type, pool_size, dropout, window_size, weight_init)

# build_model returns the model on `device`, so load the checkpoint onto that device.
model.load_state_dict(torch.load("models/LSTM_Model.pth", map_location=device))

cuda_available = torch.cuda.is_available()
print("CUDA (GPU support) is available:", cuda_available)
num_gpus = torch.cuda.device_count()
print("Number of GPUs available:", num_gpus)

# The model is deliberately not wrapped in DataParallel: other cells read
# `model.name` and `model.linear` directly, which the wrapper does not expose.
model = model.to(device)

first_param_device = next(model.parameters()).device
print("Model is on device:", first_param_device)

model.double()

In [None]:
# Copy the weights of `model.linear` to a NumPy array and print row 0 as
# `"feature": weight` pairs, paired positionally with `feature_names`.
weights = model.linear.weight.detach().cpu().numpy()
combined = ', '.join(f'"{s}": {f}' for s, f in zip(feature_names, weights[0]))
print(combined)

In [None]:
# GLM kappa coefficients (12 values).
# NOTE(review): the covariate each value belongs to is not recorded here;
# presumably it follows the "GLM K" note in the next cell -- confirm the order.
glm_kappa = [
    -0.0224536145637661,
    -0.094592589,
    -0.023815382,
    0.030402922,
    -0.067234092,
    -0.032196914,
    -0.040911478,
    -0.018557168,
    -0.033545905,
    -0.051103287,
    -0.204434712,
    0.015831043,
]

In [None]:
"""
GLM K

* ctcf: -0.02
* h3k36me3: -0.09
* h3k4me1: -0.02
* h3k79me2: +0.03
* h3k9me1: -0.06
* h3k9me3: -0.03
* h4k20me1: -0.04
* sj5: -0.02
* sj3: -0.03
* dms->stem-loop: -0.05
* rpts->low-complex: +0.01
* wgbs->DNAm: -0.2
"""

In [None]:
# plot loss curve
# Expects the per-epoch loss lists (loss_neural_net_train, loss_neural_net_valid,
# loss_glm_valid) to exist at module level.

# Use a separate name for the x axis so the integer `epochs` setting that
# train() reads is not overwritten by a range.
epoch_axis = range(1, len(loss_neural_net_train) + 1)
plt.plot(epoch_axis, loss_neural_net_train, label='train_neural_net_loss')
plt.plot(epoch_axis, loss_neural_net_valid, label='valid_neural_net_loss')
plt.plot(epoch_axis, loss_glm_valid, label='valid_glm_loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
def plot_data(glm_zeta, net_zeta):
    """Scatter the neural-net and GLM elongation rates against their site index.

    Both series are drawn on one 10x5 axes (network in blue, GLM in orange),
    with the y axis fixed to the range 0.5 to 1.5.
    """
    indices = range(len(glm_zeta))
    marker_style = dict(s=10, alpha=0.5)

    fig, ax = plt.subplots(figsize=(10, 5))

    # Network first, GLM second, so the GLM points are drawn on top.
    ax.scatter(indices, net_zeta, color='blue', label='Neural Net Zeta', **marker_style)
    ax.scatter(indices, glm_zeta, color='orange', label='GLM Zeta', **marker_style)

    ax.set_title('Neural Net vs GLM Elongation Rate')
    ax.set_xlabel('Index')
    ax.set_ylabel('Elongation Rate')
    ax.legend()

    ax.set_ylim(0.5, 1.5)

    plt.show()


In [None]:
# Switch to inference mode (disables dropout) before evaluating on the test split.
model.eval()

# build_dataset is defined with a single required argument (the data split).
tstdl = build_dataset(test_data)
data_iter = iter(tstdl)

In [None]:
# compute metrics for test dataset

loss_fn = CustomLoss()

total_net_loss = 0
total_glm_loss = 0
with torch.no_grad():
    for batch in tstdl:
        y_inputs = batch['Y_ji'].to(device)
        X_ji = batch['X_ji'].to(device)
        Z_ji = batch['Z_ji'].to(device)
        mask = batch['Mask'].to(device)
        # the collated batch stores gene lengths under 'Length'
        lengths = batch['Length'].to(device)

        # C_j may arrive as a tuple of 0-dim tensors; stack it so it can be moved.
        C_j = batch['C_j']
        if not torch.is_tensor(C_j):
            C_j = torch.stack([torch.as_tensor(c) for c in C_j])
        C_j = C_j.to(device)

        rho_ji = model(y_inputs)
        # CNN models already return (batch, sites); the others carry a trailing dim of 1.
        net_rho = rho_ji if model.name in ('dc_cnn', 'cnn') else rho_ji.squeeze(2)
        # Padded Z_ji sites are not positive; swap in 1 there so log() stays finite.
        glm_rho = torch.log(torch.where(mask, Z_ji, torch.ones_like(Z_ji)))

        net_loss = loss_fn(X_ji, C_j, net_rho, lengths)
        glm_loss = loss_fn(X_ji, C_j, glm_rho, lengths)

        total_net_loss += net_loss.item()
        total_glm_loss += glm_loss.item()



print(f"Neural Net Loss: {total_net_loss/len(tstdl):.4f}")
print(f"GLM Loss: {total_glm_loss/len(tstdl):.4f}")

In [None]:
# plot for subset of genes in test dataset
# Each batch is padded and holds several genes; only its first gene is plotted,
# trimmed to that gene's true length.
for i in range(0, 4):
    inputs = next(data_iter)
    # `inputs` is a dict, so len(inputs) would count its keys; count the genes instead.
    print("number of samples: " + str(len(inputs['GeneId'])))

    with torch.no_grad():
        y_inputs = inputs['Y_ji'].to(device)
        rho_ji = model(y_inputs)

    gene_length = int(inputs['Length'][0])
    glm_zeta = inputs['Z_ji'][0, :gene_length]
    # convert log(Z) outputs to Z
    net_zeta = torch.exp(rho_ji[0, :gene_length].cpu()).reshape(-1)

    plot_data(glm_zeta, net_zeta)

In [None]:
# compute metrics for test dataset

net_zeta = []
glm_zeta = []
with torch.no_grad():
    for batch in tstdl:
        y_inputs = batch['Y_ji'].to(device)
        rho_ji = model(y_inputs)
        # convert log(Z) outputs to Z
        zeta_pred = torch.exp(rho_ji.cpu())
        if zeta_pred.dim() == 3:
            # drop the trailing output dim so predictions align with the (batch, sites) mask
            zeta_pred = zeta_pred.squeeze(-1)
        # keep every real (unpadded) site of every gene in the batch
        site_mask = batch['Mask']
        net_zeta.append(zeta_pred[site_mask])
        glm_zeta.append(batch['Z_ji'][site_mask])

net_zeta = torch.cat(net_zeta, dim=0)
glm_zeta = torch.cat(glm_zeta, dim=0)
mae = F.l1_loss(net_zeta.squeeze(), glm_zeta)
mse = F.mse_loss(net_zeta.squeeze(), glm_zeta)

correlation_coefficient = np.corrcoef(glm_zeta.numpy(), net_zeta.squeeze().numpy())[0, 1]
print("Correlation Coefficient:", correlation_coefficient)

print(f"Mean Absolute Error: {mae.item():.4f}")
print(f"Mean Squared Error: {mse.item():.4f}")

In [None]:
def density_plot(glm_zeta, net_zeta, gene_id):
    """Draw a filled 2-D KDE of GLM vs. neural-net elongation rates, titled ``gene_id``.

    The axes are clipped to the observed min/max of each series.
    """
    x_bounds = [min(glm_zeta), max(glm_zeta)]
    y_bounds = [min(net_zeta), max(net_zeta)]

    sns.kdeplot(x=glm_zeta, y=net_zeta, fill=True, cmap="Blues")

    plt.xlim(x_bounds)
    plt.ylim(y_bounds)

    plt.title(gene_id)
    plt.xlabel('GLM Elongation Rate')
    plt.ylabel('Neural Net Elongation Rate')
    plt.show()

In [None]:
# plot for all genes in test dataset
# Batches are padded and hold several genes, so each gene is plotted on its
# own, trimmed to its true length and titled with its own id.

model.eval()
for batch in tstdl:
    with torch.no_grad():
        y_inputs = batch['Y_ji'].to(device)
        rho_ji = model(y_inputs)

    # convert log(Z) outputs to Z
    zeta_pred = torch.exp(rho_ji.cpu())
    if zeta_pred.dim() == 3:
        # drop the trailing output dim -> (batch, sites)
        zeta_pred = zeta_pred.squeeze(-1)

    for row, gene_id in enumerate(batch['GeneId']):
        gene_length = int(batch['Length'][row])
        glm_zeta = batch['Z_ji'][row, :gene_length].numpy()
        net_zeta = zeta_pred[row, :gene_length].numpy()

        density_plot(glm_zeta, net_zeta, gene_id)

        plot_data(glm_zeta, net_zeta)

In [None]:
# plot scatterplot of neural net weights and glm weights

# `glm_kappa` has 12 entries while `feature_names` / `weights[0]` have a
# different length and order, so the two are paired by feature name rather than
# by position.
# NOTE(review): this covariate order for `glm_kappa` is inferred by matching
# its values against the "GLM K" note earlier in the notebook -- confirm it
# against the GLM fit before trusting the labels.
glm_feature_order = ['ctcf', 'h3k36me3', 'h3k4me1', 'h3k79me2', 'h3k9me1', 'h3k9me3',
                     'h4k20me1', 'sj5', 'sj3', 'dms', 'wgbs', 'rpts']
glm_kappa_by_feature = dict(zip(glm_feature_order, glm_kappa))

# Keep only features that have both a GLM coefficient and a network weight.
paired_weights = [
    (name, glm_kappa_by_feature[name], weight)
    for name, weight in zip(feature_names, weights[0])
    if name in glm_kappa_by_feature
]
if not paired_weights:
    raise ValueError("No features shared between the GLM coefficients and the model weights")

shared_names = [name for name, _, _ in paired_weights]
shared_glm = [kappa for _, kappa, _ in paired_weights]
shared_net = [weight for _, _, weight in paired_weights]

plt.figure(figsize=(10, 10))

sns.scatterplot(x=shared_glm, y=shared_net)

for name, kappa, weight in paired_weights:
    plt.text(kappa, weight, name, fontsize=13, ha='right', va='top')
plt.xlabel('GLM Weights')
plt.ylabel('Neural Net Weights')

max_val = max(np.max(shared_glm), np.max(shared_net)) + 0.04
min_val = min(np.min(shared_glm), np.min(shared_net)) - 0.04

# NOTE(review): passing (max, min) draws both axes inverted; kept as in the
# original -- confirm this is intended.
plt.xlim(max_val, min_val)
plt.ylim(max_val, min_val)

# Show the plot
plt.show()