In [None]:
""" Testing/Debugging File """

"""
Restart kernel after running
Only need to run once
"""
!pip install scikit-learn matplotlib seaborn

In [9]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.utils.data as td
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, Subset, Sampler

In [10]:
"""
import gc 

torch.cuda.empty_cache()  # Clear CUDA cache
gc.collect()  
"""

'\nimport gc \n\ntorch.cuda.empty_cache()  # Clear CUDA cache\ngc.collect()  \n'

In [11]:
filename = './data/k562_datasets.pkl'

with open(filename, 'rb') as file:
    combined_datasets = pickle.load(file)

train_data = combined_datasets['train']
valid_data = combined_datasets['valid']
test_data = combined_datasets['test']

print(train_data.iloc[0])

column_names = np.array(train_data.columns)
feature_names = column_names[6:16]
num_features = len(feature_names)
print(feature_names)
num_samples = train_data.shape[0]
nucleotides = ['A', 'T', 'G', 'C']
num_seq_features = 4

print("Number of Samples: " + str(num_samples))
print("Number of Features: " + str(num_features))

seqnames                         1
start                      1002760
end                        1002760
strand                           +
ensembl_gene_id    ENSG00000187608
score                          0.0
ctcf                      -0.07771
h4k20me1                 -0.429997
h3k79me2                   -0.2804
h3k4me1                  -0.217665
h3k9me3                  -0.333359
h3k36me3                 -0.801406
sj5                      -0.039619
sj3                      -0.059131
rpts                     -0.187111
wgbs                           0.0
lambda_alphaj             0.026377
zeta                      1.133344
A                                0
T                                0
G                                1
C                                0
dataset                      train
Name: 0, dtype: object
['ctcf' 'h4k20me1' 'h3k79me2' 'h3k4me1' 'h3k9me3' 'h3k36me3' 'sj5' 'sj3'
 'rpts' 'wgbs']
Number of Samples: 136927782
Number of Features: 10


In [12]:
cuda_available = torch.cuda.is_available()
print("CUDA (GPU support) is available:", cuda_available)
num_gpus = torch.cuda.device_count()
print("Number of GPUs available:", num_gpus)

CUDA (GPU support) is available: True
Number of GPUs available: 1


In [13]:
class GeneDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.gene_ids = dataframe['ensembl_gene_id'].unique()
        self.genes = dataframe.groupby('ensembl_gene_id')
        self.cache = {}

    def __len__(self):
        return len(self.gene_ids)

    def __getitem__(self, idx):
        gene_id = self.gene_ids[idx]
        gene = self.genes.get_group(gene_id)
                
        if gene_id in self.cache:
            return self.cache[gene_id]
 
        result = {
            'GeneId': gene_id,
            'Seq_Name': gene['seqnames'].iloc[0],
            'Start': gene['start'],
            'End': gene['end'],
            'Strand': gene['strand'],
            
            # epigenomic features per gene j, site i
            'Y_ji':  torch.tensor(gene[feature_names].values, dtype=torch.float64),
            
            # read counts per gene j, site i
            'X_ji': torch.tensor(gene['score'].values, dtype=torch.float64),
            
            # read depth * initiation rate values per gene j
            'C_j': torch.tensor(gene['lambda_alphaj'].iloc[0], dtype=torch.float64),
            
            # GLM elongation rate predictions per gene j, site i
            'Z_ji': torch.tensor(gene['zeta'].values, dtype=torch.float64),
            
            # one-hot encoded sequences
            'N_ji': torch.tensor(gene[nucleotides].values, dtype=torch.float64), 
            'Length': len(gene)
        }
    
        self.cache[gene_id] = result

        return result

In [14]:
from torch.utils.data import BatchSampler

class GeneBatchSampler(Sampler):
    def __init__(self, dataset, batch_size, bucket_size):
        self.dataset = dataset
        self.batch_size = batch_size
        self.bucket_size = bucket_size

        lengths = torch.tensor([dataset[i]['Length'] for i in range(len(dataset))])
        max_length = lengths.max().item()
        min_length = lengths.min().item()
        self.n_buckets = ((max_length - min_length) // bucket_size) + 1
        self.boundaries = torch.arange(min_length, max_length + bucket_size, step=bucket_size)
        self.bucket_indices = torch.bucketize(lengths, self.boundaries, right=True)

        # Efficient grouping of indices into buckets using PyTorch operations
        self.buckets = [torch.where(self.bucket_indices == i + 1)[0].tolist() for i in range(self.n_buckets)]

    def __iter__(self):
        for bucket in self.buckets:
            for batch in BatchSampler(torch.utils.data.SubsetRandomSampler(bucket), self.batch_size, drop_last=False):
                yield batch

    # calculate number of batches created
    def __len__(self):
        # Include partial batch in calculation
        return sum((len(bucket) + self.batch_size - 1) // self.batch_size for bucket in self.buckets)


In [15]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    GeneIds, Seq_Names, Start, End, Strand, C_j, Lengths = zip(*[(item['GeneId'], item['Seq_Name'], item['Start'], item['End'], item['Strand'], item['C_j'], item['Length']) for item in batch])
    
    Y_ji = pad_sequence([item['Y_ji'] for item in batch], batch_first=True, padding_value=0.0)
    X_ji = pad_sequence([item['X_ji'] for item in batch], batch_first=True, padding_value=0.0)
    Z_ji = pad_sequence([item['Z_ji'] for item in batch], batch_first=True, padding_value=1.0)
    N_ji = pad_sequence([item['N_ji'] for item in batch], batch_first=True, padding_value=-1)
    
    longest_seq_length = torch.arange(X_ji.size(1)).unsqueeze(0)
    seq_lengths = torch.tensor(Lengths).unsqueeze(-1) 
    mask = longest_seq_length < seq_lengths
    
    return {
        'GeneId': GeneIds,
        'Seq_Name': Seq_Names,
        'Start': Start,
        'End': End,
        'Strand': Strand,
        'Y_ji': Y_ji,
        'X_ji': X_ji,
        'C_j': torch.stack(C_j).unsqueeze(1),
        'Z_ji': Z_ji,
        'N_ji': N_ji,
        'Mask': mask,
        'Length': len(X_ji[0])
    }

In [16]:
def build_dataset(data, batch_size, bucket_size):
    dataset = GeneDataset(data)
    batch_sampler = GeneBatchSampler(dataset, batch_size, bucket_size)
    loader = DataLoader(dataset, batch_sampler=batch_sampler, num_workers=7, collate_fn=collate_fn)
    return loader

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def build_model(model_type, num_features, num_seq_features, y_hidden_layer_sizes, y_filter_size, y_dilation_rate,
                     n_hidden_layer_sizes, n_filter_size, n_dilation_rate, dropout, weight_init,
                     lstm_hidden_layer_size, num_lstm_layers=None, bidirectional=False):

    class EpLinearModel(nn.Module):
        def __init__(self, input_size):
            super(EpLinearModel, self).__init__()
            self.name = "ep_linear"
            self.linear = nn.Linear(input_size, 1, bias=False)

        def forward(self, Y_ji):
            x = self.linear(Y_ji)
            return x.squeeze(-1)   
    
    class EpSeqLinearModel(nn.Module):
        def __init__(self, input_size):
            super(EpSeqLinearModel, self).__init__()
            self.name = "ep_seq_linear"
            self.linear = nn.Linear(input_size, 1, bias=False)

        def forward(self, Y_ji, N_ji):
            x = torch.cat((Y_ji, N_ji), axis=-1)
            x = self.linear(x)
            return x.squeeze(-1)
    
    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_layer_size, num_layers, bidirectional):
            super(LSTMModel, self).__init__()
            self.name = "lstm"
            self.lstm = nn.LSTM(input_size, hidden_layer_size, num_layers, bidirectional=bidirectional, batch_first=True)
            self.bidirectional_linear = nn.Linear(2 * hidden_layer_size, 1)
            self.linear = nn.Linear(hidden_layer_size, 1)
            self.bidirectional = bidirectional

        def forward(self, Y_ji, N_ji):
            x = torch.cat((Y_ji, N_ji), axis=-1)
            x, _ = self.lstm(x)
            if self.bidirectional:
                x = self.bidirectional_linear(x)
            else:
                x = self.linear(x)
            return x.squeeze(-1)

    class CNN(nn.Module):
        def __init__(self, num_features, num_seq_features, y_hidden_layer_sizes, y_filter_size, y_dilation_rate,
                     n_hidden_layer_sizes, n_filter_size, n_dilation_rate, dropout, 
                     lstm_hidden_layer_size, num_lstm_layers=None, bidirectional=False):
            super(CNN, self).__init__()
            self.name = "cnn"            

            self.y_convs = nn.ModuleList()
            y_in_channels = num_features
            if isinstance(y_filter_size, int):
                y_filter_size = [y_filter_size] * len(y_hidden_layer_sizes)
            elif isinstance(y_filter_size, list):
                current_length = len(y_filter_size)
                if current_length < len(y_hidden_layer_sizes):
                    # Extend filter_size by repeating its last value to match the target length
                    y_filter_size = y_filter_size + [y_filter_size[-1]] * (len(y_hidden_layer_sizes) - current_length)
            
            y_dilation = 1
            for idx, out_channels in enumerate(y_hidden_layer_sizes):
                y_padding = ((y_filter_size[idx] - 1) * y_dilation) // 2
                if y_dilation_rate > 0:
                    self.y_convs.append(
                        nn.Conv1d(y_in_channels, out_channels, y_filter_size[idx], stride=1, padding=y_padding, dilation=y_dilation)
                    )
                    y_dilation *= y_dilation_rate
                else:
                    self.y_convs.append(
                        nn.Conv1d(y_in_channels, out_channels, y_filter_size[idx], stride=1, padding='same')
                    )
                y_in_channels = out_channels
            
            
            self.n_convs = nn.ModuleList()
            n_in_channels = num_seq_features
            if isinstance(n_filter_size, int):
                n_filter_size = [n_filter_size] * len(n_hidden_layer_sizes)
            elif isinstance(n_filter_size, list):
                current_length = len(n_filter_size)
                if current_length < len(n_hidden_layer_sizes):
                    # Extend filter_size by repeating its last value to match the target length
                    n_filter_size = n_filter_size + [n_filter_size[-1]] * (len(n_hidden_layer_sizes) - current_length)
            

            n_dilation = 1
            for idx, out_channels in enumerate(n_hidden_layer_sizes):
                n_padding = ((n_filter_size[idx] - 1) * n_dilation) // 2
                if n_dilation_rate > 0:
                    self.n_convs.append(
                        nn.Conv1d(n_in_channels, out_channels, n_filter_size[idx], stride=1, padding=n_padding, dilation=n_dilation)
                    )
                    n_dilation *= n_dilation_rate
                else:
                    self.n_convs.append(
                        nn.Conv1d(n_in_channels, out_channels, n_filter_size[idx], stride=1, padding='same')
                    )
                n_in_channels = out_channels

            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(dropout)

            # Final convolutional layer to map to a single output channel
            # Since the output needs to be (batch_size, seq_len), we map the final features to 1
            self.final_conv = nn.Conv1d(y_hidden_layer_sizes[-1] + n_hidden_layer_sizes[-1], 1, 1)  # 1x1 convolution
            
            if num_lstm_layers > 0:
                self.gru = nn.GRU(input_size=y_hidden_layer_sizes[-1] + n_hidden_layer_sizes[-1], hidden_size=lstm_hidden_layer_size, num_layers=num_lstm_layers, bidirectional=bidirectional, batch_first=True)
            self.final_linear = nn.Linear(lstm_hidden_layer_size, 1)
            self.bidirectional = True
            self.final_bidirectional_linear = nn.Linear(lstm_hidden_layer_size*2, 1)
            
        def forward(self, Y_ji, N_ji):
            Y_ji = Y_ji.permute(0, 2, 1)  
            N_ji = N_ji.permute(0, 2, 1)
            
            for conv in self.y_convs:
                Y_ji = conv(Y_ji)
                Y_ji = self.relu(Y_ji)
                Y_ji = self.dropout(Y_ji)
            
            for conv in self.n_convs:
                N_ji = conv(N_ji)
                N_ji = self.relu(N_ji)
                N_ji = self.dropout(N_ji)

            x = torch.cat((Y_ji, N_ji), 1)
            
            if num_lstm_layers > 0:
                x = x.permute(0,2,1)
                x, (h_n, c_n) = self.gru(x)
                if self.bidirectional:
                    x = self.final_bidirectional_linear(x)
                else:
                    x = self.final_linear(x)
                x = x.squeeze(-1)
            
            else:
                x = self.final_conv(x)
                x = x.squeeze(1)  
                
            return x
    
    if model_type == 'lstm':
        model = LSTMModel(num_features + num_seq_features, lstm_hidden_layer_sizes, num_lstm_layers, bidirectional)
    elif model_type == 'ep_seq_linear':
        model = EpSeqLinearModel(num_features + num_seq_features)
    elif model_type == 'ep_linear':
        model = EpLinearModel(num_features)
    elif model_type == 'cnn':
        model = CNN(num_features, num_seq_features, y_hidden_layer_sizes, y_filter_size, y_dilation_rate, 
                    n_hidden_layer_sizes, n_filter_size, n_dilation_rate, dropout, lstm_hidden_layer_size,
                    num_lstm_layers, bidirectional)
    
    if cuda_available:
        if num_gpus > 1:
            print("Using", num_gpus, "GPUs")
            Dmodel = torch.nn.DataParallel(model)
        model = model.to('cuda')

    print(model)
    
    # expected weights are close to 0 which is why 0 initializing weights converges much quicker
    if weight_init == 'zero':
        with torch.no_grad():
            for param in model.parameters():
                param.zero_()
    
    model.double()

    return model.to(device)

In [18]:
def build_optimizer(network, learning_rate):
    optimizer = optim.Adam(network.parameters(),
                           lr=learning_rate)
    return optimizer

In [19]:
def valid_epoch(model, loader, loss_fn):
    model.eval()
    total_neural_net_loss = 0
    total_glm_loss = 0
    neural_net_zeta = []
    glm_zeta = []
    
    with torch.no_grad():
        for idx, batch in enumerate(loader):
            Y_ji_batch = batch['Y_ji'].to(device)
            X_ji_batch = batch['X_ji'].to(device)
            N_ji_batch = batch['N_ji'].to(device) 
            C_j_batch = batch['C_j'].to(device)
            Z_ji_batch = batch['Z_ji'].to(device)
            mask = batch['Mask'].to(device)
            
            if model.name == "ep_linear":
                outputs = model(Y_ji_batch)
            else:
                outputs = model(Y_ji_batch, N_ji_batch)
            
            neural_net_loss = loss_fn(X_ji_batch, C_j_batch, outputs, mask)
            glm_loss = loss_fn(X_ji_batch, C_j_batch, torch.log(Z_ji_batch), mask)

            total_neural_net_loss +=  neural_net_loss.item()
            total_glm_loss += glm_loss.item()
            
            # store all predictions in list
            neural_net_zeta.append(torch.exp(outputs.cpu().flatten()))
            glm_zeta.append(batch['Z_ji'].flatten())
    
    # calculate average loss across all batches
    avg_neural_net_loss = total_neural_net_loss / len(loader)
    avg_glm_loss = total_glm_loss / len(loader)
    
    neural_net_zeta = torch.cat(neural_net_zeta)
    glm_zeta = torch.cat(glm_zeta)
    
    return avg_neural_net_loss, avg_glm_loss, neural_net_zeta, glm_zeta

In [20]:
def train_epoch(model, loader, optimizer, loss_fn):
    model.train()
    total_loss = 0
    for idx, batch in enumerate(loader):
        optimizer.zero_grad()
        Y_ji_batch = batch['Y_ji'].to(device) 
        X_ji_batch = batch['X_ji'].to(device)
        C_j_batch = batch['C_j'].to(device)
        N_ji_batch = batch['N_ji'].to(device)
        mask = batch['Mask'].to(device)
                
        if model.name == "ep_linear":
            outputs = model(Y_ji_batch)
        else:
            outputs = model(Y_ji_batch, N_ji_batch)        
            
        loss = loss_fn(X_ji_batch, C_j_batch, outputs, mask)
        loss.backward()
        optimizer.step()
        
        # calculate average loss across all batches
        total_loss += loss.item()
    avg_train_loss = total_loss / len(loader)
    
    return avg_train_loss

In [21]:
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()

    def forward(self, X_ji, C_j, rho_ji, mask):
        loss = (X_ji * rho_ji + C_j * torch.exp(-rho_ji) - X_ji * torch.log(C_j)) * mask
                
        non_padded_elements = mask.sum()
        
        avg_loss = loss.sum() / non_padded_elements
                    
        return avg_loss

In [22]:
# model configs
model_type = 'cnn'
ep_ft_configs = {
    'hidden_layer_sizes': [16],
    'filter_size': 10,
    'dilation_rate': 0,
}
seq_ft_configs = {
    'hidden_layer_sizes': [32, 64, 128],
    'filter_size': [3, 5, 10],
    'dilation_rate': 0,
}
dropout = 0.5
lstm_hidden_layer_size = 16
num_lstm_layers = 0
bidirectional = True
weight_init = None
# dataset configs
dataset_configs ={
    'batch_size': 10,
    'bucket_size': 2000,
}
# optimizer configs
learning_rate = 1e-4

In [23]:
def calculate_metrics(neural_net_zeta, glm_zeta):
    neural_net_zeta = torch.tensor(neural_net_zeta, dtype=torch.float64)
    glm_zeta = torch.tensor(glm_zeta, dtype=torch.float64)

    mse = F.mse_loss(neural_net_zeta, glm_zeta)
    mae = F.l1_loss(neural_net_zeta, glm_zeta)
    correlation_coefficient = np.corrcoef(neural_net_zeta, glm_zeta)[0, 1]

    print(f"MSE: {mse.item()}")
    print(f"MAE: {mae.item()}")
    print("Correlation Coefficient:", correlation_coefficient)

In [24]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
epochs = 20
increase_cut=0.00001
patience=5

def train():
    model = build_model(model_type, num_features, num_seq_features, ep_ft_configs['hidden_layer_sizes'], 
                        ep_ft_configs['filter_size'], ep_ft_configs['dilation_rate'],
                        seq_ft_configs['hidden_layer_sizes'], seq_ft_configs['filter_size'], 
                        seq_ft_configs['dilation_rate'], dropout, weight_init,
                        lstm_hidden_layer_size, num_lstm_layers, bidirectional)
    
    train_loader = build_dataset(train_data, batch_size=dataset_configs['batch_size'], bucket_size=dataset_configs['bucket_size'])
    valid_loader = build_dataset(valid_data, batch_size=dataset_configs['batch_size'], bucket_size=dataset_configs['bucket_size'])
    
    optimizer = build_optimizer(model, learning_rate)
    
    loss_fn = torch.jit.script(CustomLoss())
    # track loss curves
    loss_neural_net_train = [0] * epochs
    loss_neural_net_valid = [0] * epochs
    loss_glm_valid = [0] * epochs
    
    # scheduler to reduce learning rate by half when new validation loss > old validation loss
    old_neural_net_valid_loss = float('inf')
    epochs_no_improve = 0
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}')
        
        train_loss = train_epoch(model, train_loader, optimizer, loss_fn)
        loss_neural_net_train[epoch] = train_loss
        print(f"train loss: {train_loss: .4f}")
        
        valid_neural_net_loss, valid_glm_loss, neural_net_zeta, glm_zeta = valid_epoch(model, valid_loader, loss_fn)
        loss_neural_net_valid[epoch] = valid_neural_net_loss
        loss_glm_valid[epoch] = valid_glm_loss
        print(f"valid neural net loss: {valid_neural_net_loss: .4f}")
        print(f"valid glm loss: {valid_glm_loss: .4f}")
        
        calculate_metrics(neural_net_zeta, glm_zeta)
        
        # early stopping
        if valid_neural_net_loss < old_neural_net_valid_loss - increase_cut:
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            
        if epochs_no_improve == patience:
            print("Early Stopping")
            break
        
        # reduce learning rate if new loss > old loss
        if valid_neural_net_loss > old_neural_net_valid_loss:
            optimizer.param_groups[0]['lr'] *= 0.5
            print(f"Reduced learning rate to {optimizer.param_groups[0]['lr']}")
            
        old_neural_net_valid_loss = valid_neural_net_loss
        scheduler.step(valid_neural_net_loss)
        
    return model

In [25]:
model = train()

CNN(
  (y_convs): ModuleList(
    (0): Conv1d(10, 16, kernel_size=(10,), stride=(1,), padding=same)
  )
  (n_convs): ModuleList(
    (0): Conv1d(4, 32, kernel_size=(3,), stride=(1,), padding=same)
    (1): Conv1d(32, 64, kernel_size=(5,), stride=(1,), padding=same)
    (2): Conv1d(64, 128, kernel_size=(10,), stride=(1,), padding=same)
  )
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (final_conv): Conv1d(144, 1, kernel_size=(1,), stride=(1,))
  (final_linear): Linear(in_features=16, out_features=1, bias=True)
  (final_bidirectional_linear): Linear(in_features=32, out_features=1, bias=True)
)
Epoch 1


  self.padding, self.dilation, self.groups)


train loss:  0.1336
valid neural net loss:  0.1183
valid glm loss:  0.1282


  
  This is separate from the ipykernel package so we can avoid doing imports until


MSE: 1.37634101431385
MAE: 0.8767782800164573
Correlation Coefficient: 0.1301996272875648
Epoch 2
train loss:  0.1277
valid neural net loss:  0.1185
valid glm loss:  0.1291
MSE: 2.776417329433728
MAE: 1.0229304164468431
Correlation Coefficient: 0.1703932759115013
Reduced learning rate to 5e-05
Epoch 3


KeyboardInterrupt: 

In [None]:
# save model parameters

from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"models/LSTM_Model.pth"
torch.save(model.state_dict(), filename)

In [None]:
# load model state

model = build_model(model_type, num_lstm_layers, bidirectional, hidden_layer_sizes, activation_func,
                filter_size, pool_type, pool_size, dropout, window_size, weight_init)

model.load_state_dict(torch.load("models/LSTM_Model.pth", map_location=torch.device('cpu')))

cuda_available = torch.cuda.is_available()
print("CUDA (GPU support) is available:", cuda_available)
num_gpus = torch.cuda.device_count()
print("Number of GPUs available:", num_gpus)
if cuda_available:
    if num_gpus > 1:
        print("Using", num_gpus, "GPUs")
        model = torch.nn.DataParallel(model)
    model = model.to('cuda')

first_param_device = next(model.parameters()).device
print("Model is on device:", first_param_device)

model.double()

In [None]:
weights = model.linear.weight.data.cpu().numpy()
combined = ', '.join([f'"{s}": {f}' for s, f in zip(feature_names, weights[0])])
print(combined)

In [None]:
glm_kappa = [-0.0224536145637661, -0.094592589, -0.023815382, 0.030402922, -0.067234092, -0.032196914, -0.040911478, -0.018557168, -0.033545905, -0.051103287, -0.204434712, 0.015831043]

In [None]:
"""
GLM K

* ctcf: -0.02
* h3k36me3: -0.09
* h3k4me1: -0.02
* h3k79me2: +0.03
* h3k9me1: -0.06
* h3k9me3: -0.03
* h4k20me1: -0.04
* sj5: -0.02
* sj3: -0.03
* dms->stem-loop: -0.05
* rpts->low-complex: +0.01
* wgbs->DNAm: -0.2
"""

In [None]:
# plot loss curve

epochs = range(1, len(loss_neural_net_train) + 1)
plt.plot(epochs, loss_train, label='train_neural_net_loss')
plt.plot(epochs, loss_neural_net_valid, label='valid_neural_net_loss')
plt.plot(epochs, loss_glm_valid, label='valid_glm_loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show 

In [None]:
def plot_data(glm_zeta, net_zeta):
    indices = range(len(glm_zeta))
    
    fig, ax = plt.subplots(figsize=(10, 5))
    
    ax.scatter(indices, net_zeta, color='blue', label='Neural Net Zeta', s=10, alpha=0.5)
    ax.scatter(indices, glm_zeta, color='orange', label='GLM Zeta', s=10, alpha=0.5)
    
    ax.set_title('Neural Net vs GLM Elongation Rate')
    ax.set_xlabel('Index')
    ax.set_ylabel('Elongation Rate')
    ax.legend()
    
    plt.ylim(0.5, 1.5)

    plt.show()


In [None]:
model.eval()

tstdl = build_dataset(test_data, use_sliding_window, window_size)
data_iter = iter(tstdl)

In [None]:
# compute metrics for test dataset

loss_fn = CustomLoss()

total_net_loss = 0
total_glm_loss = 0
with torch.no_grad():
    for batch in tstdl:
        y_inputs = batch['Y_ji'].to(device)
        X_ji = batch['X_ji'].to(device)
        C_j = batch['C_j'].to(device)
        lengths = batch['gene_length'].to(device)
        Z_ji = batch['Z_ji'].to(device)
        rho_ji = model(y_inputs)
        net_loss = loss_fn(X_ji, C_j, rho_ji.squeeze(2), lengths)
        glm_loss = loss_fn(X_ji, C_j, torch.log(Z_ji), lengths)
        
        total_net_loss += net_loss.item()
        total_glm_loss += glm_loss.item()



print(f"Neural Net Loss: {total_net_loss/len(tstdl):.4f}")
print(f"GLM Loss: {total_glm_loss/len(tstdl):.4f}")

In [None]:
# plot for subset of genes in test dataset
for i in range(0, 4):
    inputs = next(data_iter) 
    print("number of samples: " + str(len(inputs)))

    with torch.no_grad():
        y_inputs = inputs['Y_ji'].to(device)
        rho_ji = model(y_inputs)

    glm_zeta = inputs['Z_ji'][0]
    # convert log(Z) outputs to Z
    net_zeta = torch.exp(rho_ji.cpu().squeeze())
    
    plot_data(glm_zeta, net_zeta)

In [None]:
# compute metrics for test dataset

net_zeta = []
glm_zeta = []
with torch.no_grad():
    for batch in tstdl:
        y_inputs = batch['Y_ji'].to(device)
        rho_ji = model(y_inputs)
        # convert log(Z) outputs to Z
        net_zeta.append(torch.exp(rho_ji.cpu()[0]))
        glm_zeta.append(batch['Z_ji'][0])

net_zeta = torch.cat(net_zeta, dim=0)
glm_zeta = torch.cat(glm_zeta, dim=0)
mae = F.l1_loss(net_zeta.squeeze(), glm_zeta)
mse = F.mse_loss(net_zeta.squeeze(), glm_zeta)

correlation_coefficient = np.corrcoef(glm_zeta, net_zeta.squeeze())[0, 1]
print("Correlation Coefficient:", correlation_coefficient)

print(f"Mean Absolute Error: {mae.item():.4f}")
print(f"Mean Squared Error: {mse.item():.4f}")

In [None]:
def density_plot(glm_zeta, net_zeta, gene_id):
    sns.kdeplot(x=glm_zeta, y=net_zeta, fill=True, cmap="Blues")
            
    plt.xlim([min(glm_zeta), max(glm_zeta)])
    plt.ylim([min(net_zeta), max(net_zeta)])


    plt.xlabel('GLM Elongation Rate')
    plt.ylabel('Neural Net Elongation Rate')
    plt.title(gene_id)
    plt.show()

In [None]:
# plot for all genes in test dataset

total_loss = 0
loss_fn = CustomLoss()
for batch in tstdl:
    gene_id = batch['GeneId'][0]
    model.eval()

    with torch.no_grad():
        y_inputs = batch['Y_ji'].to(device)
        rho_ji = model(y_inputs)
        
    glm_zeta = batch['Z_ji']#[0]
    # convert log(Z) outputs to Z
    net_zeta = torch.exp(rho_ji.cpu().squeeze())
        
    density_plot(glm_zeta.flatten(), net_zeta.flatten(), gene_id)
                
    plot_data(glm_zeta.flatten(), net_zeta.flatten())

In [None]:
# plot scatterplot of neural net weights and glm weights

plt.figure(figsize=(10, 10))

sns.scatterplot(x=glm_kappa, y=weights[0])

for i in range(len(glm_kappa)):
    plt.text(glm_kappa[i], weights[0][i], feature_names[i], fontsize=13, ha='right', va='top')
plt.xlabel('GLM Weights')
plt.ylabel('Neural Net Weights')

max_val = max(np.max(glm_kappa), np.max(weights[0])) + 0.04
min_val = min(np.min(glm_kappa), np.min(weights[0])) - 0.04

plt.xlim(max_val, min_val)
plt.ylim(max_val, min_val)

# Show the plot
plt.show()