In [4]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data as td
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import matplotlib.colors as colors

froot = './data/k562'

df = pd.read_csv(froot + ".csv")#, chunksize=100000)

In [8]:
print(df.head())

   seqnames    start      end strand  ensembl_gene_id  score      ctcf  \
0         1  3859709  3859709      +  ENSG00000169598    0.0 -0.010876   
1         1  3859710  3859710      +  ENSG00000169598    0.0 -0.010887   
2         1  3859711  3859711      +  ENSG00000169598    0.0 -0.010902   
3         1  3859712  3859712      +  ENSG00000169598    0.0 -0.010920   
4         1  3859713  3859713      +  ENSG00000169598    0.0 -0.010941   

   h3k36me3   h3k4me1  h3k79me2  ...       sj5       sj3       dms      rpts  \
0  0.353765 -0.078256 -0.156547  ... -0.028916 -0.057178 -0.307549  0.249626   
1  0.347003 -0.077117 -0.155891  ... -0.028916 -0.057178 -0.307549  0.249626   
2  0.340295 -0.075994 -0.155236  ... -0.028916 -0.057178 -0.307549  0.249626   
3  0.333641 -0.074887 -0.154583  ... -0.028916 -0.057178 -0.307549  0.249626   
4  0.327043 -0.073795 -0.153930  ... -0.028916 -0.057178 -0.307549  0.249626   

   wgbs      A         T         G         C  lambda_alphaj  
0   0.0 -0.6

In [3]:
df['gene_length'] = df.groupby('ensembl_gene_id')['ensembl_gene_id'].transform('count')

In [4]:
filtered_df = df[df['gene_length'] == 2000]

In [16]:
column_names = filtered_df.columns.tolist()
feature_names = column_names[6:-6]
nucleotides = column_names[-6:-2]
print(feature_names)
print(nucleotides)

# process read counts
X_ji = filtered_df.groupby('ensembl_gene_id')['score'].apply(list).tolist() 

num_samples = len(X_ji)

['ctcf', 'h3k36me3', 'h3k4me1', 'h3k79me2', 'h3k9me1', 'h3k9me3', 'h4k20me1', 'sj5', 'sj3', 'dms', 'rpts', 'wgbs']
['A', 'T', 'G', 'C']


In [68]:
#Y_ji is a list of samples containing lists of their feature values
    # [   
    #   [[sample_1_feature_1], [sample_1_feature_2], [sample_1_feature_3]],
    #   [[sample_2_feature_1], [sample_1_feature_2], [sample_1_feature_3]],  
    # ]

Y_ji = []

for sample_id in filtered_df['ensembl_gene_id'].unique():
    sample_data = [filtered_df[feature_name][filtered_df['ensembl_gene_id'] == sample_id].tolist() for feature_name in feature_names]
    Y_ji.append(sample_data)

In [69]:
C_j = filtered_df.groupby('ensembl_gene_id')['lambda_alphaj'].apply(list).tolist() 

In [70]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

class CustomDataset(Dataset):
    def __init__(self, Y_ji, X_ji, C_j):
        self.Y_ji = Y_ji
        self.X_ji = X_ji
        self.C_j = C_j

    def __len__(self):
        return len(self.X_ji)

    def __getitem__(self, idx):
        return {
            'Y_ji':  torch.tensor(self.Y_ji[idx], dtype=torch.float32),
            'X_ji': torch.tensor(self.X_ji[idx], dtype=torch.float32),
            'C_j': torch.tensor(self.C_j[idx], dtype=torch.float32)
        }

In [71]:
dataset = CustomDataset(Y_ji, X_ji, C_j)

trnset, valset, tstset = td.random_split(dataset, [0.5,0.25,0.25])

trndl = DataLoader(trnset, batch_size=64, shuffle=True)
tstdl = DataLoader(tstset, batch_size=64, shuffle=True)
valdl = DataLoader(valset, batch_size=64, shuffle=True)

In [75]:
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, inputs):
        outputs, _ = self.lstm(inputs)
        averaged_outputs = torch.mean(outputs, dim=1)
        predictions = self.fc(averaged_outputs)
        return predictions

# input size: [50, 12, 2000]
input_size = 2000
hidden_size = 32
output_size = 2000

model = Model(input_size, hidden_size, output_size)
print(model)

Model(
  (lstm): LSTM(2000, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=2000, bias=True)
)


In [78]:
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()

    def forward(self, X_ji, C_j, Z_ji):
        loss = X_ji * Z_ji + C_j * torch.exp(-Z_ji)
        # compute mean over batch to normalize due to varying batch sizes
        return loss.mean() 

In [79]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100

loss_fn = CustomLoss()

for epoch in range(num_epochs):
    for batch in trndl:
        optimizer.zero_grad()
        #inputs = batch
        Y_ji_batch = batch['Y_ji']
        X_ji_batch = batch['X_ji']
        C_j_batch = batch['C_j']
        #lengths = [len(sample) for sample in inputs]
        #padded_inputs = pad_sequence([torch.tensor(sample) for sample in inputs], batch_first=True)
        outputs = model(Y_ji_batch)#, lengths)
        loss = loss_fn(X_ji_batch, C_j_batch, outputs)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

Epoch [1/100], Loss: 0.025821063667535782
Epoch [2/100], Loss: 0.025485757738351822
Epoch [3/100], Loss: 0.025029176846146584
Epoch [4/100], Loss: 0.02463790588080883
Epoch [5/100], Loss: 0.02422621101140976
Epoch [6/100], Loss: 0.023814359679818153
Epoch [7/100], Loss: 0.023404225707054138
Epoch [8/100], Loss: 0.023000115528702736
Epoch [9/100], Loss: 0.022585561498999596
Epoch [10/100], Loss: 0.02217487245798111
Epoch [11/100], Loss: 0.02176642045378685
Epoch [12/100], Loss: 0.021358998492360115
Epoch [13/100], Loss: 0.020942049100995064
Epoch [14/100], Loss: 0.020534981042146683
Epoch [15/100], Loss: 0.02012908272445202
Epoch [16/100], Loss: 0.01972346194088459
Epoch [17/100], Loss: 0.019314488396048546
Epoch [18/100], Loss: 0.018901772797107697
Epoch [19/100], Loss: 0.018491512164473534
Epoch [20/100], Loss: 0.01808159612119198
Epoch [21/100], Loss: 0.017674952745437622
Epoch [22/100], Loss: 0.017270954325795174
Epoch [23/100], Loss: 0.016866274178028107
Epoch [24/100], Loss: 0.016

In [80]:
from matplotlib import pyplot as plt
epochs = range(1, len(hist[0]) + 1)
plt.plot(epochs, hist[0], label='train_loss')
plt.plot(epochs, hist[1], label='valid_loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show 

NameError: name 'hist' is not defined

In [171]:
mae = []
mse = []
for inputs, labels in tstdl:
    #interpret_model(inputs, labels)
    outputs = model(inputs)
    mae.append(torch.mean(torch.abs(outputs - labels), dim=0))
    mse.append(torch.mean((outputs - labels)**2, dim=0))

mean_mae = torch.mean(torch.stack(mae))    
mean_mse = torch.mean(torch.stack(mse))
print("Overall Mean Absolute Error (MAE):", round(mean_mae.item(), 3))
print("Overall Mean Squared Error (MSE):", round(mean_mse.item(), 3))

Overall Mean Absolute Error (MAE): 0.019
Overall Mean Squared Error (MSE): 0.002


In [82]:
model.eval()

inputs = next(iter(tstdl)) 
print("number of samples: " + str(len(inputs)))

with torch.no_grad():
    outputs = model(inputs['Y_ji'])
    
for output in outputs:
    for inner_val in output:
        if inner_val > 0.25:
            print(inner_val)

number of samples: 3
tensor(0.5584)
tensor(0.3837)
tensor(1.0836)
tensor(0.3455)
tensor(1.2392)
tensor(1.3129)
tensor(0.6031)
tensor(0.2662)
tensor(1.3341)
tensor(0.6302)
tensor(0.5611)
tensor(1.8666)
tensor(0.4805)
tensor(1.2466)
tensor(0.9446)
tensor(0.4119)
tensor(0.4487)
tensor(1.0583)
tensor(0.7114)
tensor(0.6526)
tensor(0.2983)
tensor(0.4958)
tensor(1.6123)
tensor(0.7998)
tensor(1.7006)
tensor(0.3430)
tensor(0.2630)
tensor(1.5902)
tensor(0.4910)
tensor(0.2744)
tensor(1.2938)
tensor(0.9765)
tensor(0.4141)
tensor(0.4132)
tensor(0.6443)
tensor(0.3496)
tensor(0.5272)
tensor(0.5503)
tensor(1.0562)
tensor(0.3340)
tensor(0.8524)
tensor(1.2813)
tensor(0.6432)
tensor(0.4177)
tensor(0.7683)
tensor(1.1626)
tensor(0.3560)
tensor(1.0314)
tensor(0.9585)
tensor(0.9804)
tensor(0.6298)
tensor(0.6283)
tensor(1.7421)
tensor(0.4417)
tensor(1.3468)
tensor(0.9194)
tensor(0.7258)
tensor(0.7520)
tensor(0.3022)
tensor(0.9499)
tensor(1.2772)
tensor(1.1883)
tensor(0.9157)
tensor(0.7766)
tensor(0.4010)
tens

In [None]:
indices = range(len(targets[0]))

fig, axs = plt.subplots(8, 4, figsize=(20,15))
for i in range(8):
    for j in range(4):
        axs[i, j].scatter(indices, targets[i+j*8], s=5)
        axs[i, j].scatter(indices, outputs[i+j*8], s=5)
        axs[i, j].set_ylim(-0.25, 2.25)

plt.xlabel('Index')
plt.ylabel('Read Count')
plt.legend(['True Read Count', 'Predicted Read Count'], loc='upper center', bbox_to_anchor=(0.5, -0.6))