In [14]:
import numpy as np
import pandas as pd
import torch
import torch.utils.data as td
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import matplotlib.colors as colors

froot = './data/k562_main'

df = pd.read_csv(froot + ".csv")#, chunksize=100000)

In [15]:
print(df.head())

   seqnames    start      end strand  ensembl_gene_id  score      ctcf  \
0         1  3859709  3859709      +  ENSG00000169598    0.0 -0.010876   
1         1  3859710  3859710      +  ENSG00000169598    0.0 -0.010887   
2         1  3859711  3859711      +  ENSG00000169598    0.0 -0.010902   
3         1  3859712  3859712      +  ENSG00000169598    0.0 -0.010920   
4         1  3859713  3859713      +  ENSG00000169598    0.0 -0.010941   

   h3k36me3   h3k4me1  h3k79me2  ...       sj5       sj3       dms      rpts  \
0  0.353765 -0.078256 -0.156547  ... -0.028916 -0.057178 -0.307549  0.249626   
1  0.347003 -0.077117 -0.155891  ... -0.028916 -0.057178 -0.307549  0.249626   
2  0.340295 -0.075994 -0.155236  ... -0.028916 -0.057178 -0.307549  0.249626   
3  0.333641 -0.074887 -0.154583  ... -0.028916 -0.057178 -0.307549  0.249626   
4  0.327043 -0.073795 -0.153930  ... -0.028916 -0.057178 -0.307549  0.249626   

   wgbs      A         T         G         C  lambda_alphaj  
0   0.0 -0.6

In [16]:
df['gene_length'] = df.groupby('ensembl_gene_id')['ensembl_gene_id'].transform('count')

In [17]:
filtered_df = df[df['gene_length'] == 2000]

In [11]:
bucket_boundaries = list(range(0, 38000, 2000))
labels = bucket_boundaries[1:]


df['bucket'] = pd.cut(df['gene_length'], bins=bucket_boundaries, labels=labels, right=False)
print(df['bucket'])

df.dropna(subset=['bucket'], inplace=True)

0           20000
1           20000
2           20000
3           20000
4           20000
            ...  
61709392      NaN
61709393      NaN
61709394      NaN
61709395      NaN
61709396      NaN
Name: bucket, Length: 61709397, dtype: category
Categories (18, int64): [2000 < 4000 < 6000 < 8000 ... 30000 < 32000 < 34000 < 36000]


In [13]:
# How many values are in each bucket?
bucket_counts = df['bucket'].value_counts()

print(bucket_counts)

# total values
print(len(df))

32000    1180951
20000    1056439
18000    1026786
30000     963932
36000     963478
22000     932440
28000     895678
24000     870659
26000     859178
14000     826929
12000     820942
16000     779045
34000     711680
10000     684799
8000      530528
6000      435971
4000      205804
2000       12348
Name: bucket, dtype: int64
13757587


In [67]:
column_names = filtered_df.columns.tolist()
feature_names = column_names[6:-5]
nucleotides = column_names[-5:-1]
print(feature_names)
print(nucleotides)

# process read counts
X_ji = filtered_df.groupby('ensembl_gene_id')['score'].apply(list).tolist() 

num_samples = len(X_ji)

features = []
#for feature_name in feature_names:
    #features.append(df.groupby(ensembl_gene_id)[feature_name])

['ctcf', 'h3k36me3', 'h3k4me1', 'h3k79me2', 'h3k9me1', 'h3k9me3', 'h4k20me1', 'sj5', 'sj3', 'dms', 'rpts', 'wgbs', 'A']
['T', 'G', 'C', 'lambda_alphaj']


In [70]:
C_j = filtered_df.groupby('ensembl_gene_id')['lambda_alphaj'].apply(list).tolist() 

In [71]:
from torch.utils.data import Dataset, DataLoader, TensorDataset

class CustomDataset(Dataset):
    def __init__(self, X_ji, C_j):
        self.X_ji = X_ji
        self.C_j = C_j

    def __len__(self):
        return len(self.X_ji)

    def __getitem__(self, idx):
        return {
            'X_ji': torch.tensor(self.X_ji[idx], dtype=torch.float32),
            'C_j': torch.tensor(self.C_j[idx], dtype=torch.float32)
        }

In [72]:
dataset = CustomDataset(X_ji, C_j)

trnset, valset, tstset = td.random_split(dataset, [0.5,0.25,0.25])

trndl = DataLoader(trnset, batch_size=64, shuffle=True)
tstdl = DataLoader(tstset, batch_size=64, shuffle=True)
valdl = DataLoader(valset, batch_size=64, shuffle=True)

In [73]:
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, inputs):#, lengths):
        #packed_inputs = nn.utils.rnn.pack_padded_sequence(inputs, lengths, batch_first=True, enforce_sorted=False)
        outputs, _ = self.lstm(inputs)
        #outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
        predictions = self.fc(outputs)
        return predictions
        
input_size = 2000#num_samples
hidden_size = 32
output_size = 2000#num_samples

model = Model(input_size, hidden_size, output_size)
# check model
print(model)

#x = torch.ones((64,2000))
#print(model(x).shape)
#nparm = sum(p.numel() for p in model.parameters() if p.requires_grad)
#print("Number of parameters: " + str(nparm))

Model(
  (lstm): LSTM(2000, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=2000, bias=True)
)


In [76]:
class CustomLoss(nn.Module):
    def __init__(self):
        super(CustomLoss, self).__init__()

    def forward(self, X_ji, C_j, Z_ji):
        loss = X_ji * Z_ji + C_j * torch.exp(-Z_ji)
        # compute mean over batch to normalize due to varying batch sizes
        return loss.mean() 

In [77]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100

loss_fn = CustomLoss()

for epoch in range(num_epochs):
    for batch in trndl:
        optimizer.zero_grad()
        #inputs = batch
        X_ji_batch = batch['X_ji']
        C_j_batch = batch['C_j']
        #lengths = [len(sample) for sample in inputs]
        #padded_inputs = pad_sequence([torch.tensor(sample) for sample in inputs], batch_first=True)
        outputs = model(X_ji_batch)#, lengths)
        loss = loss_fn(X_ji_batch, C_j_batch, outputs)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

Epoch [1/100], Loss: 0.007333577144891024
Epoch [2/100], Loss: 0.007033327128738165
Epoch [3/100], Loss: 0.0061981105245649815
Epoch [4/100], Loss: 0.005685880314558744
Epoch [5/100], Loss: 0.004781608935445547
Epoch [6/100], Loss: 0.004242316819727421
Epoch [7/100], Loss: 0.003812007373198867
Epoch [8/100], Loss: 0.0036222832277417183
Epoch [9/100], Loss: 0.0026076403446495533
Epoch [10/100], Loss: 0.0025629589799791574
Epoch [11/100], Loss: 0.0020957428496330976
Epoch [12/100], Loss: 0.00139122165273875
Epoch [13/100], Loss: 0.0005929270992055535
Epoch [14/100], Loss: 0.0005200934829190373
Epoch [15/100], Loss: -0.00042709585977718234
Epoch [16/100], Loss: -0.000947242951951921
Epoch [17/100], Loss: -0.0011818311177194118
Epoch [18/100], Loss: -0.0019143166719004512
Epoch [19/100], Loss: -0.0026030102744698524
Epoch [20/100], Loss: -0.0026214446406811476
Epoch [21/100], Loss: -0.0032371636480093002
Epoch [22/100], Loss: -0.0036363110411912203
Epoch [23/100], Loss: -0.0043199611827731

In [51]:
from matplotlib import pyplot as plt
epochs = range(1, len(hist[0]) + 1)
plt.plot(epochs, hist[0], label='train_loss')
plt.plot(epochs, hist[1], label='valid_loss')

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show 

NameError: name 'hist' is not defined

In [171]:
mae = []
mse = []
for inputs, labels in tstdl:
    #interpret_model(inputs, labels)
    outputs = model(inputs)
    mae.append(torch.mean(torch.abs(outputs - labels), dim=0))
    mse.append(torch.mean((outputs - labels)**2, dim=0))

mean_mae = torch.mean(torch.stack(mae))    
mean_mse = torch.mean(torch.stack(mse))
print("Overall Mean Absolute Error (MAE):", round(mean_mae.item(), 3))
print("Overall Mean Squared Error (MSE):", round(mean_mse.item(), 3))

Overall Mean Absolute Error (MAE): 0.019
Overall Mean Squared Error (MSE): 0.002


In [60]:
model.eval()

inputs = next(iter(tstdl)) 
print("number of samples: " + str(len(inputs)))

with torch.no_grad():
    outputs = model(inputs['X_ji'])
    
for output in outputs:
    for inner_val in output:
        if inner_val > 0.25:
            print(inner_val)

number of samples: 2
tensor(0.3920)
tensor(0.3050)
tensor(0.2662)
tensor(0.2966)
tensor(0.2516)
tensor(0.2622)
tensor(0.3023)
tensor(0.2802)
tensor(0.2806)
tensor(0.2848)
tensor(0.2579)
tensor(0.3045)
tensor(0.2646)
tensor(0.3164)
tensor(0.2872)
tensor(0.3833)
tensor(0.2627)
tensor(0.2855)
tensor(0.2743)
tensor(0.2628)
tensor(0.2645)
tensor(0.2718)
tensor(0.2915)
tensor(0.4143)
tensor(0.3008)
tensor(0.3436)
tensor(0.2551)
tensor(0.3803)
tensor(0.2501)
tensor(0.3127)
tensor(0.3424)
tensor(0.2765)
tensor(0.2524)
tensor(0.2663)
tensor(0.2939)
tensor(0.2765)
tensor(0.2642)
tensor(0.3266)
tensor(0.2568)
tensor(0.3434)
tensor(0.2668)
tensor(0.2947)
tensor(0.2507)
tensor(0.2727)
tensor(0.3401)
tensor(0.2661)
tensor(0.2554)
tensor(0.2852)
tensor(0.2720)
tensor(0.3244)
tensor(0.2519)
tensor(0.2722)
tensor(0.2776)
tensor(0.2670)
tensor(0.2678)
tensor(0.3680)
tensor(0.2944)
tensor(0.2704)
tensor(0.3550)
tensor(0.2833)
tensor(0.2954)
tensor(0.2583)
tensor(0.3508)
tensor(0.3322)
tensor(0.3420)
tens

In [None]:
indices = range(len(targets[0]))

fig, axs = plt.subplots(8, 4, figsize=(20,15))
for i in range(8):
    for j in range(4):
        axs[i, j].scatter(indices, targets[i+j*8], s=5)
        axs[i, j].scatter(indices, outputs[i+j*8], s=5)
        axs[i, j].set_ylim(-0.25, 2.25)

plt.xlabel('Index')
plt.ylabel('Read Count')
plt.legend(['True Read Count', 'Predicted Read Count'], loc='upper center', bbox_to_anchor=(0.5, -0.6))