In [1]:
import torch
import pandas as pd
#import tqdm
import numpy as np
import torch

In [2]:
# Preview the raw file: every leading '#'-prefixed metadata line, followed by
# the first few data rows, so the header layout and the column format can be
# inspected without flooding the cell output.
N_DATA_ROWS_PREVIEW = 5

with open('data/fragments2.tsv', 'r') as file:
    data_rows_shown = 0
    for line in file:
        # Lines read from the file keep their trailing newline, so suppress
        # the one print() would add (otherwise the output is double-spaced).
        print(line, end='')
        if not line.startswith('#'):
            data_rows_shown += 1
            if data_rows_shown >= N_DATA_ROWS_PREVIEW:
                break

# id=ccRCC_CPT0078510004

# description=

#

# pipeline_name=cellranger-atac

# pipeline_version=cellranger-atac-2.0.0

#

# reference_path=/diskmnt/Datasets/Reference/Cellranger-ARC/refdata-cellranger-arc-GRCh38-2020-A-2.0.0

# reference_fasta_hash=b6f131840f9f337e7b858c3d1e89d7ce0321b243

# reference_gtf_hash=3b4c36ca3bade222a5b53394e8c07a18db7ebb11

# reference_version=2020-A

# mkref_version=cellranger-arc-2.0.0

#

# primary_contig=chr1

# primary_contig=chr10

# primary_contig=chr11

# primary_contig=chr12

# primary_contig=chr13

# primary_contig=chr14

# primary_contig=chr15

# primary_contig=chr16

# primary_contig=chr17

# primary_contig=chr18

# primary_contig=chr19

# primary_contig=chr2

# primary_contig=chr20

# primary_contig=chr21

# primary_contig=chr22

# primary_contig=chr3

# primary_contig=chr4

# primary_contig=chr5

# primary_contig=chr6

# primary_contig=chr7

# primary_contig=chr8

# primary_contig=chr9

# primary_contig=chrX

# primary_contig=chrY

# primary_c

In [3]:
# Load the cellranger-atac fragments table. The file starts with a block of
# '#'-prefixed metadata lines; comment='#' skips however many of them there
# are, instead of relying on a hard-coded line count that only fits one file.
# (An earlier revision read 'data/fragments1.tsv' the same way.)
df = pd.read_csv('data/fragments2.tsv', sep='\t', comment='#', header=None)

# The file has no header row, so name the five columns explicitly. Assigning
# after the read keeps the loud failure if the column count is not five.
df.columns = ['Chromosome', 'Start', 'End', 'Barcode', 'Count']

In [4]:
# Restrict the analysis to the first N_FRAGMENTS rows of the table.
# NOTE(review): head() takes rows in file order; if the file is sorted by
# genomic position this subset only covers the beginning of the first
# chromosome(s) — confirm that is intended.
N_FRAGMENTS = 100_000
df_subset = df.head(N_FRAGMENTS)

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

def create_windows(data, window_size, step_size):
    """Aggregate fragment counts over sliding windows, per chromosome.

    For every chromosome in ``data``, windows of ``window_size`` are laid out
    every ``step_size`` positions, starting at the chromosome's smallest
    ``Start`` and stopping before its largest ``End``. A fragment contributes
    to a window only when it lies entirely inside it
    (``Start >= window start`` and ``End <= window end``). Windows that
    contain no fragment are omitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Chromosome', 'Start', 'End' and 'Count'.
    window_size : int
        Width of each window; must be positive.
    step_size : int
        Distance between consecutive window starts; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty window with the columns
        'Chromosome', 'Start', 'End', 'Total_Count'. The columns are present
        even when no window was produced (e.g. for empty input).

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is not positive.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must both be positive")

    output_columns = ['Chromosome', 'Start', 'End', 'Total_Count']
    windows = []
    for chrom in data['Chromosome'].unique():
        chrom_data = data[data['Chromosome'] == chrom]
        # Pull the columns out once per chromosome; the inner loop then works
        # on plain arrays instead of re-indexing the DataFrame per window.
        frag_starts = chrom_data['Start'].to_numpy()
        frag_ends = chrom_data['End'].to_numpy()
        frag_counts = chrom_data['Count'].to_numpy()

        start_positions = np.arange(chrom_data['Start'].min(), chrom_data['End'].max(), step_size)
        for start in start_positions:
            end = start + window_size
            inside = (frag_starts >= start) & (frag_ends <= end)
            if inside.any():
                windows.append({
                    'Chromosome': chrom,
                    'Start': start,
                    'End': end,
                    'Total_Count': frag_counts[inside].sum()
                })
    # Passing the column list keeps the schema stable when `windows` is empty.
    return pd.DataFrame(windows, columns=output_columns)

# Sliding-window geometry handed to create_windows().
WINDOW_SIZE = 1000
STEP_SIZE = 500

windows = create_windows(df_subset, window_size=WINDOW_SIZE, step_size=STEP_SIZE)

# Rescale the three numeric columns to [0, 1] in place.
# NOTE(review): the scaler is fitted on every window before any
# train/validation/test split happens — confirm this is acceptable.
scaled_columns = ['Start', 'End', 'Total_Count']
scaler = MinMaxScaler()
windows[scaled_columns] = scaler.fit_transform(windows[scaled_columns])

# A window is labelled 1 when its (scaled) count exceeds mean + 2 * std.
# NOTE(review): the label is a function of Total_Count, which is also one of
# the model inputs below, so the target is recoverable from the features.
threshold = windows['Total_Count'].mean() + 2 * windows['Total_Count'].std()
windows['Tumor_Prone'] = windows['Total_Count'].gt(threshold).astype(int)

# Feature matrix (one row per window) and binary target vector.
X = windows[scaled_columns].values
y = windows['Tumor_Prone'].values

In [6]:
from torch.utils.data import TensorDataset, DataLoader, random_split, Dataset
from torch.cuda.amp import GradScaler, autocast
import torch.nn as nn
import torch.optim as optim
import copy

class ChromosomeDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Both inputs are converted to float32 tensors once, at construction time.
    """

    def __init__(self, features, labels):
        """Store ``features`` and ``labels`` as float32 tensors."""
        as_float32 = {'dtype': torch.float32}
        self.features = torch.tensor(features, **as_float32)
        self.labels = torch.tensor(labels, **as_float32)

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

dataset = ChromosomeDataset(X, y)

# 60 % train / 30 % validation / remainder test. The test size is computed by
# subtraction so the three parts always add up to len(dataset).
train_size = int(0.6 * len(dataset))
val_size = int(0.3 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Seed the split explicitly so that Restart & Run All yields the same
# train/validation/test membership every time.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation loaders are not shuffled: ordering has no effect on the averaged
# loss and a fixed order keeps evaluation runs comparable.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
import torch.nn as nn
import torch.nn.functional as F

class ChromosomeCNN(nn.Module):
    """Small 1-D CNN producing one probability per input sequence.

    Input is expected as ``(batch, 1, input_length)``: ``conv1`` takes a single
    channel. Two kernel-size-2 convolutions without padding each shorten the
    sequence by one, so the flattened feature size is
    ``64 * (input_length - 2)``. The output has shape ``(batch, 1)`` and has
    already been passed through a sigmoid, so it pairs with ``nn.BCELoss``
    rather than ``nn.BCEWithLogitsLoss``.

    Parameters
    ----------
    input_length : int, default 3
        Length of each input sequence. The default reproduces the original
        layer sizes (``fc1`` takes 64 inputs).

    Raises
    ------
    ValueError
        If ``input_length`` is smaller than 3, which would leave nothing for
        the second convolution to produce.
    """

    def __init__(self, input_length=3):
        super().__init__()
        if input_length < 3:
            raise ValueError("input_length must be at least 3 for two kernel-size-2 convolutions")
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=2, stride=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=2, stride=1)
        # Each convolution removes one position: length L -> L - 1 -> L - 2.
        flattened_features = 64 * (input_length - 2)
        self.fc1 = nn.Linear(flattened_features, 128)
        self.fc2 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Map ``(batch, 1, input_length)`` inputs to ``(batch, 1)`` probabilities."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc2(x))  # Sigmoid for binary classification
        return x

In [8]:
# Train the CNN, track per-epoch train/validation loss, and keep the weights
# of the epoch with the lowest validation loss.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ChromosomeCNN().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 20
best_val_loss = float('inf')
# Start from the initial weights so `best_model` is always defined when it is
# saved below, even if no epoch ever improves on the initial best_val_loss.
best_model = copy.deepcopy(model.state_dict())
train_losses_avg = []
val_losses_avg = []

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        # Add the channel dimension expected by Conv1d and give the targets
        # the same (batch, 1) shape as the model output.
        X_batch = X_batch.unsqueeze(1).to(device, non_blocking=True)
        y_batch = y_batch.unsqueeze(1).to(device, non_blocking=True)

        optimizer.zero_grad()

        # Deliberately full precision: nn.BCELoss is rejected inside CUDA
        # autocast regions, and the previous code ran backward() without a
        # GradScaler anyway.
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses_avg.append(avg_train_loss)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_train_loss:.4f}")

    model.eval()
    val_losses = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.unsqueeze(1).to(device, non_blocking=True)
            y_batch = y_batch.unsqueeze(1).to(device, non_blocking=True)

            y_pred = model(X_batch)
            lossV = criterion(y_pred, y_batch)
            val_losses.append(lossV.item())

    avg_val_loss = sum(val_losses) / len(val_losses)
    val_losses_avg.append(avg_val_loss)
    print(f'Epoch {epoch+1}, Val loss: {avg_val_loss}')
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # deepcopy: state_dict() returns references to the live parameters,
        # which keep changing as training continues.
        best_model = copy.deepcopy(model.state_dict())

# NOTE(review): the checkpoint name mentions "peptide", which looks like a
# leftover from another project — confirm before renaming the file.
torch.save({
    'model_state_dict': best_model,
}, 'best_peptide_model_2.pth')




Epoch 1/20, Loss: 0.2631
Epoch 1, Val loss: 0.13054334876998777
Epoch 2/20, Loss: 0.1355
Epoch 2, Val loss: 0.12769501494063484
Epoch 3/20, Loss: 0.1226
Epoch 3, Val loss: 0.11159945880213092
Epoch 4/20, Loss: 0.1047
Epoch 4, Val loss: 0.0889187480112718
Epoch 5/20, Loss: 0.0794
Epoch 5, Val loss: 0.06292268552727276
Epoch 6/20, Loss: 0.0437
Epoch 6, Val loss: 0.03314266125521352
Epoch 7/20, Loss: 0.0230
Epoch 7, Val loss: 0.016270032790940133
Epoch 8/20, Loss: 0.0146
Epoch 8, Val loss: 0.011891828023738438
Epoch 9/20, Loss: 0.0170
Epoch 9, Val loss: 0.013076753836245306
Epoch 10/20, Loss: 0.0088
Epoch 10, Val loss: 0.01133160077029794
Epoch 11/20, Loss: 0.0106
Epoch 11, Val loss: 0.011577513877249833
Epoch 12/20, Loss: 0.0086
Epoch 12, Val loss: 0.017323617365623396
Epoch 13/20, Loss: 0.0071
Epoch 13, Val loss: 0.006416248609903343
Epoch 14/20, Loss: 0.0077
Epoch 14, Val loss: 0.02069770559131528
Epoch 15/20, Loss: 0.0060
Epoch 15, Val loss: 0.009576465667416942
Epoch 16/20, Loss: 0.0

In [9]:
#model.eval()
#correct = 0
#total = 0
#with torch.no_grad():
#    for X_batch, y_batch in test_loader:
#        X_batch, y_batch = X_batch.to(device).unsqueeze(1), y_batch.to(device).unsqueeze(1)
#        outputs = model(X_batch)
#        predictions = (outputs > 0.5).float()
#        correct += (predictions == y_batch).sum().item()
#        total += y_batch.size(0)

#accuracy = correct / total
#print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [12]:
# Evaluate the best checkpoint (lowest validation loss) on the held-out test
# split, collecting per-sample predictions and targets alongside the loss.
model.load_state_dict(best_model)
model.eval()
test_losses = []
y_preds = []
y_actuals = []

# No GradScaler / autocast here: nothing is being optimised, nn.BCELoss is
# rejected inside CUDA autocast regions, and rebinding `scaler` would
# overwrite the fitted MinMaxScaler of the same name.
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Same reshaping as during training: (batch, 1, features) inputs and
        # (batch, 1) targets.
        X_batch = X_batch.unsqueeze(1).to(device, non_blocking=True)
        y_batch = y_batch.unsqueeze(1).to(device, non_blocking=True)

        y_pred = model(X_batch)
        lossV = criterion(y_pred, y_batch)

        y_preds.extend(y_pred.cpu().numpy())
        y_actuals.extend(y_batch.cpu().numpy())
        test_losses.append(lossV.item())

# Mean of the per-batch losses, computed the same way as the validation loss.
avg_test_loss = sum(test_losses) / len(test_losses)
# The criterion is binary cross-entropy, so label the number accordingly.
print(f'Test BCE loss: {avg_test_loss}')

#loss_original = avg_test_loss * (max_y - min_y)**2  ## I finish later

#print(max_y, min_y)
#print("The loss on the original scale is:", loss_original)

Test MSE: 0.002550083072822996


In [13]:
def model_summary(model):
    """Print a table of the model's named parameters and their sizes.

    One row is printed per entry of ``model.named_parameters()`` showing the
    parameter name, its shape, its element count, and how many of those
    elements are trainable (the full count when ``requires_grad`` is true,
    otherwise 0). Totals are printed underneath.

    Parameters
    ----------
    model : object
        Anything exposing ``named_parameters()`` that yields ``(name, param)``
        pairs, where ``param`` offers ``numel()``, ``size()`` and
        ``requires_grad``.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    row_template = "{:<50} {:<30} {:<15} {:<15}"
    separator = "-" * 110

    print("Model Summary:")
    print(row_template.format("Layer Name", "Shape", "Parameters", "Trainable"))
    print(separator)

    total_params = 0
    total_trainable_params = 0
    for name, parameter in model.named_parameters():
        param = parameter.numel()
        total_params += param
        # Frozen parameters count towards the total but not the trainable sum.
        trainable_param = param if parameter.requires_grad else 0
        total_trainable_params += trainable_param
        print(row_template.format(name, str(parameter.size()), param, trainable_param))

    print(separator)
    print(f"Total Parameters: {total_params}")
    print(f"Trainable Parameters: {total_trainable_params}")

# Print the per-parameter table and totals for the current `model`.
model_summary(model)

Model Summary:
Layer Name                                         Shape                          Parameters      Trainable      
--------------------------------------------------------------------------------------------------------------
conv1.weight                                       torch.Size([32, 1, 2])         64              64             
conv1.bias                                         torch.Size([32])               32              32             
conv2.weight                                       torch.Size([64, 32, 2])        4096            4096           
conv2.bias                                         torch.Size([64])               64              64             
fc1.weight                                         torch.Size([128, 64])          8192            8192           
fc1.bias                                           torch.Size([128])              128             128            
fc2.weight                                         torch.Size([1, 128])     