# Aggregate PKL Files (Class: DeltaDDGDataset)

05_08_2025



*   Seshu has loaded structures as graph objects for proteins 0–117.
*   Val has loaded proteins ~85–140, but they are split across 5 different folds of saved PKL files.

Objective: Merge all results into a single saved PKL file that can then be split into train, validation, and test DeltaDDGDataset objects.



In [2]:
# Install the third-party packages imported by the next cell
# (torch_geometric, Bio.PDB from biopython, and networkx).
!pip install torch_geometric
!pip install biopython
!pip install networkx

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import requests
import networkx as nx
from scipy.spatial.distance import euclidean
from Bio.PDB import PDBParser, PDBList
from torch_geometric.nn import GCNConv, GraphNorm, global_mean_pool

from google.colab import drive
drive.mount('/content/drive')

from torch.utils.data import Dataset

Mounted at /content/drive


In [4]:
class DeltaDDGDataset(Dataset):
    """List-backed dataset of pre-computed result entries.

    A thin ``torch.utils.data.Dataset`` wrapper around an in-memory,
    indexable collection, with helpers to persist it to disk and restore it.

    Args:
        all_results: Indexable collection of entries. It is stored as-is
            (no copy is made).
    """

    def __init__(self, all_results):
        self.all_results = all_results

    def __len__(self):
        """Return the number of stored entries."""
        return len(self.all_results)

    def __getitem__(self, idx):
        """Return the entry at position ``idx``."""
        return self.all_results[idx]

    def save(self, path):
        """Write the entries to ``path`` as ``{'all_results': ...}`` via ``torch.save``.

        Args:
            path: Destination file path.
        """
        data = {
            'all_results': self.all_results,
        }
        torch.save(data, path)

    @classmethod
    def load(cls, path):
        """Restore a dataset from a file on disk.

        Two on-disk layouts are accepted:

        * the ``{'all_results': ...}`` dict written by :meth:`save`, and
        * a whole dataset object pickled with ``torch.save(dataset, path)``
          (anything exposing an ``all_results`` attribute).

        Args:
            path: File to read.

        Returns:
            A new instance of ``cls`` holding the loaded entries.

        Raises:
            KeyError: If the file holds a dict without an ``'all_results'`` key.
            AttributeError: If the file holds a non-dict object without an
                ``all_results`` attribute.
        """
        # NOTE: weights_only=False runs the full pickle machinery, which can
        # execute arbitrary code. Only load files from a trusted source.
        data = torch.load(path, weights_only=False)
        if isinstance(data, dict):
            results = data['all_results']
        else:
            # File was written by pickling the dataset object directly.
            results = data.all_results
        # Bypass __init__ so subclasses with a different constructor
        # signature can still be restored.
        obj = cls.__new__(cls)
        obj.all_results = results
        return obj

In [5]:
# Load the three previously saved fold files from Drive. Each loaded object
# is expected to expose `.all_results` (read in the following cell).
# weights_only=False performs a full unpickle, so the files must be trusted;
# presumably the DeltaDDGDataset class must already be defined — TODO confirm.
fold_3_dataset_loaded = torch.load(
    "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/Data_To_Aggregate/fold_3_dataset_no_overlap.pkl",
    weights_only=False,
)

fold_1_2_4_dataset_loaded = torch.load(
    "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/Data_To_Aggregate/fold_1_2_4_no_overlap_dataset.pkl",
    weights_only=False,
)

fold_yolo_loaded = torch.load(
    "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/Data_To_Aggregate/yolo_i_hope_this_works.pkl",
    weights_only=False,
)


In [7]:
# Concatenate the entries of the three loaded datasets, in load order.
# Plain `+` (not `+=`) is used on purpose so that none of the source
# collections is modified in place.
folds_total_results = fold_3_dataset_loaded.all_results + fold_1_2_4_dataset_loaded.all_results
folds_total_results = folds_total_results + fold_yolo_loaded.all_results


In [8]:
# Deduplicate based on mt_sequence, keeping the first occurrence of each.
indices = list(range(len(folds_total_results)))
mt_seqs = [entry[3]["mt_sequence"] for entry in folds_total_results]

df = pd.DataFrame({"index": indices, "mt_sequence": mt_seqs})
df.drop_duplicates("mt_sequence", inplace=True)

# The surviving "index" values are the original positions of the kept rows,
# still in ascending order, so indexing by them directly selects the same
# entries in the same order without a per-entry membership scan.
folds_total_deduped = [folds_total_results[i] for i in df["index"]]

# Wrap in dataset and save (the whole object is pickled, so it is read back
# with torch.load(..., weights_only=False) and accessed via `.all_results`).
folds_total_dataset = DeltaDDGDataset(folds_total_deduped)
torch.save(folds_total_dataset, "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/folds_total_dataset_yuh.pkl")

In [12]:
#### Validate loading and verify that there are no duplicates

# Reload the aggregated dataset from Drive.
dataset_path = "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/folds_total_dataset_yuh.pkl"
loaded_dataset = torch.load(dataset_path, weights_only=False)

# Gather the mt_sequence of every entry into a one-column DataFrame.
mt_seqs = [entry[3]["mt_sequence"] for entry in loaded_dataset.all_results]
df = pd.DataFrame({"mt_sequence": mt_seqs})

# Any gap between the row count and the distinct count means duplicates remain.
num_total = len(df)
num_unique = df["mt_sequence"].nunique()
num_duplicates = num_total - num_unique

print(f"Total entries: {num_total}")
print(f"Unique mt_sequences: {num_unique}")
print(f"Duplicate entries: {num_duplicates}")

if num_duplicates != 0:
    print("❌ Duplicates exist.")
else:
    print("✅ No duplicates found.")

Total entries: 3763
Unique mt_sequences: 3763
Duplicate entries: 0
✅ No duplicates found.


In [13]:
# Reload the aggregated dataset and pull out its entries.
dataset_path = "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/folds_total_dataset_yuh.pkl"
dataset = torch.load(dataset_path, weights_only=False)
all_data = dataset.all_results

# Hold out 20% of the entries, then halve the hold-out into validation and
# test (roughly 80/10/10). A fixed seed keeps the split reproducible.
train_data, temp_data = train_test_split(all_data, random_state=42, test_size=0.2)
val_data, test_data = train_test_split(temp_data, random_state=42, test_size=0.5)

# Wrap each split in a DeltaDDGDataset and write it with its own save().
for split_data, split_path in (
    (train_data, "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/train_dataset.pkl"),
    (val_data, "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/val_dataset.pkl"),
    (test_data, "/content/drive/MyDrive/BMI_707_Project/707_Files_for_Colab/test_dataset.pkl"),
):
    DeltaDDGDataset(split_data).save(split_path)

print(f"✅ Splits saved! Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

✅ Splits saved! Train: 3010, Val: 376, Test: 377


In [None]:
# Merge the two partial fold-3 files and drop entries whose mt_sequence
# repeats, keeping the first occurrence of each.
fold_3_dataset_loaded_last = torch.load("/content/drive/MyDrive/707/data/fold_3_dataset_last.pkl", weights_only=False)
fold_3_dataset_loaded_first = torch.load("/content/drive/MyDrive/707/data/first_part_dataset_F3.pkl", weights_only=False)

fold_3_overlap = fold_3_dataset_loaded_last.all_results + fold_3_dataset_loaded_first.all_results

indices = list(range(len(fold_3_overlap)))
mt_seqs = [entry[3]["mt_sequence"] for entry in fold_3_overlap]

df_3 = pd.DataFrame({
    "index": indices,
    "mt_sequence": mt_seqs,
})

df_3.drop_duplicates("mt_sequence", inplace=True)

# The surviving "index" values are the original positions of the kept rows,
# in ascending order, so index directly instead of scanning the column once
# per entry.
fold_3_no_overlap = [fold_3_overlap[i] for i in df_3["index"]]

# A bare expression in the middle of a cell displays nothing; print the
# number of entries left after deduplication explicitly.
print(len(fold_3_no_overlap))

fold_3_dataset_no_overlap = DeltaDDGDataset(fold_3_no_overlap)
torch.save(fold_3_dataset_no_overlap, "/content/drive/MyDrive/707/data/fold_3_dataset_no_overlap.pkl")