In [None]:
!pip install torch torchvision torchaudio rdkit datasets tokenizers tqdm

In [1]:

#final_version
# stereochemistry_fixed

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import MessagePassing, global_mean_pool
from torch_geometric.data import Data, Batch
from datasets import load_dataset
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors, rdFMCS, EnumerateStereoisomers
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
from tqdm import tqdm
import math
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from torch.cuda.amp import GradScaler, autocast
import optuna
from nltk.translate.bleu_score import sentence_bleu
from Levenshtein import distance
from pandarallel import pandarallel #Added by Pawan
import time  #Added by Pawan
import os #Added by Pawan
import glob #Added by Pawan
%matplotlib inline

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Define token variables early
PAD_TOKEN = "<PAD>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"
MASK_TOKEN = "[MASK]"

In [2]:
# Load and preprocess dataset
dataset = load_dataset('roman-bushuiev/MassSpecGym', split='val')
df = pd.DataFrame(dataset)

In [3]:
# Simulate external dataset (e.g., NIST-like) by splitting
df_massspecgym, df_external = df.iloc[:int(0.9*len(df))], df.iloc[int(0.9*len(df)):]
print("MassSpecGym size:", len(df_massspecgym), "External test size:", len(df_external))


MassSpecGym size: 207993 External test size: 23111


In [4]:
# Inspect dataset
print("Dataset Columns:", df_massspecgym.columns.tolist())
print("\nFirst few rows of MassSpecGym dataset:")
print(df_massspecgym[['identifier', 'mzs', 'intensities', 'smiles', 'adduct', 'precursor_mz']].head())
print("\nUnique adduct values:", df_massspecgym['adduct'].unique())


Dataset Columns: ['identifier', 'mzs', 'intensities', 'smiles', 'inchikey', 'formula', 'precursor_formula', 'parent_mass', 'precursor_mz', 'adduct', 'instrument_type', 'collision_energy', 'fold', 'simulation_challenge']

First few rows of MassSpecGym dataset:
             identifier                                                mzs  \
0  MassSpecGymID0000001  91.0542,125.0233,154.0499,155.0577,185.0961,20...   
1  MassSpecGymID0000002  91.0542,125.0233,155.0577,185.0961,229.0859,24...   
2  MassSpecGymID0000003  69.0343,91.0542,125.0233,127.039,153.0699,154....   
3  MassSpecGymID0000004  69.0343,91.0542,110.06,111.0441,112.0393,120.0...   
4  MassSpecGymID0000005  91.0542,125.0233,185.0961,229.0859,246.1125,28...   

                                         intensities  \
0  0.24524524524524524,1.0,0.08008008008008008,0....   
1  0.0990990990990991,0.28128128128128127,0.04004...   
2  0.03403403403403404,0.31431431431431434,1.0,0....   
3  0.17917917917917917,0.47347347347347346,0.03

In [None]:
identifier 	mzs 	intensities 	inchikey 	formula 	precursor_formula 	parent_mass 	precursor_mz 	adduct 	instrument_type 	collision_energy 	fold 	simulation_challenge 	smiles

In [2]:
# Data augmentation: SMILES enumeration and spectral noise
def augment_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            stereoisomers = EnumerateStereoisomers.EnumerateStereoisomers(mol)
            return [Chem.MolToSmiles(m, canonical=True, doRandom=True) for m in stereoisomers]
        return [smiles]
    except:
        return [smiles]

def bin_spectrum_to_graph(mzs, intensities, ion_mode, precursor_mz, adduct, n_bins=1000, max_mz=1000, noise_level=0.05):
    spectrum = np.zeros(n_bins)
    for mz, intensity in zip(mzs, intensities):
        try:
            mz = float(mz)
            intensity = float(intensity)
            if mz < max_mz:
                bin_idx = int((mz / max_mz) * n_bins)
                spectrum[bin_idx] += intensity
        except (ValueError, TypeError):
            continue
    if spectrum.max() > 0:
        spectrum = spectrum / spectrum.max()
    spectrum += np.random.normal(0, noise_level, spectrum.shape).clip(0, 1)
    x = torch.tensor(spectrum, dtype=torch.float).unsqueeze(-1)
    edge_index = []
    for i in range(n_bins-1):
        edge_index.append([i, i+1])
        edge_index.append([i+1, i])
    edge_index = torch.tensor(edge_index, dtype=torch.long).t()
    ion_mode = torch.tensor([ion_mode], dtype=torch.float)
    precursor_mz = torch.tensor([precursor_mz], dtype=torch.float)
    adduct_idx = adduct_to_idx.get(adduct, 0)
    return spectrum, Data(x=x, edge_index=edge_index, ion_mode=ion_mode, precursor_mz=precursor_mz, adduct_idx=adduct_idx)

In [6]:
# Canonicalize SMILES and augment
pandarallel.initialize(nb_workers=16, progress_bar=True) #Added by Pawan
start_time = time.time()
def canonicalize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        if mol:
            return Chem.MolToSmiles(mol, canonical=True)
        return None
    except:
        return None

df_massspecgym['smiles'] = df_massspecgym['smiles'].parallel_apply(canonicalize_smiles) # Added by Pawan
df_external['smiles'] = df_external['smiles'].parallel_apply(canonicalize_smiles) #Added by Pawan
df_massspecgym = df_massspecgym.dropna(subset=['smiles']) 
df_external = df_external.dropna(subset=['smiles'])
df_massspecgym['smiles_list'] = df_massspecgym['smiles'].parallel_apply(augment_smiles)
df_massspecgym = df_massspecgym.explode('smiles_list').dropna(subset=['smiles_list'])
df_massspecgym = df_massspecgym.drop(columns=['smiles']) # Drop original 'smiles' to prevent duplicates; added by Pawan
df_massspecgym = df_massspecgym.rename(columns={'smiles_list': 'smiles'}) # Rename exploded list column to 'smiles'; added by Pawan
df_massspecgym.to_parquet("df_massspecgym.parquet")
df_external.to_parquet("df_external.parquet")
print("Completed in {:.2f} seconds".format(time.time() - start_time)) #Added by Pawan


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13000), Label(value='0 / 13000')))…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_massspecgym['smiles'] = df_massspecgym['smiles'].parallel_apply(canonicalize_smiles) # Added by Pawan


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1445), Label(value='0 / 1445'))), …

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_external['smiles'] = df_external['smiles'].parallel_apply(canonicalize_smiles) #Added by Pawan


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13000), Label(value='0 / 13000')))…

Completed in 1823.35 seconds


In [10]:
print(df_massspecgym.shape) #Added by Pawan
print(df_massspecgym.index.nunique()) #Added by Pawan
print(df_external.shape) #Added by Pawan
print(df_external.index.nunique()) #Added by Pawan

(19329189, 14)
207993
(23111, 14)
23111


In [11]:
df_massspecgym.reset_index(drop=True, inplace=True) #Added by Pawan

In [12]:
print(df_massspecgym.shape) #Added by Pawan
print(df_massspecgym.index.nunique()) #Added by Pawan
print(df_external.shape) #Added by Pawan
print(df_external.index.nunique()) #Added by Pawan

(19329189, 14)
19329189
(23111, 14)
23111


In [13]:
df_massspecgym.head(5)

Unnamed: 0,identifier,mzs,intensities,inchikey,formula,precursor_formula,parent_mass,precursor_mz,adduct,instrument_type,collision_energy,fold,simulation_challenge,smiles
0,MassSpecGymID0000001,"91.0542,125.0233,154.0499,155.0577,185.0961,20...","0.24524524524524524,1.0,0.08008008008008008,0....",VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,30.0,train,True,COc1cc(oc(c1)=O)[C@@H](NC(C)=O)Cc1ccccc1
1,MassSpecGymID0000002,"91.0542,125.0233,155.0577,185.0961,229.0859,24...","0.0990990990990991,0.28128128128128127,0.04004...",VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,20.0,train,True,c1ccc(C[C@H](NC(=O)C)c2cc(cc(=O)o2)OC)cc1
2,MassSpecGymID0000003,"69.0343,91.0542,125.0233,127.039,153.0699,154....","0.03403403403403404,0.31431431431431434,1.0,0....",VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,40.0,train,True,c1cccc(c1)C[C@@H](c1cc(cc(o1)=O)OC)NC(C)=O
3,MassSpecGymID0000004,"69.0343,91.0542,110.06,111.0441,112.0393,120.0...","0.17917917917917917,0.47347347347347346,0.0380...",VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,55.0,train,True,c1cc(ccc1)C[C@@H](c1oc(=O)cc(OC)c1)NC(=O)C
4,MassSpecGymID0000005,"91.0542,125.0233,185.0961,229.0859,246.1125,28...","0.07807807807807808,0.1841841841841842,0.03503...",VFMQMACUYWGDOJ,C16H17NO4,C16H18NO4,287.115224,288.1225,[M+H]+,Orbitrap,10.0,train,True,N([C@H](c1oc(cc(OC)c1)=O)Cc1ccccc1)C(C)=O


#Preprocess ion mode, precursor m/z, and adducts
from pandarallel import pandarallel #Added by Pawan
pandarallel.initialize(nb_workers=16, progress_bar=True) #Added by Pawan
import time # Added by Pawan
start_time = time.time()
df_massspecgym['ion_mode'] = df_massspecgym['adduct'].parallel_apply(lambda x: 0 if '+' in str(x) else 1 if '-' in str(x) else 0).fillna(0)
df_massspecgym['precursor_bin'] = pd.qcut(df_massspecgym['precursor_mz'], q=100, labels=False, duplicates='drop')
df_external['ion_mode'] = df_external['adduct'].parallel_apply(lambda x: 0 if '+' in str(x) else 1 if '-' in str(x) else 0).fillna(0)
df_external['precursor_bin'] = pd.qcut(df_external['precursor_mz'], q=100, labels=False, duplicates='drop')
adduct_types = df_massspecgym['adduct'].unique()
adduct_to_idx = {adduct: i for i, adduct in enumerate(adduct_types)}
df_massspecgym['adduct_idx'] = df_massspecgym['adduct'].map(adduct_to_idx)
df_external['adduct_idx'] = df_external['adduct'].map(adduct_to_idx)

df_massspecgym[['binned', 'graph_data']] = df_massspecgym.parallel_apply(
    lambda row: pd.Series(bin_spectrum_to_graph(row['mzs'], row['intensities'], row['ion_mode'], row['precursor_mz'], row['adduct'])),
    axis=1
)
df_external[['binned', 'graph_data']] = df_external.parallel_apply(
    lambda row: pd.Series(bin_spectrum_to_graph(row['mzs'], row['intensities'], row['ion_mode'], row['precursor_mz'], row['adduct'])),
    axis=1
)
print("Completed in {:.2f} seconds".format(time.time() - start_time)) #Added by Pawan

In [3]:
#Preprocess ion mode, precursor m/z, and adducts
#Setup and Load
import os
import time
import pandas as pd
from pandarallel import pandarallel
import pyarrow as pa

pandarallel.initialize(nb_workers=16, progress_bar=True)

# Load datasets
df_massspecgym = pd.read_parquet("df_massspecgym.parquet")
df_external = pd.read_parquet("df_external.parquet")

# Build adduct mapping from df_massspecgym
adduct_types = df_massspecgym['adduct'].unique()
adduct_to_idx = {adduct: i for i, adduct in enumerate(adduct_types)}

# Create output directory
os.makedirs("processed_chunks", exist_ok=True)


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
#Preprocess ion mode, precursor m/z, and adducts
#Define Processing Function
def preprocess_chunk(df_chunk, chunk_idx):
    start_time = time.time()

    df_chunk['ion_mode'] = df_chunk['adduct'].parallel_apply(
        lambda x: 0 if '+' in str(x) else 1 if '-' in str(x) else 0
    ).fillna(0)

    df_chunk['precursor_bin'] = pd.qcut(
        df_chunk['precursor_mz'], q=100, labels=False, duplicates='drop'
    )

    df_chunk['adduct_idx'] = df_chunk['adduct'].map(adduct_to_idx)

    df_chunk[['binned', 'graph_data']] = df_chunk.parallel_apply(
        lambda row: pd.Series(bin_spectrum_to_graph(
            row['mzs'], row['intensities'], row['ion_mode'],
            row['precursor_mz'], row['adduct']
        )),
        axis=1
    )

    # Drop graph_data column before saving to avoid pyarrow error
    df_chunk.drop(columns=['graph_data'], inplace=True)

    df_chunk.to_parquet(f"processed_chunks/df_massspecgym_chunk_{chunk_idx:03}.parquet")
    print(f"✅ Saved chunk {chunk_idx} | Rows: {len(df_chunk)} | Time: {time.time() - start_time:.2f} sec")


In [5]:
#Preprocess ion mode, precursor m/z, and adducts
#Chunk df_massspecgym
chunk_size = 100_000
n_chunks = (len(df_massspecgym) + chunk_size - 1) // chunk_size

for i in range(n_chunks):
    output_file = f"processed_chunks/df_massspecgym_chunk_{i:03}.parquet"
    if os.path.exists(output_file):
        print(f"⏩ Skipping chunk {i} (already exists)")
        continue

    df_chunk = df_massspecgym.iloc[i * chunk_size : (i + 1) * chunk_size].copy()
    preprocess_chunk(df_chunk, i)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 0 | Rows: 100000 | Time: 52.61 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 1 | Rows: 100000 | Time: 64.91 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 2 | Rows: 100000 | Time: 55.39 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 3 | Rows: 100000 | Time: 54.43 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 4 | Rows: 100000 | Time: 56.63 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 5 | Rows: 100000 | Time: 56.80 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 6 | Rows: 100000 | Time: 55.22 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 7 | Rows: 100000 | Time: 55.11 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 8 | Rows: 100000 | Time: 54.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 9 | Rows: 100000 | Time: 53.41 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 10 | Rows: 100000 | Time: 53.27 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 11 | Rows: 100000 | Time: 53.84 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 12 | Rows: 100000 | Time: 53.49 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 13 | Rows: 100000 | Time: 53.63 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 14 | Rows: 100000 | Time: 55.54 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 15 | Rows: 100000 | Time: 53.54 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 16 | Rows: 100000 | Time: 53.66 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 17 | Rows: 100000 | Time: 55.17 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 18 | Rows: 100000 | Time: 54.15 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 19 | Rows: 100000 | Time: 54.73 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 20 | Rows: 100000 | Time: 56.42 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 21 | Rows: 100000 | Time: 54.24 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 22 | Rows: 100000 | Time: 54.60 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 23 | Rows: 100000 | Time: 53.51 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 24 | Rows: 100000 | Time: 52.96 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 25 | Rows: 100000 | Time: 54.11 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 26 | Rows: 100000 | Time: 54.78 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 27 | Rows: 100000 | Time: 53.36 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 28 | Rows: 100000 | Time: 53.02 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 29 | Rows: 100000 | Time: 53.29 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 30 | Rows: 100000 | Time: 53.08 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 31 | Rows: 100000 | Time: 53.84 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 32 | Rows: 100000 | Time: 53.15 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 33 | Rows: 100000 | Time: 53.13 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 34 | Rows: 100000 | Time: 56.98 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 35 | Rows: 100000 | Time: 56.03 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 36 | Rows: 100000 | Time: 54.57 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 37 | Rows: 100000 | Time: 53.01 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 38 | Rows: 100000 | Time: 57.84 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 39 | Rows: 100000 | Time: 55.42 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 40 | Rows: 100000 | Time: 55.70 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 41 | Rows: 100000 | Time: 57.48 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 42 | Rows: 100000 | Time: 58.72 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 43 | Rows: 100000 | Time: 57.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 44 | Rows: 100000 | Time: 59.25 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 45 | Rows: 100000 | Time: 59.27 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 46 | Rows: 100000 | Time: 58.32 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 47 | Rows: 100000 | Time: 53.31 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 48 | Rows: 100000 | Time: 55.91 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 49 | Rows: 100000 | Time: 57.57 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 50 | Rows: 100000 | Time: 56.63 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 51 | Rows: 100000 | Time: 58.24 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 52 | Rows: 100000 | Time: 58.92 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 53 | Rows: 100000 | Time: 57.24 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 54 | Rows: 100000 | Time: 57.62 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 55 | Rows: 100000 | Time: 59.16 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 56 | Rows: 100000 | Time: 57.05 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 57 | Rows: 100000 | Time: 58.11 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 58 | Rows: 100000 | Time: 60.19 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 59 | Rows: 100000 | Time: 59.55 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 60 | Rows: 100000 | Time: 55.67 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 61 | Rows: 100000 | Time: 57.70 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 62 | Rows: 100000 | Time: 57.26 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 63 | Rows: 100000 | Time: 58.27 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 64 | Rows: 100000 | Time: 55.41 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 65 | Rows: 100000 | Time: 54.48 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 66 | Rows: 100000 | Time: 53.52 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 67 | Rows: 100000 | Time: 54.81 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 68 | Rows: 100000 | Time: 53.88 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 69 | Rows: 100000 | Time: 54.76 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 70 | Rows: 100000 | Time: 54.96 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 71 | Rows: 100000 | Time: 55.29 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 72 | Rows: 100000 | Time: 54.90 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 73 | Rows: 100000 | Time: 55.23 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 74 | Rows: 100000 | Time: 55.60 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 75 | Rows: 100000 | Time: 53.83 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 76 | Rows: 100000 | Time: 54.99 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 77 | Rows: 100000 | Time: 54.22 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 78 | Rows: 100000 | Time: 54.21 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 79 | Rows: 100000 | Time: 53.47 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 80 | Rows: 100000 | Time: 54.33 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 81 | Rows: 100000 | Time: 54.28 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 82 | Rows: 100000 | Time: 54.08 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 83 | Rows: 100000 | Time: 55.78 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 84 | Rows: 100000 | Time: 53.51 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 85 | Rows: 100000 | Time: 55.01 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 86 | Rows: 100000 | Time: 55.19 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 87 | Rows: 100000 | Time: 54.78 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 88 | Rows: 100000 | Time: 53.62 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 89 | Rows: 100000 | Time: 55.40 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 90 | Rows: 100000 | Time: 54.04 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 91 | Rows: 100000 | Time: 56.18 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 92 | Rows: 100000 | Time: 55.13 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 93 | Rows: 100000 | Time: 55.88 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 94 | Rows: 100000 | Time: 55.27 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 95 | Rows: 100000 | Time: 55.54 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 96 | Rows: 100000 | Time: 54.58 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 97 | Rows: 100000 | Time: 53.99 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 98 | Rows: 100000 | Time: 55.62 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 99 | Rows: 100000 | Time: 53.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 100 | Rows: 100000 | Time: 55.82 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 101 | Rows: 100000 | Time: 53.20 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 102 | Rows: 100000 | Time: 56.08 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 103 | Rows: 100000 | Time: 55.22 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 104 | Rows: 100000 | Time: 55.63 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 105 | Rows: 100000 | Time: 54.24 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 106 | Rows: 100000 | Time: 55.91 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 107 | Rows: 100000 | Time: 55.11 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 108 | Rows: 100000 | Time: 54.94 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 109 | Rows: 100000 | Time: 54.34 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 110 | Rows: 100000 | Time: 54.43 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 111 | Rows: 100000 | Time: 55.51 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 112 | Rows: 100000 | Time: 54.67 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 113 | Rows: 100000 | Time: 54.32 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 114 | Rows: 100000 | Time: 55.08 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 115 | Rows: 100000 | Time: 54.82 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 116 | Rows: 100000 | Time: 54.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 117 | Rows: 100000 | Time: 55.36 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 118 | Rows: 100000 | Time: 55.54 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 119 | Rows: 100000 | Time: 54.93 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 120 | Rows: 100000 | Time: 53.91 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 121 | Rows: 100000 | Time: 55.60 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 122 | Rows: 100000 | Time: 56.46 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 123 | Rows: 100000 | Time: 54.73 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 124 | Rows: 100000 | Time: 55.00 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 125 | Rows: 100000 | Time: 55.30 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 126 | Rows: 100000 | Time: 53.83 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 127 | Rows: 100000 | Time: 53.61 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 128 | Rows: 100000 | Time: 52.53 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 129 | Rows: 100000 | Time: 55.85 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 130 | Rows: 100000 | Time: 55.56 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 131 | Rows: 100000 | Time: 55.16 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 132 | Rows: 100000 | Time: 53.82 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 133 | Rows: 100000 | Time: 55.20 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 134 | Rows: 100000 | Time: 55.19 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 135 | Rows: 100000 | Time: 54.82 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 136 | Rows: 100000 | Time: 56.06 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 137 | Rows: 100000 | Time: 54.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 138 | Rows: 100000 | Time: 54.48 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 139 | Rows: 100000 | Time: 54.61 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 140 | Rows: 100000 | Time: 53.74 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 141 | Rows: 100000 | Time: 53.52 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 142 | Rows: 100000 | Time: 55.65 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 143 | Rows: 100000 | Time: 53.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 144 | Rows: 100000 | Time: 54.40 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 145 | Rows: 100000 | Time: 55.60 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 146 | Rows: 100000 | Time: 54.78 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 147 | Rows: 100000 | Time: 54.40 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 148 | Rows: 100000 | Time: 54.88 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 149 | Rows: 100000 | Time: 56.27 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 150 | Rows: 100000 | Time: 53.74 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 151 | Rows: 100000 | Time: 55.23 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 152 | Rows: 100000 | Time: 55.26 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 153 | Rows: 100000 | Time: 54.96 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 154 | Rows: 100000 | Time: 56.22 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 155 | Rows: 100000 | Time: 54.70 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 156 | Rows: 100000 | Time: 55.76 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 157 | Rows: 100000 | Time: 53.87 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 158 | Rows: 100000 | Time: 53.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 159 | Rows: 100000 | Time: 55.37 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 160 | Rows: 100000 | Time: 55.76 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 161 | Rows: 100000 | Time: 55.58 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 162 | Rows: 100000 | Time: 55.78 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 163 | Rows: 100000 | Time: 55.21 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 164 | Rows: 100000 | Time: 56.27 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 165 | Rows: 100000 | Time: 56.30 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 166 | Rows: 100000 | Time: 56.02 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 167 | Rows: 100000 | Time: 55.57 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 168 | Rows: 100000 | Time: 55.55 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 169 | Rows: 100000 | Time: 56.81 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 170 | Rows: 100000 | Time: 55.74 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 171 | Rows: 100000 | Time: 56.58 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 172 | Rows: 100000 | Time: 56.46 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 173 | Rows: 100000 | Time: 55.59 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 174 | Rows: 100000 | Time: 53.90 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 175 | Rows: 100000 | Time: 55.13 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 176 | Rows: 100000 | Time: 53.81 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 177 | Rows: 100000 | Time: 56.33 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 178 | Rows: 100000 | Time: 55.36 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 179 | Rows: 100000 | Time: 54.97 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 180 | Rows: 100000 | Time: 56.63 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 181 | Rows: 100000 | Time: 54.41 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 182 | Rows: 100000 | Time: 56.11 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 183 | Rows: 100000 | Time: 56.42 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 184 | Rows: 100000 | Time: 56.36 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 185 | Rows: 100000 | Time: 57.40 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 186 | Rows: 100000 | Time: 55.19 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 187 | Rows: 100000 | Time: 55.36 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 188 | Rows: 100000 | Time: 57.28 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 189 | Rows: 100000 | Time: 55.09 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 190 | Rows: 100000 | Time: 56.96 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 191 | Rows: 100000 | Time: 54.87 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6250), Label(value='0 / 6250'))), …

✅ Saved chunk 192 | Rows: 100000 | Time: 57.44 sec


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1825), Label(value='0 / 1825'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1825), Label(value='0 / 1825'))), …

✅ Saved chunk 193 | Rows: 29189 | Time: 20.75 sec


In [4]:
# ✅ Preprocess ion mode, precursor m/z, and adducts
# ✅ Process each chunk efficiently → write to SSD → move to external HDD

import pandas as pd
import pickle
import glob
import gc
import os
import shutil
from pandarallel import pandarallel
from tqdm import tqdm
import time

start_time = time.time()
pandarallel.initialize(nb_workers=8, progress_bar=False)

# ✅ External HDD target directory (Seagate)
external_dir = "/media/onepaw/seagate_manual/graph_data_chunks"
os.makedirs(external_dir, exist_ok=True)

# ✅ Temporary SSD write directory
temp_dir = "graph_data_tmp"
os.makedirs(temp_dir, exist_ok=True)

# ✅ Load adduct mapping
df_massspecgym = pd.read_parquet("df_massspecgym.parquet", columns=["adduct"])
adduct_types = df_massspecgym['adduct'].unique()
adduct_to_idx = {adduct: i for i, adduct in enumerate(adduct_types)}
del df_massspecgym

# ✅ Process each chunk from SSD
chunk_files = sorted(glob.glob("processed_chunks/df_massspecgym_chunk_*.parquet"))

for i, chunk_file in enumerate(tqdm(chunk_files, desc="Processing chunks")):
    df = pd.read_parquet(chunk_file)

    graph_data = df.parallel_apply(
        lambda row: bin_spectrum_to_graph(
            row['mzs'], row['intensities'], row['ion_mode'],
            row['precursor_mz'], row['adduct']
        )[1],
        axis=1
    )

    # ✅ Save to SSD first (fast write)
    temp_path = os.path.join(temp_dir, f"graph_data_chunk_{i:03}.pkl")
    with open(temp_path, "wb") as f:
        pickle.dump(graph_data.tolist(), f)

    # ✅ Then move to external HDD to free SSD space
    final_path = os.path.join(external_dir, f"graph_data_chunk_{i:03}.pkl")
    shutil.move(temp_path, final_path)

    del df
    del graph_data
    gc.collect()

    print(f"✅ Processed and moved: graph_data_chunk_{i:03}.pkl")

print("🎉 All chunks processed and saved to external drive.")
print("🕒 Completed in {:.2f} seconds".format(time.time() - start_time))


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Processing chunks:   1%|▋                                                                                                                             | 1/194 [01:30<4:52:36, 90.97s/it]

✅ Processed and moved: graph_data_chunk_000.pkl


Processing chunks:   1%|█▎                                                                                                                            | 2/194 [02:59<4:46:57, 89.67s/it]

✅ Processed and moved: graph_data_chunk_001.pkl


Processing chunks:   2%|█▉                                                                                                                            | 3/194 [04:28<4:44:22, 89.33s/it]

✅ Processed and moved: graph_data_chunk_002.pkl


Processing chunks:   2%|██▌                                                                                                                           | 4/194 [05:56<4:41:37, 88.94s/it]

✅ Processed and moved: graph_data_chunk_003.pkl


Processing chunks:   3%|███▏                                                                                                                          | 5/194 [07:27<4:42:18, 89.62s/it]

✅ Processed and moved: graph_data_chunk_004.pkl


Processing chunks:   3%|███▉                                                                                                                          | 6/194 [08:57<4:41:18, 89.78s/it]

✅ Processed and moved: graph_data_chunk_005.pkl


Processing chunks:   4%|████▌                                                                                                                         | 7/194 [10:27<4:39:50, 89.79s/it]

✅ Processed and moved: graph_data_chunk_006.pkl


Processing chunks:   4%|█████▏                                                                                                                        | 8/194 [11:56<4:36:51, 89.31s/it]

✅ Processed and moved: graph_data_chunk_007.pkl


Processing chunks:   5%|█████▊                                                                                                                        | 9/194 [13:25<4:35:13, 89.26s/it]

✅ Processed and moved: graph_data_chunk_008.pkl


Processing chunks:   5%|██████▍                                                                                                                      | 10/194 [14:52<4:32:18, 88.80s/it]

✅ Processed and moved: graph_data_chunk_009.pkl


Processing chunks:   6%|███████                                                                                                                      | 11/194 [16:20<4:29:30, 88.36s/it]

✅ Processed and moved: graph_data_chunk_010.pkl


Processing chunks:   6%|███████▋                                                                                                                     | 12/194 [17:48<4:28:13, 88.43s/it]

✅ Processed and moved: graph_data_chunk_011.pkl


Processing chunks:   7%|████████▍                                                                                                                    | 13/194 [19:17<4:26:35, 88.37s/it]

✅ Processed and moved: graph_data_chunk_012.pkl


Processing chunks:   7%|█████████                                                                                                                    | 14/194 [20:45<4:24:41, 88.23s/it]

✅ Processed and moved: graph_data_chunk_013.pkl


Processing chunks:   8%|█████████▋                                                                                                                   | 15/194 [22:13<4:23:26, 88.31s/it]

✅ Processed and moved: graph_data_chunk_014.pkl


Processing chunks:   8%|██████████▎                                                                                                                  | 16/194 [23:41<4:21:20, 88.09s/it]

✅ Processed and moved: graph_data_chunk_015.pkl


Processing chunks:   9%|██████████▉                                                                                                                  | 17/194 [25:09<4:19:55, 88.11s/it]

✅ Processed and moved: graph_data_chunk_016.pkl


Processing chunks:   9%|███████████▌                                                                                                                 | 18/194 [26:37<4:18:32, 88.14s/it]

✅ Processed and moved: graph_data_chunk_017.pkl


Processing chunks:  10%|████████████▏                                                                                                                | 19/194 [28:06<4:18:13, 88.53s/it]

✅ Processed and moved: graph_data_chunk_018.pkl


Processing chunks:  10%|████████████▉                                                                                                                | 20/194 [29:35<4:16:52, 88.57s/it]

✅ Processed and moved: graph_data_chunk_019.pkl


Processing chunks:  11%|█████████████▌                                                                                                               | 21/194 [31:06<4:17:05, 89.17s/it]

✅ Processed and moved: graph_data_chunk_020.pkl


Processing chunks:  11%|██████████████▏                                                                                                              | 22/194 [32:35<4:15:55, 89.28s/it]

✅ Processed and moved: graph_data_chunk_021.pkl


Processing chunks:  12%|██████████████▊                                                                                                              | 23/194 [34:03<4:13:31, 88.95s/it]

✅ Processed and moved: graph_data_chunk_022.pkl


Processing chunks:  12%|███████████████▍                                                                                                             | 24/194 [35:31<4:11:12, 88.66s/it]

✅ Processed and moved: graph_data_chunk_023.pkl


Processing chunks:  13%|████████████████                                                                                                             | 25/194 [36:59<4:08:46, 88.32s/it]

✅ Processed and moved: graph_data_chunk_024.pkl


Processing chunks:  13%|████████████████▊                                                                                                            | 26/194 [38:27<4:07:24, 88.36s/it]

✅ Processed and moved: graph_data_chunk_025.pkl


Processing chunks:  14%|█████████████████▍                                                                                                           | 27/194 [39:55<4:05:17, 88.13s/it]

✅ Processed and moved: graph_data_chunk_026.pkl


Processing chunks:  14%|██████████████████                                                                                                           | 28/194 [41:23<4:04:04, 88.22s/it]

✅ Processed and moved: graph_data_chunk_027.pkl


Processing chunks:  15%|██████████████████▋                                                                                                          | 29/194 [42:51<4:02:00, 88.00s/it]

✅ Processed and moved: graph_data_chunk_028.pkl


Processing chunks:  15%|███████████████████▎                                                                                                         | 30/194 [44:19<4:00:25, 87.96s/it]

✅ Processed and moved: graph_data_chunk_029.pkl


Processing chunks:  16%|███████████████████▉                                                                                                         | 31/194 [45:47<3:58:57, 87.96s/it]

✅ Processed and moved: graph_data_chunk_030.pkl


Processing chunks:  16%|████████████████████▌                                                                                                        | 32/194 [47:15<3:57:55, 88.12s/it]

✅ Processed and moved: graph_data_chunk_031.pkl


Processing chunks:  17%|█████████████████████▎                                                                                                       | 33/194 [48:43<3:56:15, 88.05s/it]

✅ Processed and moved: graph_data_chunk_032.pkl


Processing chunks:  18%|█████████████████████▉                                                                                                       | 34/194 [50:11<3:55:03, 88.15s/it]

✅ Processed and moved: graph_data_chunk_033.pkl


Processing chunks:  18%|██████████████████████▌                                                                                                      | 35/194 [51:40<3:54:16, 88.41s/it]

✅ Processed and moved: graph_data_chunk_034.pkl


Processing chunks:  19%|███████████████████████▏                                                                                                     | 36/194 [53:12<3:54:56, 89.22s/it]

✅ Processed and moved: graph_data_chunk_035.pkl


Processing chunks:  19%|███████████████████████▊                                                                                                     | 37/194 [54:39<3:51:52, 88.62s/it]

✅ Processed and moved: graph_data_chunk_036.pkl


Processing chunks:  20%|████████████████████████▍                                                                                                    | 38/194 [56:08<3:50:37, 88.70s/it]

✅ Processed and moved: graph_data_chunk_037.pkl


Processing chunks:  20%|█████████████████████████▏                                                                                                   | 39/194 [57:39<3:50:48, 89.35s/it]

✅ Processed and moved: graph_data_chunk_038.pkl


Processing chunks:  21%|█████████████████████████▊                                                                                                   | 40/194 [59:08<3:49:11, 89.30s/it]

✅ Processed and moved: graph_data_chunk_039.pkl


Processing chunks:  21%|█████████████████████████▉                                                                                                 | 41/194 [1:00:37<3:47:34, 89.24s/it]

✅ Processed and moved: graph_data_chunk_040.pkl


Processing chunks:  22%|██████████████████████████▋                                                                                                | 42/194 [1:02:08<3:47:40, 89.87s/it]

✅ Processed and moved: graph_data_chunk_041.pkl


Processing chunks:  22%|███████████████████████████▎                                                                                               | 43/194 [1:03:40<3:47:55, 90.57s/it]

✅ Processed and moved: graph_data_chunk_042.pkl


Processing chunks:  23%|███████████████████████████▉                                                                                               | 44/194 [1:05:11<3:46:27, 90.59s/it]

✅ Processed and moved: graph_data_chunk_043.pkl


Processing chunks:  23%|████████████████████████████▌                                                                                              | 45/194 [1:06:45<3:47:19, 91.54s/it]

✅ Processed and moved: graph_data_chunk_044.pkl


Processing chunks:  24%|█████████████████████████████▏                                                                                             | 46/194 [1:08:19<3:47:29, 92.23s/it]

✅ Processed and moved: graph_data_chunk_045.pkl


Processing chunks:  24%|█████████████████████████████▊                                                                                             | 47/194 [1:09:49<3:45:00, 91.84s/it]

✅ Processed and moved: graph_data_chunk_046.pkl


Processing chunks:  25%|██████████████████████████████▍                                                                                            | 48/194 [1:11:18<3:41:00, 90.83s/it]

✅ Processed and moved: graph_data_chunk_047.pkl


Processing chunks:  25%|███████████████████████████████                                                                                            | 49/194 [1:12:47<3:37:52, 90.16s/it]

✅ Processed and moved: graph_data_chunk_048.pkl


Processing chunks:  26%|███████████████████████████████▋                                                                                           | 50/194 [1:14:18<3:37:31, 90.64s/it]

✅ Processed and moved: graph_data_chunk_049.pkl


Processing chunks:  26%|████████████████████████████████▎                                                                                          | 51/194 [1:15:50<3:36:34, 90.87s/it]

✅ Processed and moved: graph_data_chunk_050.pkl


Processing chunks:  27%|████████████████████████████████▉                                                                                          | 52/194 [1:17:21<3:35:12, 90.94s/it]

✅ Processed and moved: graph_data_chunk_051.pkl


Processing chunks:  27%|█████████████████████████████████▌                                                                                         | 53/194 [1:18:55<3:36:07, 91.97s/it]

✅ Processed and moved: graph_data_chunk_052.pkl


Processing chunks:  28%|██████████████████████████████████▏                                                                                        | 54/194 [1:20:27<3:34:16, 91.83s/it]

✅ Processed and moved: graph_data_chunk_053.pkl


Processing chunks:  28%|██████████████████████████████████▊                                                                                        | 55/194 [1:22:00<3:33:36, 92.20s/it]

✅ Processed and moved: graph_data_chunk_054.pkl


Processing chunks:  29%|███████████████████████████████████▌                                                                                       | 56/194 [1:23:33<3:32:29, 92.38s/it]

✅ Processed and moved: graph_data_chunk_055.pkl


Processing chunks:  29%|████████████████████████████████████▏                                                                                      | 57/194 [1:25:05<3:30:54, 92.37s/it]

✅ Processed and moved: graph_data_chunk_056.pkl


Processing chunks:  30%|████████████████████████████████████▊                                                                                      | 58/194 [1:26:39<3:30:21, 92.81s/it]

✅ Processed and moved: graph_data_chunk_057.pkl


Processing chunks:  30%|█████████████████████████████████████▍                                                                                     | 59/194 [1:28:11<3:28:30, 92.67s/it]

✅ Processed and moved: graph_data_chunk_058.pkl


Processing chunks:  31%|██████████████████████████████████████                                                                                     | 60/194 [1:29:44<3:27:22, 92.85s/it]

✅ Processed and moved: graph_data_chunk_059.pkl


Processing chunks:  31%|██████████████████████████████████████▋                                                                                    | 61/194 [1:31:14<3:23:47, 91.94s/it]

✅ Processed and moved: graph_data_chunk_060.pkl


Processing chunks:  32%|███████████████████████████████████████▎                                                                                   | 62/194 [1:32:46<3:22:15, 91.93s/it]

✅ Processed and moved: graph_data_chunk_061.pkl


Processing chunks:  32%|███████████████████████████████████████▉                                                                                   | 63/194 [1:34:19<3:21:04, 92.10s/it]

✅ Processed and moved: graph_data_chunk_062.pkl


Processing chunks:  33%|████████████████████████████████████████▌                                                                                  | 64/194 [1:35:50<3:19:22, 92.02s/it]

✅ Processed and moved: graph_data_chunk_063.pkl


Processing chunks:  34%|█████████████████████████████████████████▏                                                                                 | 65/194 [1:37:20<3:16:00, 91.16s/it]

✅ Processed and moved: graph_data_chunk_064.pkl


Processing chunks:  34%|█████████████████████████████████████████▊                                                                                 | 66/194 [1:38:48<3:12:37, 90.30s/it]

✅ Processed and moved: graph_data_chunk_065.pkl


Processing chunks:  35%|██████████████████████████████████████████▍                                                                                | 67/194 [1:40:16<3:09:49, 89.68s/it]

✅ Processed and moved: graph_data_chunk_066.pkl


Processing chunks:  35%|███████████████████████████████████████████                                                                                | 68/194 [1:41:44<3:07:29, 89.28s/it]

✅ Processed and moved: graph_data_chunk_067.pkl


Processing chunks:  36%|███████████████████████████████████████████▋                                                                               | 69/194 [1:43:13<3:05:35, 89.08s/it]

✅ Processed and moved: graph_data_chunk_068.pkl


Processing chunks:  36%|████████████████████████████████████████████▍                                                                              | 70/194 [1:44:41<3:03:12, 88.65s/it]

✅ Processed and moved: graph_data_chunk_069.pkl


Processing chunks:  37%|█████████████████████████████████████████████                                                                              | 71/194 [1:46:09<3:01:12, 88.40s/it]

✅ Processed and moved: graph_data_chunk_070.pkl


Processing chunks:  37%|█████████████████████████████████████████████▋                                                                             | 72/194 [1:47:37<2:59:43, 88.39s/it]

✅ Processed and moved: graph_data_chunk_071.pkl


Processing chunks:  38%|██████████████████████████████████████████████▎                                                                            | 73/194 [1:49:05<2:57:57, 88.25s/it]

✅ Processed and moved: graph_data_chunk_072.pkl


Processing chunks:  38%|██████████████████████████████████████████████▉                                                                            | 74/194 [1:50:33<2:56:28, 88.24s/it]

✅ Processed and moved: graph_data_chunk_073.pkl


Processing chunks:  39%|███████████████████████████████████████████████▌                                                                           | 75/194 [1:52:01<2:54:40, 88.07s/it]

✅ Processed and moved: graph_data_chunk_074.pkl


Processing chunks:  39%|████████████████████████████████████████████████▏                                                                          | 76/194 [1:53:29<2:53:12, 88.07s/it]

✅ Processed and moved: graph_data_chunk_075.pkl


Processing chunks:  40%|████████████████████████████████████████████████▊                                                                          | 77/194 [1:54:58<2:52:10, 88.30s/it]

✅ Processed and moved: graph_data_chunk_076.pkl


Processing chunks:  40%|█████████████████████████████████████████████████▍                                                                         | 78/194 [1:56:26<2:51:01, 88.46s/it]

✅ Processed and moved: graph_data_chunk_077.pkl


Processing chunks:  41%|██████████████████████████████████████████████████                                                                         | 79/194 [1:57:55<2:49:23, 88.38s/it]

✅ Processed and moved: graph_data_chunk_078.pkl


Processing chunks:  41%|██████████████████████████████████████████████████▋                                                                        | 80/194 [1:59:22<2:47:34, 88.19s/it]

✅ Processed and moved: graph_data_chunk_079.pkl


Processing chunks:  42%|███████████████████████████████████████████████████▎                                                                       | 81/194 [2:00:50<2:45:50, 88.06s/it]

✅ Processed and moved: graph_data_chunk_080.pkl


Processing chunks:  42%|███████████████████████████████████████████████████▉                                                                       | 82/194 [2:02:19<2:44:49, 88.30s/it]

✅ Processed and moved: graph_data_chunk_081.pkl


Processing chunks:  43%|████████████████████████████████████████████████████▌                                                                      | 83/194 [2:03:48<2:43:34, 88.42s/it]

✅ Processed and moved: graph_data_chunk_082.pkl


Processing chunks:  43%|█████████████████████████████████████████████████████▎                                                                     | 84/194 [2:05:16<2:41:56, 88.33s/it]

✅ Processed and moved: graph_data_chunk_083.pkl


Processing chunks:  44%|█████████████████████████████████████████████████████▉                                                                     | 85/194 [2:06:44<2:40:22, 88.28s/it]

✅ Processed and moved: graph_data_chunk_084.pkl


Processing chunks:  44%|██████████████████████████████████████████████████████▌                                                                    | 86/194 [2:08:13<2:39:11, 88.44s/it]

✅ Processed and moved: graph_data_chunk_085.pkl


Processing chunks:  45%|███████████████████████████████████████████████████████▏                                                                   | 87/194 [2:09:41<2:37:46, 88.47s/it]

✅ Processed and moved: graph_data_chunk_086.pkl


Processing chunks:  45%|███████████████████████████████████████████████████████▊                                                                   | 88/194 [2:11:10<2:36:29, 88.58s/it]

✅ Processed and moved: graph_data_chunk_087.pkl


Processing chunks:  46%|████████████████████████████████████████████████████████▍                                                                  | 89/194 [2:12:38<2:34:40, 88.39s/it]

✅ Processed and moved: graph_data_chunk_088.pkl


Processing chunks:  46%|█████████████████████████████████████████████████████████                                                                  | 90/194 [2:14:05<2:32:28, 87.97s/it]

✅ Processed and moved: graph_data_chunk_089.pkl


Processing chunks:  47%|█████████████████████████████████████████████████████████▋                                                                 | 91/194 [2:15:34<2:31:17, 88.14s/it]

✅ Processed and moved: graph_data_chunk_090.pkl


Processing chunks:  47%|██████████████████████████████████████████████████████████▎                                                                | 92/194 [2:17:02<2:29:45, 88.10s/it]

✅ Processed and moved: graph_data_chunk_091.pkl


Processing chunks:  48%|██████████████████████████████████████████████████████████▉                                                                | 93/194 [2:18:29<2:27:56, 87.89s/it]

✅ Processed and moved: graph_data_chunk_092.pkl


Processing chunks:  48%|███████████████████████████████████████████████████████████▌                                                               | 94/194 [2:19:57<2:26:34, 87.94s/it]

✅ Processed and moved: graph_data_chunk_093.pkl


Processing chunks:  49%|████████████████████████████████████████████████████████████▏                                                              | 95/194 [2:21:26<2:25:40, 88.29s/it]

✅ Processed and moved: graph_data_chunk_094.pkl


Processing chunks:  49%|████████████████████████████████████████████████████████████▊                                                              | 96/194 [2:22:54<2:23:55, 88.11s/it]

✅ Processed and moved: graph_data_chunk_095.pkl


Processing chunks:  50%|█████████████████████████████████████████████████████████████▌                                                             | 97/194 [2:24:22<2:22:39, 88.24s/it]

✅ Processed and moved: graph_data_chunk_096.pkl


Processing chunks:  51%|██████████████████████████████████████████████████████████████▏                                                            | 98/194 [2:25:52<2:21:37, 88.51s/it]

✅ Processed and moved: graph_data_chunk_097.pkl


Processing chunks:  51%|██████████████████████████████████████████████████████████████▊                                                            | 99/194 [2:27:21<2:20:22, 88.66s/it]

✅ Processed and moved: graph_data_chunk_098.pkl


Processing chunks:  52%|██████████████████████████████████████████████████████████████▉                                                           | 100/194 [2:28:49<2:18:43, 88.55s/it]

✅ Processed and moved: graph_data_chunk_099.pkl


Processing chunks:  52%|███████████████████████████████████████████████████████████████▌                                                          | 101/194 [2:30:17<2:17:14, 88.54s/it]

✅ Processed and moved: graph_data_chunk_100.pkl


Processing chunks:  53%|████████████████████████████████████████████████████████████████▏                                                         | 102/194 [2:31:46<2:15:51, 88.60s/it]

✅ Processed and moved: graph_data_chunk_101.pkl


Processing chunks:  53%|████████████████████████████████████████████████████████████████▊                                                         | 103/194 [2:33:15<2:14:25, 88.63s/it]

✅ Processed and moved: graph_data_chunk_102.pkl


Processing chunks:  54%|█████████████████████████████████████████████████████████████████▍                                                        | 104/194 [2:34:44<2:12:57, 88.64s/it]

✅ Processed and moved: graph_data_chunk_103.pkl


Processing chunks:  54%|██████████████████████████████████████████████████████████████████                                                        | 105/194 [2:36:12<2:11:25, 88.60s/it]

✅ Processed and moved: graph_data_chunk_104.pkl


Processing chunks:  55%|██████████████████████████████████████████████████████████████████▋                                                       | 106/194 [2:37:40<2:09:47, 88.50s/it]

✅ Processed and moved: graph_data_chunk_105.pkl


Processing chunks:  55%|███████████████████████████████████████████████████████████████████▎                                                      | 107/194 [2:39:08<2:08:09, 88.38s/it]

✅ Processed and moved: graph_data_chunk_106.pkl


Processing chunks:  56%|███████████████████████████████████████████████████████████████████▉                                                      | 108/194 [2:40:37<2:06:36, 88.33s/it]

✅ Processed and moved: graph_data_chunk_107.pkl


Processing chunks:  56%|████████████████████████████████████████████████████████████████████▌                                                     | 109/194 [2:42:05<2:05:02, 88.27s/it]

✅ Processed and moved: graph_data_chunk_108.pkl


Processing chunks:  57%|█████████████████████████████████████████████████████████████████████▏                                                    | 110/194 [2:43:33<2:03:27, 88.18s/it]

✅ Processed and moved: graph_data_chunk_109.pkl


Processing chunks:  57%|█████████████████████████████████████████████████████████████████████▊                                                    | 111/194 [2:45:01<2:01:49, 88.07s/it]

✅ Processed and moved: graph_data_chunk_110.pkl


Processing chunks:  58%|██████████████████████████████████████████████████████████████████████▍                                                   | 112/194 [2:46:29<2:00:29, 88.16s/it]

✅ Processed and moved: graph_data_chunk_111.pkl


Processing chunks:  58%|███████████████████████████████████████████████████████████████████████                                                   | 113/194 [2:47:57<1:59:08, 88.25s/it]

✅ Processed and moved: graph_data_chunk_112.pkl


Processing chunks:  59%|███████████████████████████████████████████████████████████████████████▋                                                  | 114/194 [2:49:25<1:57:33, 88.17s/it]

✅ Processed and moved: graph_data_chunk_113.pkl


Processing chunks:  59%|████████████████████████████████████████████████████████████████████████▎                                                 | 115/194 [2:50:54<1:56:11, 88.24s/it]

✅ Processed and moved: graph_data_chunk_114.pkl


Processing chunks:  60%|████████████████████████████████████████████████████████████████████████▉                                                 | 116/194 [2:52:22<1:54:41, 88.22s/it]

✅ Processed and moved: graph_data_chunk_115.pkl


Processing chunks:  60%|█████████████████████████████████████████████████████████████████████████▌                                                | 117/194 [2:53:50<1:53:13, 88.23s/it]

✅ Processed and moved: graph_data_chunk_116.pkl


Processing chunks:  61%|██████████████████████████████████████████████████████████████████████████▏                                               | 118/194 [2:55:20<1:52:31, 88.83s/it]

✅ Processed and moved: graph_data_chunk_117.pkl


Processing chunks:  61%|██████████████████████████████████████████████████████████████████████████▊                                               | 119/194 [2:56:49<1:50:57, 88.77s/it]

✅ Processed and moved: graph_data_chunk_118.pkl


Processing chunks:  62%|███████████████████████████████████████████████████████████████████████████▍                                              | 120/194 [2:58:17<1:49:14, 88.58s/it]

✅ Processed and moved: graph_data_chunk_119.pkl


Processing chunks:  62%|████████████████████████████████████████████████████████████████████████████                                              | 121/194 [2:59:46<1:47:46, 88.58s/it]

✅ Processed and moved: graph_data_chunk_120.pkl


Processing chunks:  63%|████████████████████████████████████████████████████████████████████████████▋                                             | 122/194 [3:01:14<1:46:13, 88.52s/it]

✅ Processed and moved: graph_data_chunk_121.pkl


Processing chunks:  63%|█████████████████████████████████████████████████████████████████████████████▎                                            | 123/194 [3:02:42<1:44:38, 88.43s/it]

✅ Processed and moved: graph_data_chunk_122.pkl


Processing chunks:  64%|█████████████████████████████████████████████████████████████████████████████▉                                            | 124/194 [3:04:11<1:43:11, 88.45s/it]

✅ Processed and moved: graph_data_chunk_123.pkl


Processing chunks:  64%|██████████████████████████████████████████████████████████████████████████████▌                                           | 125/194 [3:05:39<1:41:35, 88.34s/it]

✅ Processed and moved: graph_data_chunk_124.pkl


Processing chunks:  65%|███████████████████████████████████████████████████████████████████████████████▏                                          | 126/194 [3:07:07<1:40:11, 88.41s/it]

✅ Processed and moved: graph_data_chunk_125.pkl


Processing chunks:  65%|███████████████████████████████████████████████████████████████████████████████▊                                          | 127/194 [3:08:36<1:38:51, 88.53s/it]

✅ Processed and moved: graph_data_chunk_126.pkl


Processing chunks:  66%|████████████████████████████████████████████████████████████████████████████████▍                                         | 128/194 [3:10:04<1:37:15, 88.42s/it]

✅ Processed and moved: graph_data_chunk_127.pkl


Processing chunks:  66%|█████████████████████████████████████████████████████████████████████████████████                                         | 129/194 [3:11:33<1:35:51, 88.48s/it]

✅ Processed and moved: graph_data_chunk_128.pkl


Processing chunks:  67%|█████████████████████████████████████████████████████████████████████████████████▊                                        | 130/194 [3:13:02<1:34:31, 88.61s/it]

✅ Processed and moved: graph_data_chunk_129.pkl


Processing chunks:  68%|██████████████████████████████████████████████████████████████████████████████████▍                                       | 131/194 [3:14:30<1:32:52, 88.45s/it]

✅ Processed and moved: graph_data_chunk_130.pkl


Processing chunks:  68%|███████████████████████████████████████████████████████████████████████████████████                                       | 132/194 [3:15:59<1:31:30, 88.56s/it]

✅ Processed and moved: graph_data_chunk_131.pkl


Processing chunks:  69%|███████████████████████████████████████████████████████████████████████████████████▋                                      | 133/194 [3:17:28<1:30:03, 88.58s/it]

✅ Processed and moved: graph_data_chunk_132.pkl


Processing chunks:  69%|████████████████████████████████████████████████████████████████████████████████████▎                                     | 134/194 [3:18:56<1:28:36, 88.61s/it]

✅ Processed and moved: graph_data_chunk_133.pkl


Processing chunks:  70%|████████████████████████████████████████████████████████████████████████████████████▉                                     | 135/194 [3:20:25<1:27:15, 88.73s/it]

✅ Processed and moved: graph_data_chunk_134.pkl


Processing chunks:  70%|█████████████████████████████████████████████████████████████████████████████████████▌                                    | 136/194 [3:21:53<1:25:33, 88.50s/it]

✅ Processed and moved: graph_data_chunk_135.pkl


Processing chunks:  71%|██████████████████████████████████████████████████████████████████████████████████████▏                                   | 137/194 [3:23:22<1:24:07, 88.55s/it]

✅ Processed and moved: graph_data_chunk_136.pkl


Processing chunks:  71%|██████████████████████████████████████████████████████████████████████████████████████▊                                   | 138/194 [3:24:50<1:22:37, 88.52s/it]

✅ Processed and moved: graph_data_chunk_137.pkl


Processing chunks:  72%|███████████████████████████████████████████████████████████████████████████████████████▍                                  | 139/194 [3:26:19<1:21:09, 88.53s/it]

✅ Processed and moved: graph_data_chunk_138.pkl


Processing chunks:  72%|████████████████████████████████████████████████████████████████████████████████████████                                  | 140/194 [3:27:47<1:19:38, 88.50s/it]

✅ Processed and moved: graph_data_chunk_139.pkl


Processing chunks:  73%|████████████████████████████████████████████████████████████████████████████████████████▋                                 | 141/194 [3:29:15<1:18:03, 88.38s/it]

✅ Processed and moved: graph_data_chunk_140.pkl


Processing chunks:  73%|█████████████████████████████████████████████████████████████████████████████████████████▎                                | 142/194 [3:30:44<1:16:37, 88.41s/it]

✅ Processed and moved: graph_data_chunk_141.pkl


Processing chunks:  74%|█████████████████████████████████████████████████████████████████████████████████████████▉                                | 143/194 [3:32:12<1:15:06, 88.36s/it]

✅ Processed and moved: graph_data_chunk_142.pkl


Processing chunks:  74%|██████████████████████████████████████████████████████████████████████████████████████████▌                               | 144/194 [3:33:40<1:13:32, 88.25s/it]

✅ Processed and moved: graph_data_chunk_143.pkl


Processing chunks:  75%|███████████████████████████████████████████████████████████████████████████████████████████▏                              | 145/194 [3:35:09<1:12:07, 88.32s/it]

✅ Processed and moved: graph_data_chunk_144.pkl


Processing chunks:  75%|███████████████████████████████████████████████████████████████████████████████████████████▊                              | 146/194 [3:36:37<1:10:37, 88.29s/it]

✅ Processed and moved: graph_data_chunk_145.pkl


Processing chunks:  76%|████████████████████████████████████████████████████████████████████████████████████████████▍                             | 147/194 [3:38:06<1:09:15, 88.42s/it]

✅ Processed and moved: graph_data_chunk_146.pkl


Processing chunks:  76%|█████████████████████████████████████████████████████████████████████████████████████████████                             | 148/194 [3:39:34<1:07:53, 88.55s/it]

✅ Processed and moved: graph_data_chunk_147.pkl


Processing chunks:  77%|█████████████████████████████████████████████████████████████████████████████████████████████▋                            | 149/194 [3:41:03<1:06:22, 88.51s/it]

✅ Processed and moved: graph_data_chunk_148.pkl


Processing chunks:  77%|██████████████████████████████████████████████████████████████████████████████████████████████▎                           | 150/194 [3:42:32<1:05:01, 88.68s/it]

✅ Processed and moved: graph_data_chunk_149.pkl


Processing chunks:  78%|██████████████████████████████████████████████████████████████████████████████████████████████▉                           | 151/194 [3:44:01<1:03:36, 88.75s/it]

✅ Processed and moved: graph_data_chunk_150.pkl


Processing chunks:  78%|███████████████████████████████████████████████████████████████████████████████████████████████▌                          | 152/194 [3:45:29<1:02:02, 88.64s/it]

✅ Processed and moved: graph_data_chunk_151.pkl


Processing chunks:  79%|████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 153/194 [3:46:58<1:00:35, 88.67s/it]

✅ Processed and moved: graph_data_chunk_152.pkl


Processing chunks:  79%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 154/194 [3:48:27<59:09, 88.75s/it]

✅ Processed and moved: graph_data_chunk_153.pkl


Processing chunks:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████                         | 155/194 [3:49:57<57:59, 89.21s/it]

✅ Processed and moved: graph_data_chunk_154.pkl


Processing chunks:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 156/194 [3:51:26<56:23, 89.04s/it]

✅ Processed and moved: graph_data_chunk_155.pkl


Processing chunks:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 157/194 [3:52:54<54:51, 88.95s/it]

✅ Processed and moved: graph_data_chunk_156.pkl


Processing chunks:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                       | 158/194 [3:54:22<53:10, 88.63s/it]

✅ Processed and moved: graph_data_chunk_157.pkl


Processing chunks:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                      | 159/194 [3:55:51<51:42, 88.65s/it]

✅ Processed and moved: graph_data_chunk_158.pkl


Processing chunks:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 160/194 [3:57:19<50:08, 88.49s/it]

✅ Processed and moved: graph_data_chunk_159.pkl


Processing chunks:  83%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 161/194 [3:58:49<48:49, 88.78s/it]

✅ Processed and moved: graph_data_chunk_160.pkl


Processing chunks:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                    | 162/194 [4:00:17<47:18, 88.71s/it]

✅ Processed and moved: graph_data_chunk_161.pkl


Processing chunks:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 163/194 [4:01:45<45:45, 88.57s/it]

✅ Processed and moved: graph_data_chunk_162.pkl


Processing chunks:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 164/194 [4:03:14<44:18, 88.61s/it]

✅ Processed and moved: graph_data_chunk_163.pkl


Processing chunks:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 165/194 [4:04:43<42:48, 88.56s/it]

✅ Processed and moved: graph_data_chunk_164.pkl


Processing chunks:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 166/194 [4:06:11<41:15, 88.42s/it]

✅ Processed and moved: graph_data_chunk_165.pkl


Processing chunks:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 167/194 [4:07:39<39:47, 88.41s/it]

✅ Processed and moved: graph_data_chunk_166.pkl


Processing chunks:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 168/194 [4:09:08<38:23, 88.60s/it]

✅ Processed and moved: graph_data_chunk_167.pkl


Processing chunks:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 169/194 [4:10:37<36:55, 88.60s/it]

✅ Processed and moved: graph_data_chunk_168.pkl


Processing chunks:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋               | 170/194 [4:12:05<35:25, 88.55s/it]

✅ Processed and moved: graph_data_chunk_169.pkl


Processing chunks:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 171/194 [4:13:34<33:55, 88.50s/it]

✅ Processed and moved: graph_data_chunk_170.pkl


Processing chunks:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 172/194 [4:15:02<32:29, 88.62s/it]

✅ Processed and moved: graph_data_chunk_171.pkl


Processing chunks:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 173/194 [4:16:31<31:00, 88.59s/it]

✅ Processed and moved: graph_data_chunk_172.pkl


Processing chunks:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 174/194 [4:17:59<29:31, 88.57s/it]

✅ Processed and moved: graph_data_chunk_173.pkl


Processing chunks:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 175/194 [4:19:28<28:04, 88.64s/it]

✅ Processed and moved: graph_data_chunk_174.pkl


Processing chunks:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 176/194 [4:20:56<26:32, 88.47s/it]

✅ Processed and moved: graph_data_chunk_175.pkl


Processing chunks:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 177/194 [4:22:25<25:03, 88.47s/it]

✅ Processed and moved: graph_data_chunk_176.pkl


Processing chunks:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 178/194 [4:23:53<23:34, 88.42s/it]

✅ Processed and moved: graph_data_chunk_177.pkl


Processing chunks:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍         | 179/194 [4:25:22<22:06, 88.42s/it]

✅ Processed and moved: graph_data_chunk_178.pkl


Processing chunks:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 180/194 [4:26:50<20:39, 88.57s/it]

✅ Processed and moved: graph_data_chunk_179.pkl


Processing chunks:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 181/194 [4:28:19<19:10, 88.51s/it]

✅ Processed and moved: graph_data_chunk_180.pkl


Processing chunks:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 182/194 [4:29:48<17:43, 88.59s/it]

✅ Processed and moved: graph_data_chunk_181.pkl


Processing chunks:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 183/194 [4:31:17<16:17, 88.82s/it]

✅ Processed and moved: graph_data_chunk_182.pkl


Processing chunks:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 184/194 [4:32:46<14:47, 88.76s/it]

✅ Processed and moved: graph_data_chunk_183.pkl


Processing chunks:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 185/194 [4:34:14<13:18, 88.73s/it]

✅ Processed and moved: graph_data_chunk_184.pkl


Processing chunks:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 186/194 [4:35:43<11:49, 88.65s/it]

✅ Processed and moved: graph_data_chunk_185.pkl


Processing chunks:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 187/194 [4:37:12<10:21, 88.82s/it]

✅ Processed and moved: graph_data_chunk_186.pkl


Processing chunks:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 188/194 [4:38:41<08:53, 88.84s/it]

✅ Processed and moved: graph_data_chunk_187.pkl


Processing chunks:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 189/194 [4:40:09<07:23, 88.69s/it]

✅ Processed and moved: graph_data_chunk_188.pkl


Processing chunks:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍  | 190/194 [4:41:38<05:54, 88.70s/it]

✅ Processed and moved: graph_data_chunk_189.pkl


Processing chunks:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 191/194 [4:43:06<04:25, 88.66s/it]

✅ Processed and moved: graph_data_chunk_190.pkl


Processing chunks:  99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 192/194 [4:44:35<02:57, 88.51s/it]

✅ Processed and moved: graph_data_chunk_191.pkl


Processing chunks:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 193/194 [4:46:04<01:28, 88.73s/it]

✅ Processed and moved: graph_data_chunk_192.pkl


Processing chunks: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 194/194 [4:46:30<00:00, 88.61s/it]

✅ Processed and moved: graph_data_chunk_193.pkl
🎉 All chunks processed and saved to external drive.
🕒 Completed in 17191.72 seconds





In [3]:
# Stream-load each chunk and write them one by one to HDD to avoid RAM issues
import pickle
import glob
import os
from tqdm import tqdm

# ✅ Directory where pickled graph_data chunks are stored (on external HDD)
external_dir = "/media/onepaw/seagate_manual/graph_data_chunks"
chunk_files = sorted(glob.glob(os.path.join(external_dir, "graph_data_chunk_*.pkl")))

# ✅ Output path on external HDD
merged_path = "/media/onepaw/seagate_manual/df_massspecgym_graph_data_streamed.pkl"

# ✅ Open final output in append-binary mode
with open(merged_path, "wb") as out_f:
    for chunk_file in tqdm(chunk_files, desc="Merging (streamed)"):
        with open(chunk_file, "rb") as in_f:
            data = pickle.load(in_f)
            # Stream-write this chunk to output
            pickle.dump(data, out_f)
            del data  # ensure chunk gets garbage collected

print(f"✅ Streamed merge completed to: {merged_path}")


Merging (streamed): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 194/194 [6:09:52<00:00, 114.39s/it]

✅ Streamed merge completed to: /media/onepaw/seagate_manual/df_massspecgym_graph_data_streamed.pkl





In [7]:
#Preprocess ion mode, precursor m/z, and adducts
#Full Processing of df_external
import time
import pickle

start_time = time.time()

# Compute ion_mode
df_external['ion_mode'] = df_external['adduct'].parallel_apply(
    lambda x: 0 if '+' in str(x) else 1 if '-' in str(x) else 0
).fillna(0)

# Compute precursor_bin
df_external['precursor_bin'] = pd.qcut(
    df_external['precursor_mz'], q=100, labels=False, duplicates='drop'
)

# Map adduct to index
df_external['adduct_idx'] = df_external['adduct'].map(adduct_to_idx)

# Generate binned and graph_data columns
df_external[['binned', 'graph_data']] = df_external.parallel_apply(
    lambda row: pd.Series(bin_spectrum_to_graph(
        row['mzs'], row['intensities'], row['ion_mode'],
        row['precursor_mz'], row['adduct']
    )),
    axis=1
)

# 🔒 Save graph_data separately (optional)
with open("df_external_graph_data.pkl", "wb") as f:
    pickle.dump(df_external['graph_data'].tolist(), f)

# ❌ Drop graph_data column before saving to parquet
df_external.drop(columns=['graph_data'], inplace=True)

# ✅ Save remaining data to Parquet
df_external.to_parquet("df_external_processed.parquet")

print("✅ df_external processed and saved in {:.2f} seconds".format(time.time() - start_time))


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1445), Label(value='0 / 1445'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1445), Label(value='0 / 1445'))), …

✅ df_external processed and saved in 33.56 seconds


In [3]:
#Preprocess ion mode, precursor m/z, and adducts
import pandas as pd
import glob
import pyarrow as pa
import pyarrow.parquet as pq
import time

output_path = "df_massspecgym_processed_full.parquet"
chunk_files = sorted(glob.glob("processed_chunks/df_massspecgym_chunk_*.parquet"))

first_df = pd.read_parquet(chunk_files[0])
first_df['precursor_bin'] = first_df['precursor_bin'].fillna(-1).astype('int64')

table = pa.Table.from_pandas(first_df)
writer = pq.ParquetWriter(output_path, table.schema)

writer.write_table(table)
print(f"✅ Wrote chunk 0 / {len(chunk_files)}")

for i, file in enumerate(chunk_files[1:], start=1):
    start = time.time()
    df = pd.read_parquet(file)

    df['precursor_bin'] = df['precursor_bin'].fillna(-1).astype('int64')
    df = df[first_df.columns]

    table = pa.Table.from_pandas(df)
    writer.write_table(table)
    print(f"✅ Wrote chunk {i} / {len(chunk_files)} in {time.time() - start:.2f}s")

writer.close()
print(f"🎉 Done merging {len(chunk_files)} chunks ➜ {output_path}")


✅ Wrote chunk 0 / 194
✅ Wrote chunk 1 / 194 in 6.07s
✅ Wrote chunk 2 / 194 in 6.17s
✅ Wrote chunk 3 / 194 in 6.10s
✅ Wrote chunk 4 / 194 in 6.05s
✅ Wrote chunk 5 / 194 in 6.10s
✅ Wrote chunk 6 / 194 in 5.94s
✅ Wrote chunk 7 / 194 in 5.91s
✅ Wrote chunk 8 / 194 in 6.04s
✅ Wrote chunk 9 / 194 in 5.93s
✅ Wrote chunk 10 / 194 in 5.93s
✅ Wrote chunk 11 / 194 in 6.01s
✅ Wrote chunk 12 / 194 in 6.34s
✅ Wrote chunk 13 / 194 in 7.31s
✅ Wrote chunk 14 / 194 in 6.02s
✅ Wrote chunk 15 / 194 in 6.03s
✅ Wrote chunk 16 / 194 in 5.95s
✅ Wrote chunk 17 / 194 in 5.95s
✅ Wrote chunk 18 / 194 in 6.70s
✅ Wrote chunk 19 / 194 in 6.01s
✅ Wrote chunk 20 / 194 in 6.07s
✅ Wrote chunk 21 / 194 in 5.99s
✅ Wrote chunk 22 / 194 in 5.92s
✅ Wrote chunk 23 / 194 in 6.16s
✅ Wrote chunk 24 / 194 in 5.94s
✅ Wrote chunk 25 / 194 in 5.85s
✅ Wrote chunk 26 / 194 in 5.92s
✅ Wrote chunk 27 / 194 in 5.90s
✅ Wrote chunk 28 / 194 in 5.87s
✅ Wrote chunk 29 / 194 in 7.08s
✅ Wrote chunk 30 / 194 in 5.94s
✅ Wrote chunk 31 / 194 in 5

In [3]:
# SMILES Tokenization with Stereochemistry (Final Corrected Version)
import pyarrow.parquet as pq
import time
import re

# Special tokens
PAD_TOKEN = "<PAD>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"
MASK_TOKEN = "[MASK]"

# ✅ FIX: Define a proper RegEx tokenizer for SMILES that handles multi-character elements
SMILES_TOKENIZER_PATTERN =  r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
smiles_regex = re.compile(SMILES_TOKENIZER_PATTERN)

def smiles_tokenizer(smiles):
    """Tokenize a SMILES string using the regular expression."""
    return [token for token in smiles_regex.findall(smiles)]

# Open the large Parquet file
parquet_file = pq.ParquetFile("df_massspecgym_processed_full.parquet")

start = time.time()
# Step 1: Build vocabulary from all SMILES *tokens*
all_tokens = set()
for i in range(parquet_file.num_row_groups):
    table = parquet_file.read_row_group(i, columns=["smiles"])
    df = table.to_pandas()
    for smiles in df['smiles'].dropna().astype(str):
        all_tokens.update(smiles_tokenizer(smiles))

# ✅ FIX: Create the vocabulary from the tokenized list
special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, MASK_TOKEN]
tokens = special_tokens + sorted(list(all_tokens))

# This now creates a correct, contiguous mapping from 0 to vocab_size-1
token_to_idx = {tok: i for i, tok in enumerate(tokens)}
idx_to_token = {i: tok for tok, i in token_to_idx.items()}
vocab_size = len(tokens)

# Determine supervised max length (based on number of TOKENS, not characters)
SUPERVISED_MAX_LEN = 0
for i in range(parquet_file.num_row_groups):
    table = parquet_file.read_row_group(i, columns=["smiles"])
    df = table.to_pandas()
    if not df.empty and 'smiles' in df.columns and not df['smiles'].dropna().empty:
        max_len = max(len(smiles_tokenizer(s)) for s in df['smiles'].dropna().astype(str))
        SUPERVISED_MAX_LEN = max(SUPERVISED_MAX_LEN, max_len + 2) # +2 for SOS and EOS

PRETRAIN_MAX_LEN = 100

print(f"✅ Vocabulary size: {vocab_size}, Supervised MAX_LEN: {SUPERVISED_MAX_LEN}, Pretrain MAX_LEN: {PRETRAIN_MAX_LEN}")
print("Sample of token_to_idx to verify 'Cl' exists:", {k: v for k, v in token_to_idx.items() if 'Cl' in k or 'Br' in k})
print(f"Completed in {time.time() - start:.2f}s")


# Step 2: Define the NEW encoder function using the tokenizer
def encode_smiles(smiles, max_len=PRETRAIN_MAX_LEN):
    tokenized_smiles = smiles_tokenizer(smiles)
    tokens_with_specials = [SOS_TOKEN] + tokenized_smiles[:max_len - 2] + [EOS_TOKEN]
    
    # Use .get() with a default for any unknown tokens (though unlikely with regex)
    token_ids = [token_to_idx.get(tok, token_to_idx[PAD_TOKEN]) for tok in tokens_with_specials]
    
    # Padding
    if len(token_ids) < max_len:
        token_ids += [token_to_idx[PAD_TOKEN]] * (max_len - len(token_ids))
    
    return token_ids[:max_len]

✅ Vocabulary size: 78, Supervised MAX_LEN: 148, Pretrain MAX_LEN: 100
Sample of token_to_idx to verify 'Cl' exists: {'Br': 25, 'Cl': 27, '[79Br]': 34}
Completed in 392.26s


In [3]:
# Precompute Morgan fingerprints
import pandas as pd
import pyarrow.parquet as pq
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from joblib import Parallel, delayed
import pickle
import time

start = time.time()

# Load df_massspecgym from large Parquet file (only SMILES column)
massspec_parquet = pq.ParquetFile("df_massspecgym_processed_full.parquet")
df_massspecgym = pd.concat([
    massspec_parquet.read_row_group(i, columns=["smiles"]).to_pandas()
    for i in range(massspec_parquet.num_row_groups)
], ignore_index=True)

# Load df_external from smaller Parquet file
df_external = pd.read_parquet("df_external_processed.parquet")

# Combine and deduplicate SMILES
all_smiles = list(set(df_massspecgym['smiles'].dropna().tolist() + df_external['smiles'].dropna().tolist()))

# Function that avoids unpicklable generator
def fingerprint_one(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
            return smiles, generator.GetFingerprint(mol)
    except Exception as e:
        print(f"Failed: {smiles} → {e}")
    return smiles, None

# Run parallel computation with 12 CPUs
results = Parallel(n_jobs=12, verbose=5)(
    delayed(fingerprint_one)(s) for s in all_smiles
)

# Collect into dictionary
all_fingerprints = {s: fp for s, fp in results if fp is not None}

# Save to file
with open("all_morgan_fingerprints.pkl", "wb") as f:
    pickle.dump(all_fingerprints, f)

print(f"✅ Done in {time.time() - start:.2f}s — {len(all_fingerprints)} fingerprints saved to all_morgan_fingerprints.pkl")


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  93 tasks      | elapsed:    0.5s
[Parallel(n_jobs=12)]: Done 1320 tasks      | elapsed:    0.9s
[Parallel(n_jobs=12)]: Done 49128 tasks      | elapsed:    2.5s
[Parallel(n_jobs=12)]: Done 215016 tasks      | elapsed:    7.7s
[Parallel(n_jobs=12)]: Done 417768 tasks      | elapsed:   13.6s
[Parallel(n_jobs=12)]: Done 657384 tasks      | elapsed:   20.5s
[Parallel(n_jobs=12)]: Done 933864 tasks      | elapsed:   28.4s
[Parallel(n_jobs=12)]: Done 1247208 tasks      | elapsed:   37.5s
[Parallel(n_jobs=12)]: Done 1597416 tasks      | elapsed:   47.1s
[Parallel(n_jobs=12)]: Done 1984488 tasks      | elapsed:   57.6s
[Parallel(n_jobs=12)]: Done 2408424 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done 2869224 tasks      | elapsed:  1.4min
[Parallel(n_jobs=12)]: Done 3366888 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done 3901416 tasks      | elapsed:  1.9min
[Parallel(n_

✅ Done in 667.19s — 19320594 fingerprints saved to all_morgan_fingerprints.pkl


In [17]:
# MSMSDataset Class (Final Robust Version)
import pickle
from torch.utils.data import Dataset
from torch_geometric.data import Data
import pandas as pd
import torch

class MSMSDataset(Dataset):
    def __init__(self, dataframe, graph_data_list, max_len=272):
        self.df = dataframe.reset_index(drop=True)
        self.graph_data_list = graph_data_list
        self.max_len = max_len

        if len(self.df) != len(self.graph_data_list):
            raise ValueError(f"DataFrame length ({len(self.df)}) and graph_data_list length ({len(self.graph_data_list)}) must match.")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        spectrum = torch.tensor(row["binned"], dtype=torch.float)
        graph = self.graph_data_list[idx]

        if not isinstance(graph, Data):
            if isinstance(graph, tuple) and len(graph) > 0 and isinstance(graph[0], Data):
                graph = graph[0]
            else:
                # Create a placeholder for corrupted data
                graph = Data(x=torch.zeros((1, 1), dtype=torch.float), edge_index=torch.empty((2, 0), dtype=torch.long))

        # ✅ FIX #1: Handle potential NaN in precursor_bin
        precursor_bin_val = row["precursor_bin"]
        if pd.isna(precursor_bin_val):
            precursor_bin_val = 0 # Use a safe default
        
        # ✅ FIX #2 (The Critical One): Handle potential NaN in adduct_idx
        adduct_idx_val = row["adduct_idx"]
        if pd.isna(adduct_idx_val):
            adduct_idx_val = 0 # Default to the first adduct ('[M+H]+') if missing
        
        ion_mode = torch.tensor(row["ion_mode"], dtype=torch.long)
        precursor_bin = torch.tensor(int(precursor_bin_val), dtype=torch.long)
        adduct_idx = torch.tensor(int(adduct_idx_val), dtype=torch.long)
        raw_smiles = row["smiles"]
        smiles_tensor = torch.tensor(encode_smiles(raw_smiles, self.max_len), dtype=torch.long)
            
        return (spectrum, graph, smiles_tensor, ion_mode, precursor_bin, adduct_idx, raw_smiles)

In [4]:
#TEST
from torch_geometric.data import Data

for i in range(10):
    _, graph, *_ = train_dataset[i]  # ✅ correct unpacking
    print(f"Index {i}: type={type(graph)}")
    if isinstance(graph, Data):
        print("  ✅ Correct type: torch_geometric.data.Data")
    else:
        print(f"  ❌ Unexpected graph type: {type(graph)}")


NameError: name 'train_dataset' is not defined

In [18]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]


In [19]:
# Transformer Encoder
class SpectrumTransformerEncoder(nn.Module):
    def __init__(self, input_dim=1000, d_model=768, nhead=12, num_layers=8, dim_feedforward=2048, dropout=0.2):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        self.metadata_emb = nn.Linear(2 + 32, 64)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.norm = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model + 64, d_model // 2)
        self.adduct_emb = nn.Embedding(len(adduct_types), 32)

    def forward(self, src, ion_mode_idx, precursor_idx, adduct_idx):
        src = self.input_proj(src).unsqueeze(1)
        adduct_embed = self.adduct_emb(adduct_idx)
        metadata = self.metadata_emb(torch.cat([ion_mode_idx.unsqueeze(-1).float(), precursor_idx.unsqueeze(-1).float(), adduct_embed], dim=-1))
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src).squeeze(1)
        output = self.norm(output)
        output = torch.cat([output, metadata], dim=-1)
        output = self.fc(output)
        return output, self.transformer_encoder.layers[-1].self_attn(src, src, src)[1]

In [20]:
# GNN Encoder (Corrected for Batched Input)
from torch_geometric.nn import global_mean_pool, MessagePassing

class SpectrumGNNEncoder(MessagePassing):
    def __init__(self, d_model=768, hidden_dim=256, num_layers=3, dropout=0.2):
        super().__init__(aggr='mean')
        self.d_model = d_model
        self.num_layers = num_layers
        self.input_proj = nn.Linear(1, hidden_dim)
        self.message_nets = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_layers)])
        self.update_nets = nn.ModuleList([nn.GRUCell(hidden_dim, hidden_dim) for _ in range(num_layers)])
        self.metadata_emb = nn.Linear(2 + 32, hidden_dim)
        self.norm = nn.LayerNorm(hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, d_model // 2)
        self.dropout = nn.Dropout(dropout)
        self.substructure_head = nn.Linear(hidden_dim, 30)
        self.adduct_emb = nn.Embedding(len(adduct_types), 32)
        self.substructures = ['C=O', 'C=C', 'c1ccccc1', 'C#N', 'C(=O)O', 'N=O', 'S=O', 'P=O', 'C#C', 'C-N-C',
                              'C-O-C', 'C-S-C', 'C(=O)N', 'C(=O)S', 'C=C-C', 'c1ccncc1', 'c1cncnc1', 'c1ccoc1',
                              'c1ccsc1', 'C(=O)C', 'N-C-N', 'S-C-S', 'P-C-P', 'C-F', 'C-Cl', 'C-Br', 'C-I', 'N-N',
                              'O-O', 'S-S']

    def forward(self, batch_data, ion_mode_idx, precursor_idx, adduct_idx):
        # ✅ FIX: The input 'batch_data' is ALREADY a Batch object. No need to call Batch.from_data_list().
        x, edge_index, batch = batch_data.x, batch_data.edge_index, batch_data.batch

        adduct_embed = self.adduct_emb(adduct_idx)
        
        metadata_per_graph = self.metadata_emb(torch.cat([
            ion_mode_idx.unsqueeze(-1).float(), 
            precursor_idx.unsqueeze(-1).float(),
            adduct_embed
        ], dim=-1))
        
        # ✅ FIX: Expand metadata from graph-level to node-level
        metadata = metadata_per_graph[batch]

        x = self.input_proj(x)
        h = F.relu(x) # Initial hidden state

        for i in range(self.num_layers):
            # Store layer index for message passing
            self._propagate_layer = i
            m = self.propagate(edge_index, x=h)
            
            # Apply metadata at each update step
            m = m + metadata 
            
            h = self.update_nets[i](m, h)
            h = self.dropout(h)
    
        pooled_x = global_mean_pool(h, batch)
        pooled_x = self.norm(pooled_x)

        substructure_pred = self.substructure_head(pooled_x)
        output = self.output_layer(pooled_x)
        
        return output, substructure_pred, None # Return None for edge_weights for now

    def message(self, x_j):
        layer_idx = getattr(self, '_propagate_layer', 0)
        return self.message_nets[layer_idx](x_j)

In [21]:
# Novel Decoder with Stereochemistry and Substructure Guidance (Corrected Valence Calculation)
class SmilesTransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=8, dim_feedforward=2048, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.norm = nn.LayerNorm(d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        self.valence_rules = {
            'C': 4, 'N': 3, 'O': 2, 'S': 2, 'P': 3, 'F': 1, 'Cl': 1, 'Br': 1, 'I': 1, 'H': 1
        }
        self.substructure_condition = nn.Linear(30, d_model)

    def compute_valence(self, smiles_token_ids, batch_size):
        # ✅ FIX: This function now operates on token IDs, not characters
        valence_counts = torch.zeros(batch_size, device=smiles_token_ids.device)
        
        for t in range(smiles_token_ids.size(1)):
            for i in range(batch_size):
                token_id = smiles_token_ids[i, t].item()
                token_str = idx_to_token.get(token_id)
                
                if token_str in self.valence_rules:
                    # This is a simplified valence calculation
                    valence_counts[i] += self.valence_rules[token_str]
                elif token_str == '=':
                    valence_counts[i] -= 2
                elif token_str == '#':
                    valence_counts[i] -= 3
        
        # A simple penalty for atoms that are likely unsaturated
        return (valence_counts % 2).float()

    def forward(self, tgt, memory, substructure_pred, tgt_mask=None, memory_key_padding_mask=None):
        embedded = self.embedding(tgt) * math.sqrt(self.d_model)
        embedded = self.pos_encoder(embedded)
        substructure_emb = self.substructure_condition(substructure_pred).unsqueeze(1)
        embedded = embedded + substructure_emb
        output = self.transformer_decoder(embedded, memory, tgt_mask, memory_key_padding_mask)
        output = self.norm(output)
        logits = self.output_layer(output)
        valence_penalty = self.compute_valence(tgt, tgt.size(0))
        return logits, valence_penalty

In [22]:
class MSMS2SmilesHybrid(nn.Module):
    def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=8, dim_feedforward=2048, dropout=0.2, fp_size=2048):
        super().__init__()
        self.transformer_encoder = SpectrumTransformerEncoder(input_dim=1000, d_model=d_model, nhead=nhead, num_layers=num_layers, dim_feedforward=dim_feedforward, dropout=dropout)
        self.gnn_encoder = SpectrumGNNEncoder(d_model=d_model, hidden_dim=256, num_layers=3, dropout=dropout)
        self.decoder = SmilesTransformerDecoder(vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout)
        self.combine_layer = nn.Linear(d_model, d_model)
        self.fp_head = nn.Linear(d_model, fp_size)
        self.fp_size = fp_size
        self.log_sigma_smiles = nn.Parameter(torch.zeros(1))
        self.log_sigma_fp = nn.Parameter(torch.zeros(1))
        self.log_sigma_sub = nn.Parameter(torch.zeros(1))

    def generate_square_subsequent_mask(self, tgt_len):
        mask = torch.triu(torch.ones(tgt_len, tgt_len), diagonal=1)
        mask = mask.float().masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, float(0.0))
        return mask.to(next(self.parameters()).device)  # 🔧 Ensure mask is on same device as model

    def forward(self, spectrum, graph_data, tgt, ion_mode_idx, precursor_idx, adduct_idx, tgt_mask=None, memory_key_padding_mask=None):
        trans_output, attn_weights = self.transformer_encoder(spectrum, ion_mode_idx, precursor_idx, adduct_idx)
        gnn_output, substructure_pred, edge_weights = self.gnn_encoder(graph_data, ion_mode_idx, precursor_idx, adduct_idx)
        memory = self.combine_layer(torch.cat([trans_output, gnn_output], dim=-1)).unsqueeze(1)
        smiles_output, valence_penalty = self.decoder(tgt, memory, substructure_pred, tgt_mask, memory_key_padding_mask)
        fp_output = self.fp_head(memory.squeeze(1))
        return smiles_output, fp_output, valence_penalty, attn_weights, edge_weights, substructure_pred

In [23]:
# SSL Pretraining
def ssl_pretrain(model, dataloader, epochs=3, lr=1e-4):
    model.train()
    scaler = GradScaler()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=token_to_idx[PAD_TOKEN])
    for epoch in range(epochs):
        total_loss = 0
        for spectra, graph_data, smiles_tokens, masked_tokens, ion_modes, precursor_bins, adduct_indices, _ in tqdm(dataloader, desc=f"SSL Epoch {epoch+1}/{epochs}"):
            spectra = spectra.to(device)
            ion_modes = ion_modes.to(device)
            precursor_bins = precursor_bins.to(device)
            adduct_indices = adduct_indices.to(device)
            smiles_tokens = smiles_tokens.to(device)
            masked_tokens = masked_tokens.to(device)
            tgt_input = masked_tokens[:, :-1]
            tgt_output = smiles_tokens[:, 1:]
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            optimizer.zero_grad()
            with autocast():
                smiles_output, _, valence_penalty, _, _, _ = model(spectra, graph_data, tgt_input, ion_modes, precursor_bins, adduct_indices, tgt_mask)
                loss = criterion(smiles_output.reshape(-1, vocab_size), tgt_output.reshape(-1)) + 0.1 * valence_penalty.mean()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataloader)
        print(f"SSL Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss
        }, f'ssl_checkpoint_epoch_{epoch+1}.pt')
        print(f"Saved SSL checkpoint: ssl_checkpoint_epoch_{epoch+1}.pt")


In [24]:
# Supervised Training with RL
def supervised_train(model, train_loader, val_loader, epochs=30, lr=1e-4, patience=5):
    model.train()
    scaler = GradScaler()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    smiles_criterion = nn.CrossEntropyLoss(ignore_index=token_to_idx[PAD_TOKEN])
    fp_criterion = nn.BCEWithLogitsLoss()
    mw_criterion = nn.MSELoss()
    sub_criterion = nn.BCEWithLogitsLoss()
    best_val_loss = float('inf')
    no_improve = 0

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for spectra, graph_data, smiles_tokens, ion_modes, precursor_bins, adduct_indices, raw_smiles in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            spectra = spectra.to(device)
            ion_modes = ion_modes.to(device)
            precursor_bins = precursor_bins.to(device)
            adduct_indices = adduct_indices.to(device)
            smiles_tokens = smiles_tokens.to(device)
            tgt_input = smiles_tokens[:, :-1]
            tgt_output = smiles_tokens[:, 1:]
            tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            optimizer.zero_grad()
            with autocast():
                smiles_output, fp_output, valence_penalty, _, _, substructure_pred = model(spectra, graph_data, tgt_input, ion_modes, precursor_bins, adduct_indices, tgt_mask)
                smiles_loss = smiles_criterion(smiles_output.reshape(-1, vocab_size), tgt_output.reshape(-1))
                fp_loss = 0
                mw_loss = 0
                sub_loss = 0
                valid_count = 0
                substructure_targets = torch.zeros(len(raw_smiles), 30, dtype=torch.float, device=device)
                for i, (smiles, fp) in enumerate(zip(raw_smiles, fp_output)):
                    mol = Chem.MolFromSmiles(smiles, sanitize=True)
                    if mol:
                        true_fp = morgan_gen.GetFingerprint(mol)
                        fp_loss += fp_criterion(fp, torch.tensor([int(b) for b in true_fp.ToBitString()], dtype=torch.float, device=device))
                        mw_loss += mw_criterion(torch.tensor(Descriptors.MolWt(mol), dtype=torch.float, device=device), torch.tensor(500.0, dtype=torch.float, device=device))
                        for j, smarts in enumerate(model.gnn_encoder.substructures):
                            if mol.HasSubstructMatch(Chem.MolFromSmarts(smarts)):
                                substructure_targets[i, j] = 1
                        valid_count += 1
                fp_loss = fp_loss / valid_count if valid_count > 0 else torch.tensor(0.0, device=device)
                mw_loss = mw_loss / valid_count if valid_count > 0 else torch.tensor(0.0, device=device)
                sub_loss = sub_criterion(substructure_pred, substructure_targets)
                sigma_smiles = torch.clamp(torch.exp(model.log_sigma_smiles), 0.1, 10.0)
                sigma_fp = torch.clamp(torch.exp(model.log_sigma_fp), 0.1, 10.0)
                sigma_sub = torch.clamp(torch.exp(model.log_sigma_sub), 0.1, 10.0)
                supervised_loss = (smiles_loss / (2 * sigma_smiles**2) + model.log_sigma_smiles) + \
                                 (0.1 * fp_loss / (2 * sigma_fp**2) + model.log_sigma_fp) + \
                                 (0.1 * sub_loss / (2 * sigma_sub**2) + model.log_sigma_sub) + \
                                 0.1 * valence_penalty.mean() + 0.1 * mw_loss
                # RL component: Tanimoto reward
                rl_loss = 0
                if epoch >= 5:  # Start RL after initial training
                    pred_smiles = beam_search(model, spectra[0], graph_data[0], ion_modes[0], precursor_bins[0], adduct_indices[0], raw_smiles[0], beam_width=5, max_len=SUPERVISED_MAX_LEN, device=device)
                    if pred_smiles[0][0] != "Invalid SMILES":
                        tanimoto = tanimoto_similarity(pred_smiles[0][0], raw_smiles[0], all_fingerprints)
                        rl_loss = -torch.log(torch.tensor(tanimoto + 1e-6, device=device))
                loss = supervised_loss + 0.1 * rl_loss
            scaler.scale(loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            total_train_loss += loss.item()
        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for spectra, graph_data, smiles_tokens, ion_modes, precursor_bins, adduct_indices, raw_smiles in val_loader:
                spectra = spectra.to(device)
                ion_modes = ion_modes.to(device)
                precursor_bins = precursor_bins.to(device)
                adduct_indices = adduct_indices.to(device)
                smiles_tokens = smiles_tokens.to(device)
                tgt_input = smiles_tokens[:, :-1]
                tgt_output = smiles_tokens[:, 1:]
                tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
                with autocast():
                    smiles_output, fp_output, valence_penalty, _, _, substructure_pred = model(spectra, graph_data, tgt_input, ion_modes, precursor_bins, adduct_indices, tgt_mask)
                    smiles_loss = smiles_criterion(smiles_output.reshape(-1, vocab_size), tgt_output.reshape(-1))
                    fp_loss = 0
                    mw_loss = 0
                    sub_loss = 0
                    valid_count = 0
                    substructure_targets = torch.zeros(len(raw_smiles), 30, dtype=torch.float, device=device)
                    for i, (smiles, fp) in enumerate(zip(raw_smiles, fp_output)):
                        mol = Chem.MolFromSmiles(smiles, sanitize=True)
                        if mol:
                            true_fp = morgan_gen.GetFingerprint(mol)
                            fp_loss += fp_criterion(fp, torch.tensor([int(b) for b in true_fp.ToBitString()], dtype=torch.float, device=device))
                            mw_loss += mw_criterion(torch.tensor(Descriptors.MolWt(mol), dtype=torch.float, device=device), torch.tensor(500.0, dtype=torch.float, device=device))
                            for j, smarts in enumerate(model.gnn_encoder.substructures):
                                if mol.HasSubstructMatch(Chem.MolFromSmarts(smarts)):
                                    substructure_targets[i, j] = 1
                            valid_count += 1
                    fp_loss = fp_loss / valid_count if valid_count > 0 else torch.tensor(0.0, device=device)
                    mw_loss = mw_loss / valid_count if valid_count > 0 else torch.tensor(0.0, device=device)
                    sub_loss = sub_criterion(substructure_pred, substructure_targets)
                    sigma_smiles = torch.clamp(torch.exp(model.log_sigma_smiles), 0.1, 10.0)
                    sigma_fp = torch.clamp(torch.exp(model.log_sigma_fp), 0.1, 10.0)
                    sigma_sub = torch.clamp(torch.exp(model.log_sigma_sub), 0.1, 10.0)
                    loss = (smiles_loss / (2 * sigma_smiles**2) + model.log_sigma_smiles) + \
                           (0.1 * fp_loss / (2 * sigma_fp**2) + model.log_sigma_fp) + \
                           (0.1 * sub_loss / (2 * sigma_sub**2) + model.log_sigma_sub) + \
                           0.1 * valence_penalty.mean() + 0.1 * mw_loss
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")
        scheduler.step(avg_val_loss)

        if (epoch + 1) % 10 == 0:
            checkpoint_path = f'checkpoint_epoch_{epoch+1}.pt'
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'val_loss': avg_val_loss,
                'token_to_idx': token_to_idx,
                'idx_to_token': idx_to_token
            }, checkpoint_path)
            print(f"Saved checkpoint: {checkpoint_path}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            no_improve = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'token_to_idx': token_to_idx,
                'idx_to_token': idx_to_token
            }, 'best_msms_hybrid.pt')
        else:
            no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    return best_val_loss

In [25]:
# SMILES Syntax Validator
def is_valid_smiles_syntax(smiles):
    stack = []
    for c in smiles:
        if c in '([':
            stack.append(c)
        elif c == ')':
            if not stack or stack[-1] != '(':
                return False
            stack.pop()
        elif c == ']':
            if not stack or stack[-1] != '[':
                return False
            stack.pop()
    if stack:
        return False
    i = 0
    while i < len(smiles):
        if smiles[i] == '[':
            j = smiles.find(']', i)
            if j == -1:
                return False
            atom = smiles[i+1:j]
            if not any(a in atom for a in valid_atoms):
                return False
            i = j + 1
        else:
            if smiles[i] in valid_atoms or smiles[i] in '()=#/\\@.:':
                i += 1
            else:
                return False
    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        return mol is not None
    except:
        return False

In [26]:
# RDKit-based Molecular Property Filter
def is_plausible_molecule(smiles, true_mol, max_mw=1500, min_logp=-7, max_logp=7):
    mol = Chem.MolFromSmiles(smiles, sanitize=True)
    if not mol or not is_valid_smiles_syntax(smiles):
        return False
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    true_mw = Descriptors.MolWt(true_mol) if true_mol else 500
    return mw <= max_mw and min_logp <= logp <= max_logp and abs(mw - true_mw) < 300

# Evaluation Metrics
def dice_similarity(smiles1, smiles2):
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    if mol1 and mol2:
        fp1 = morgan_gen.GetFingerprint(mol1)
        fp2 = morgan_gen.GetFingerprint(mol2)
        return DataStructs.DiceSimilarity(fp1, fp2)
    return 0.0

def mcs_similarity(true_smiles, pred_smiles):
    mol1 = Chem.MolFromSmiles(true_smiles)
    mol2 = Chem.MolFromSmiles(pred_smiles)
    if mol1 and mol2:
        mcs = rdFMCS.FindMCS([mol1, mol2], timeout=30)
        return mcs.numAtoms / max(mol1.GetNumAtoms(), mol2.GetNumAtoms())
    return 0.0

def mw_difference(true_smiles, pred_smiles):
    mol1 = Chem.MolFromSmiles(true_smiles)
    mol2 = Chem.MolFromSmiles(pred_smiles)
    if mol1 and mol2:
        return abs(Descriptors.MolWt(mol1) - Descriptors.MolWt(mol2))
    return float('inf')

def logp_difference(true_smiles, pred_smiles):
    mol1 = Chem.MolFromSmiles(true_smiles)
    mol2 = Chem.MolFromSmiles(pred_smiles)
    if mol1 and mol2:
        return abs(Descriptors.MolLogP(mol1) - Descriptors.MolLogP(mol2))
    return float('inf')

def substructure_match(true_smiles, pred_smiles, substructures):
    mol1 = Chem.MolFromSmiles(true_smiles)
    mol2 = Chem.MolFromSmiles(pred_smiles)
    if not mol1 or not mol2:
        return 0
    matches = 0
    for smarts in substructures:
        pattern = Chem.MolFromSmarts(smarts)
        if mol1.HasSubstructMatch(pattern) and mol2.HasSubstructMatch(pattern):
            matches += 1
    return matches / len(substructures)

def validity_rate(pred_smiles_list):
    valid = sum(1 for smiles in pred_smiles_list if Chem.MolFromSmiles(smiles, sanitize=True) is not None)
    return valid / len(pred_smiles_list) * 100

def tanimoto_similarity(smiles1, smiles2, precomputed_fps=None):
    mol1 = Chem.MolFromSmiles(smiles1, sanitize=True)
    if not mol1:
        return 0.0
    fp1 = morgan_gen.GetFingerprint(mol1)
    if precomputed_fps and smiles2 in precomputed_fps:
        fp2 = precomputed_fps[smiles2]
    else:
        mol2 = Chem.MolFromSmiles(smiles2, sanitize=True)
        if not mol2:
            return 0.0
        fp2 = morgan_gen.GetFingerprint(mol2)
    return DataStructs.TanimotoSimilarity(fp1, fp2)

def prediction_diversity(pred_smiles_list):
    if len(pred_smiles_list) < 2:
        return 0.0
    total_tanimoto = 0
    count = 0
    for i in range(len(pred_smiles_list)):
        for j in range(i+1, len(pred_smiles_list)):
            total_tanimoto += tanimoto_similarity(pred_smiles_list[i], pred_smiles_list[j])
            count += 1
    return 1 - (total_tanimoto / count) if count > 0 else 0.0

In [27]:
# Beam Search with Stereochemistry
def beam_search(model, spectrum, graph_data, ion_mode_idx, precursor_idx, adduct_idx, true_smiles, beam_width=10, max_len=150, nucleus_p=0.9, device='cpu'):
    model.eval()
    true_mol = Chem.MolFromSmiles(true_smiles) if true_smiles else None
    with torch.no_grad():
        spectrum = spectrum.unsqueeze(0).to(device)
        graph_data = Batch.from_data_list([graph_data]).to(device)
        ion_mode_idx = torch.tensor([ion_mode_idx], dtype=torch.long).to(device)
        precursor_idx = torch.tensor([precursor_idx], dtype=torch.long).to(device)
        adduct_idx = torch.tensor([adduct_idx], dtype=torch.long).to(device)
        memory = model.transformer_encoder(spectrum, ion_mode_idx, precursor_idx, adduct_idx)[0]
        gnn_output, substructure_pred, _ = model.gnn_encoder(graph_data, ion_mode_idx, precursor_idx, adduct_idx)
        memory = model.combine_layer(torch.cat([memory, gnn_output], dim=-1)).unsqueeze(1)
        sequences = [([token_to_idx[SOS_TOKEN]], 0.0)]

        for _ in range(max_len):
            all_candidates = []
            for seq, score in sequences:
                if seq[-1] == token_to_idx[EOS_TOKEN]:
                    all_candidates.append((seq, score))
                    continue
                partial_smiles = ''.join([idx_to_token.get(idx, '') for idx in seq[1:]])
                if not is_valid_smiles_syntax(partial_smiles):
                    continue
                tgt_input = torch.tensor([seq], dtype=torch.long).to(device)
                tgt_mask = model.generate_square_subsequent_mask(len(seq)).to(device)
                outputs, valence_penalty = model.decoder(tgt_input, memory, substructure_pred, tgt_mask)
                log_probs = F.log_softmax(outputs[0, -1], dim=-1).cpu().numpy() - 0.1 * valence_penalty.cpu().numpy()
                # Boost stereochemistry tokens
                for tok in ['@', '/']:
                    if tok in token_to_idx:
                        log_probs[token_to_idx[tok]] += 0.5
                sorted_probs = np.sort(np.exp(log_probs))[::-1]
                cumulative_probs = np.cumsum(sorted_probs)
                cutoff_idx = np.searchsorted(cumulative_probs, nucleus_p)
                top_tokens = np.argsort(log_probs)[-cutoff_idx:] if cutoff_idx > 0 else np.argsort(log_probs)[-1:]
                top_probs = np.exp(log_probs[top_tokens]) / np.sum(np.exp(log_probs[top_tokens]))
                for tok in np.random.choice(top_tokens, size=min(beam_width, len(top_tokens)), p=top_probs):
                    new_smiles = partial_smiles + idx_to_token.get(int(tok), '')
                    if is_valid_smiles_syntax(new_smiles):
                        diversity_penalty = 0.2 * sum(1 for s, _ in sequences if tok in s[1:-1])
                        all_candidates.append((seq + [int(tok)], score + log_probs[tok] - diversity_penalty))
            sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]
            if all(seq[-1] == token_to_idx[EOS_TOKEN] for seq, _ in sequences):
                break

        results = []
        for seq, score in sequences:
            smiles = ''.join([idx_to_token.get(idx, '') for idx in seq[1:-1]])
            try:
                mol = Chem.MolFromSmiles(smiles, sanitize=True)
                if mol and is_plausible_molecule(smiles, true_mol):
                    smiles = Chem.MolToSmiles(mol, canonical=True, doRandom=True)
                    confidence = np.exp(score / len(seq))
                    results.append((smiles, confidence))
            except:
                continue
        return results if results else [("Invalid SMILES", 0.0)]


In [28]:
# Visualization Functions
def plot_attention_weights(attn_weights, title="Transformer Attention Weights"):
    plt.figure(figsize=(10, 8))
    plt.imshow(attn_weights.squeeze().cpu().numpy(), cmap='viridis')
    plt.colorbar()
    plt.title(title)
    plt.xlabel("Key Tokens")
    plt.ylabel("Query Tokens")
    plt.show()

def plot_gnn_edge_weights(edge_weights, edge_index, title="GNN Edge Importance"):
    edge_scores = edge_weights[-1].cpu().numpy()
    plt.figure(figsize=(10, 8))
    plt.hist(edge_scores, bins=50)
    plt.title(title)
    plt.xlabel("Edge Weight Magnitude")
    plt.ylabel("Frequency")
    plt.show()

# Error Analysis
def error_analysis(pred_smiles_list, true_smiles_list, adducts, precomputed_fps):
    errors = {'small': 0, 'large': 0, 'aromatic': 0, 'aliphatic': 0}
    adduct_errors = {adduct: [] for adduct in adduct_types}
    for pred_smiles, true_smiles, adduct in zip(pred_smiles_list, true_smiles_list, adducts):
        tanimoto = tanimoto_similarity(pred_smiles, true_smiles, precomputed_fps)
        if tanimoto < 0.3:
            mol = Chem.MolFromSmiles(true_smiles)
            if mol:
                mw = Descriptors.MolWt(mol)
                is_aromatic = any(atom.GetIsAromatic() for atom in mol.GetAtoms())
                errors['small' if mw < 300 else 'large'] += 1
                errors['aromatic' if is_aromatic else 'aliphatic'] += 1
                adduct_errors[adduct].append(tanimoto)
    print("Error Analysis:")
    print(f"Small molecules (<300 Da) errors: {errors['small']}")
    print(f"Large molecules (≥300 Da) errors: {errors['large']}")
    print(f"Aromatic molecule errors: {errors['aromatic']}")
    print(f"Aliphatic molecule errors: {errors['aliphatic']}")
    for adduct, scores in adduct_errors.items():
        if scores:
            print(f"Adduct {adduct} - Avg Tanimoto: {np.mean(scores):.4f}, Count: {len(scores)}")

In [29]:
# Hyperparameter Tuning
def objective(trial, train_data, val_data):
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    train_dataset = MSMSDataset(train_data, max_len=SUPERVISED_MAX_LEN, is_ssl=False)
    val_dataset = MSMSDataset(val_data, max_len=SUPERVISED_MAX_LEN, is_ssl=False)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=32, num_workers=2)
    model = MSMS2SmilesHybrid(vocab_size=vocab_size, d_model=768, nhead=12, num_layers=8, dim_feedforward=2048, dropout=0.2, fp_size=2048).to(device)
    return supervised_train(model, train_loader, val_loader, epochs=10, lr=lr)

In [None]:
# Cross-Validation and Training (No changes needed here)
import torch
import pickle
import numpy as np
import pandas as pd
import time
from torch_geometric.loader import DataLoader
from sklearn.model_selection import KFold
from tqdm import tqdm
import glob
import gc
import os

# --- Setup ---
print("--- Initializing Setup ---")
df_massspecgym_sample = pd.read_parquet("df_massspecgym.parquet", columns=["adduct"])
adduct_types = df_massspecgym_sample['adduct'].unique()
del df_massspecgym_sample
print(f"Adduct types loaded successfully. Count: {len(adduct_types)}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Get chunk files and set up K-Fold ---
print("--- Setting up K-Fold Cross-Validation ---")
processed_chunk_dir = "processed_chunks"
graph_chunk_dir = "/media/onepaw/seagate_manual/graph_data_chunks"
processed_files = np.array(sorted(glob.glob(os.path.join(processed_chunk_dir, "*.parquet"))))
graph_files = np.array(sorted(glob.glob(os.path.join(graph_chunk_dir, "*.pkl"))))
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []
print(f"Found {len(processed_files)} chunk file pairs for 5-fold cross-validation.")

# --- Outer Loop: Folds ---
for fold_idx, (train_indices, val_indices) in enumerate(kf.split(processed_files)):
    print(f"\n{'='*20} FOLD {fold_idx + 1}/5 {'='*20}")
    train_proc_files, train_graph_files = processed_files[train_indices], graph_files[train_indices]
    val_proc_files, val_graph_files = processed_files[val_indices], graph_files[val_indices]

    # --- Initialize Model and Training Components ---
    print(f"Initializing model for Fold {fold_idx + 1}...")
    learning_rate = 1e-4
    model = MSMS2SmilesHybrid(vocab_size=vocab_size, d_model=768, nhead=12, num_layers=8, dim_feedforward=2048, dropout=0.2, fp_size=2048).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    scaler = torch.cuda.amp.GradScaler()
    smiles_criterion = torch.nn.CrossEntropyLoss(ignore_index=token_to_idx[PAD_TOKEN])
    best_val_loss, epochs_no_improve, patience = float('inf'), 0, 5

    # --- Inner Loop: Epochs ---
    for epoch in range(30):
        print(f"\n--- Epoch {epoch + 1}/30 ---")
        model.train()
        total_train_loss, train_batches = 0, 0
        for proc_file, graph_file in tqdm(zip(train_proc_files, train_graph_files), total=len(train_proc_files), desc=f"Epoch {epoch+1} Training"):
            try:
                df_chunk = pd.read_parquet(proc_file)
                with open(graph_file, 'rb') as f:
                    graph_data_chunk = pickle.load(f)
                train_dataset = MSMSDataset(df_chunk, graph_data_chunk, max_len=SUPERVISED_MAX_LEN)
                train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

                for spectra, graph_data, smiles_tokens, ion_modes, precursor_bins, adduct_indices, _ in train_loader:
                    graph_data = graph_data.to(device)
                    spectra, smiles_tokens = spectra.to(device), smiles_tokens.to(device)
                    ion_modes, precursor_bins, adduct_indices = ion_modes.to(device), precursor_bins.to(device), adduct_indices.to(device)
                    tgt_input, tgt_output = smiles_tokens[:, :-1], smiles_tokens[:, 1:]
                    tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1))

                    optimizer.zero_grad(set_to_none=True)
                    with torch.cuda.amp.autocast():
                        smiles_output, _, valence_penalty, _, _, _ = model(spectra, graph_data, tgt_input, ion_modes, precursor_bins, adduct_indices, tgt_mask)
                        loss = smiles_criterion(smiles_output.reshape(-1, vocab_size), tgt_output.reshape(-1)) + 0.1 * valence_penalty.mean()

                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    total_train_loss += loss.item()
                    train_batches += 1
                del df_chunk, graph_data_chunk, train_dataset, train_loader; gc.collect()
            except Exception as e:
                print(f"ERROR during training on {os.path.basename(proc_file)}: {e}")
                import traceback
                traceback.print_exc()
                continue
        
        avg_train_loss = total_train_loss / train_batches if train_batches > 0 else float('nan')

        # --- VALIDATION PHASE ---
        model.eval()
        total_val_loss, val_batches = 0, 0
        with torch.no_grad():
            for proc_file, graph_file in tqdm(zip(val_proc_files, val_graph_files), total=len(val_proc_files), desc=f"Epoch {epoch+1} Validation"):
                try:
                    df_chunk = pd.read_parquet(proc_file)
                    with open(graph_file, 'rb') as f:
                        graph_data_chunk = pickle.load(f)
                    val_dataset = MSMSDataset(df_chunk, graph_data_chunk, max_len=SUPERVISED_MAX_LEN)
                    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
                    for spectra, graph_data, smiles_tokens, ion_modes, precursor_bins, adduct_indices, _ in val_loader:
                        graph_data, spectra, smiles_tokens = graph_data.to(device), spectra.to(device), smiles_tokens.to(device)
                        ion_modes, precursor_bins, adduct_indices = ion_modes.to(device), precursor_bins.to(device), adduct_indices.to(device)
                        tgt_input, tgt_output = smiles_tokens[:, :-1], smiles_tokens[:, 1:]
                        tgt_mask = model.generate_square_subsequent_mask(tgt_input.size(1))
                        with torch.cuda.amp.autocast():
                            smiles_output, _, valence_penalty, _, _, _ = model(spectra, graph_data, tgt_input, ion_modes, precursor_bins, adduct_indices, tgt_mask)
                            loss = smiles_criterion(smiles_output.reshape(-1, vocab_size), tgt_output.reshape(-1)) + 0.1 * valence_penalty.mean()
                        total_val_loss += loss.item()
                        val_batches += 1
                    del df_chunk, graph_data_chunk, val_dataset, val_loader; gc.collect()
                except Exception as e:
                    print(f"ERROR during validation on {os.path.basename(proc_file)}: {e}")
                    continue
        
        avg_val_loss = total_val_loss / val_batches if val_batches > 0 else float('nan')
        scheduler.step(avg_val_loss)

        print(f"Epoch {epoch+1} Summary | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | LR: {optimizer.param_groups[0]['lr']:.6f}")
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            print(f"✨ New best validation loss: {best_val_loss:.4f}. Saving model for Fold {fold_idx + 1}...")
            torch.save({'model_state_dict': model.state_dict(), 'token_to_idx': token_to_idx, 'idx_to_token': idx_to_token}, f"best_model_fold_{fold_idx+1}.pt")
        else:
            epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"🛑 Early stopping triggered after {epoch + 1} epochs.")
            break
    
    fold_results.append(best_val_loss)
    print("--- COMPLETED ONE FOLD. Breaking for now. ---")
    break

print(f"\n{'='*20} TRAINING COMPLETE {'='*20}")
if fold_results:
    print(f"✅ Cross-validation results (best val loss per fold): {fold_results}")
    print(f"📊 Average best validation loss: {np.mean(fold_results):.4f}")

--- Initializing Setup ---
Adduct types loaded successfully. Count: 2
Using device: cuda
--- Setting up K-Fold Cross-Validation ---
Found 194 chunk file pairs for 5-fold cross-validation.

Initializing model for Fold 1...


  scaler = torch.cuda.amp.GradScaler()



--- Epoch 1/30 ---


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


In [None]:


# External Dataset Evaluation
model.eval()
external_metrics = {'tanimoto': [], 'dice': [], 'mcs': [], 'mw_diff': [], 'logp_diff': [], 'substructure': []}
pred_smiles_list = []
true_smiles_list = []
adducts_list = []
num_samples = min(5, len(external_dataset))

for sample_idx in range(num_samples):
    sample_spectrum = external_dataset[sample_idx][0]
    sample_graph = external_dataset[sample_idx][1]
    sample_ion_mode = external_dataset[sample_idx][3]
    sample_precursor_bin = external_dataset[sample_idx][4]
    sample_adduct_idx = external_dataset[sample_idx][5]
    true_smiles = external_dataset[sample_idx][6]

    predicted_results = beam_search(model, sample_spectrum, sample_graph, sample_ion_mode, sample_precursor_bin, sample_adduct_idx, true_smiles, beam_width=10, max_len=SUPERVISED_MAX_LEN, device=device)
    pred_smiles_list.extend([smiles for smiles, _ in predicted_results])
    true_smiles_list.extend([true_smiles] * len(predicted_results))
    adducts_list.extend([df_external.iloc[sample_idx]['adduct']] * len(predicted_results))

    print(f"\nExternal Sample {sample_idx} - True SMILES: {true_smiles}")
    print("Top Predicted SMILES:")
    for smiles, confidence in predicted_results[:3]:
        external_metrics['tanimoto'].append(tanimoto_similarity(smiles, true_smiles, all_fingerprints))
        external_metrics['dice'].append(dice_similarity(smiles, true_smiles))
        external_metrics['mcs'].append(mcs_similarity(smiles, true_smiles))
        external_metrics['mw_diff'].append(mw_difference(smiles, true_smiles))
        external_metrics['logp_diff'].append(logp_difference(smiles, true_smiles))
        external_metrics['substructure'].append(substructure_match(smiles, true_smiles, model.gnn_encoder.substructures))
        print(f"SMILES: {smiles}, Confidence: {confidence:.4f}, Tanimoto: {external_metrics['tanimoto'][-1]:.4f}, Dice: {external_metrics['dice'][-1]:.4f}, MCS: {external_metrics['mcs'][-1]:.4f}")
        if len(smiles) > 100 and smiles.count('C') > len(smiles) * 0.8:
            print("Warning: Predicted SMILES is a long carbon chain, indicating potential model underfitting.")
        if smiles != "Invalid SMILES":
            mol = Chem.MolFromSmiles(smiles, sanitize=True)
            if mol:
                print(f"Molecular Weight: {Descriptors.MolWt(mol):.2f}, LogP: {Descriptors.MolLogP(mol):.2f}")

    # Visualize molecules
    if predicted_results[0][0] != "Invalid SMILES":
        pred_mol = Chem.MolFromSmiles(predicted_results[0][0], sanitize=True)
        true_mol = Chem.MolFromSmiles(true_smiles, sanitize=True)
        if pred_mol and true_mol:
            img = Draw.MolsToGridImage([true_mol, pred_mol], molsPerRow=2, subImgSize=(300, 300), legends=['True', 'Predicted'])
            img_array = np.array(img.convert('RGB'))
            plt.figure(figsize=(10, 5))
            plt.imshow(img_array)
            plt.axis('off')
            plt.title(f"External Sample {sample_idx} - Tanimoto: {external_metrics['tanimoto'][0]:.4f}")
            plt.show()

    # Visualize attention and GNN weights for first sample
    if sample_idx == 0:
        with torch.no_grad():
            spectrum = sample_spectrum.unsqueeze(0).to(device)
            graph_data = Batch.from_data_list([sample_graph]).to(device)
            ion_mode_idx = torch.tensor([sample_ion_mode], dtype=torch.long).to(device)
            precursor_idx = torch.tensor([sample_precursor_bin], dtype=torch.long).to(device)
            adduct_idx = torch.tensor([sample_adduct_idx], dtype=torch.long).to(device)
            _, attn_weights = model.transformer_encoder(spectrum, ion_mode_idx, precursor_idx, adduct_idx)
            _, _, edge_weights = model.gnn_encoder(graph_data, ion_mode_idx, precursor_idx, adduct_idx)
            plot_attention_weights(attn_weights, title=f"External Fold Transformer Attention Weights")
            plot_gnn_edge_weights(edge_weights, sample_graph.edge_index, title=f"External Fold GNN Edge Importance")

# Final Evaluation
print(f"External Validity Rate: {validity_rate(pred_smiles_list):.2f}%")
print(f"External Prediction Diversity: {prediction_diversity(pred_smiles_list):.4f}")
print("External Metrics Summary:")
print(f"Avg Tanimoto: {np.mean(external_metrics['tanimoto']):.4f}")
print(f"Avg Dice: {np.mean(external_metrics['dice']):.4f}")
print(f"Avg MCS: {np.mean(external_metrics['mcs']):.4f}")
print(f"Avg MW Difference: {np.mean([x for x in external_metrics['mw_diff'] if x != float('inf')]):.2f}")
print(f"Avg LogP Difference: {np.mean([x for x in external_metrics['logp_diff'] if x != float('inf')]):.2f}")
print(f"Avg Substructure Match: {np.mean(external_metrics['substructure']):.4f}")
error_analysis(pred_smiles_list, true_smiles_list, adducts_list, all_fingerprints)
