## Load your libraries (LYL)

In [4]:
import sys
sys.path.append('../src')

import dataset, molecular_representation, config, utils, model
from dataset import QM9Dataset, LogSDataset, LogPDataset, FreeSolvDataset, ESOLDataset
import numpy as np
import pandas as pd
from utils import *
import torch
from tqdm import tqdm
from model import Autoencoder # Simply importing the autoencoder model module from the model.py file
from model import GNN3D

import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage

import os

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cpu


## Split your dataset (SYD)

In [None]:
# Read the full LogS table from disk.
# NOTE(review): this rebinds the name `dataset`, which the import cell bound to
# the project's `dataset` module; the classes imported from it stay usable.
dataset = pd.read_csv("../data/logs/logs.csv")

# Partition into train/test; 0.9 is presumably the train fraction - confirm
# against the project's split_dataset helper.
train_dataset, test_dataset = split_dataset(dataset, 0.9)

# Persist each partition next to the source CSV, dropping the index column
for split_frame, split_path in (
    (train_dataset, "../data/logs/train.csv"),
    (test_dataset, "../data/logs/test.csv"),
):
    split_frame.to_csv(split_path, index=False)

print("Train and test datasets saved successfully.")

## Process your data (PYD)

In [None]:
# Build the training split and show its summary, followed by a divider line
train_samples = LogSDataset("../data/logs/train")
print(train_samples, "===================================", sep="\n")

# Build the test split and show its summary
test_samples = LogSDataset("../data/logs/test")
print(test_samples)

## Know your features (KYF)

In [None]:
# Print the shape of every entry of the first training sample, with a short
# description of what each entry is. The sample is fetched once instead of
# re-indexing the dataset for every line.
first_sample = train_samples[0]

# (label, description) pairs, listed in the order the entries appear in a sample
sample_fields = [
    ("Atomic Features", "atomic features"),
    ("Bond Features", "bond features"),
    ("Angle Features", "angle features"),
    ("Dihedral Features", "dihedral features"),
    ("Global Molecular Features", "global molecular features"),
    ("Bond Indices", "bond indices"),
    ("Angle Indices", "angle indices"),
    ("Dihedral Indices", "dihedral indices"),
    ("Target", "target"),
]

for field_i, (label, description) in enumerate(sample_fields):
    print(f"{label}: {first_sample[field_i].shape} - This represents the {description} of the molecule")

## Know your modules (KYM)

In [None]:
def _pretrain_autoencoders(train_samples, epochs=10):
    """Fit the atom and bond autoencoders on every molecule in ``train_samples``.

    Each molecule is one optimisation step (no batching). After every epoch the
    running mean of the per-molecule reconstruction RMSE is reported through
    ``tqdm.write``.

    Args:
        train_samples: Indexable collection of molecules; entry 0 holds the atom
            features and entry 1 the bond features.
        epochs: Number of passes over ``train_samples``.

    Returns:
        Tuple ``(atom_autoencoder, bond_autoencoder)``, both moved to ``device``.
    """
    # Autoencoder sizes (presumably input size, latent size) must be changed for a new dataset
    atom_autoencoder = Autoencoder(154, 10).to(device)
    bond_autoencoder = Autoencoder(10, 3).to(device)
    mse_loss_fn = torch.nn.MSELoss()
    atom_optimizer = torch.optim.Adam(atom_autoencoder.parameters())
    bond_optimizer = torch.optim.Adam(bond_autoencoder.parameters())

    for epoch_i in range(epochs):
        avg_atom_rmse_loss = 0
        avg_bond_rmse_loss = 0

        for sample_i, molecule in enumerate(train_samples):
            atom_features = molecule[0].to(device)
            bond_features = molecule[1].to(device)

            # Forward pass
            atom_features_reconstructed = atom_autoencoder(atom_features)
            bond_features_reconstructed = bond_autoencoder(bond_features)

            # Reconstruction losses
            atom_loss = mse_loss_fn(atom_features_reconstructed, atom_features)
            bond_loss = mse_loss_fn(bond_features_reconstructed, bond_features)

            # Backward pass and optimization step
            atom_optimizer.zero_grad()
            bond_optimizer.zero_grad()
            atom_loss.backward()
            bond_loss.backward()
            atom_optimizer.step()
            bond_optimizer.step()

            # Incremental mean of the per-molecule RMSE over this epoch
            avg_atom_rmse_loss = (avg_atom_rmse_loss * sample_i + (atom_loss.item() ** 0.5)) / (sample_i + 1)
            avg_bond_rmse_loss = (avg_bond_rmse_loss * sample_i + (bond_loss.item() ** 0.5)) / (sample_i + 1)

        tqdm.write(f"Autoencoder epoch: {epoch_i + 1:>2}/{epochs} | Avg. atom RMSE: {avg_atom_rmse_loss:.4f} | Avg. bond RMSE: {avg_bond_rmse_loss:.4f}")

    return atom_autoencoder, bond_autoencoder


def train_gnn3d_vary_dataset_size(train_samples, min_samples=10, max_samples=45, divisions=3, save_dir="./models/vary_train_size/", dataset_name="logs"):
    """Train GNN3D for varying training set size.

    The atom and bond autoencoders are pretrained once on all of
    ``train_samples``. Then ``divisions + 1`` fresh GNN3D models are trained,
    each with a single pass over the first ``current_size`` molecules, where
    ``current_size = min_samples + step * division_index`` and
    ``step = int((max_samples - min_samples) / divisions) + 1``.

    NOTE(review): because the step is rounded up, the last size can exceed
    ``max_samples`` (the defaults give 10, 22, 34, 46). A size larger than
    ``len(train_samples)`` trains on every available molecule but keeps the
    nominal size in its dictionary key and checkpoint name.

    Args:
        train_samples: Indexable, sized collection of molecules. Entries 0-7 of
            a molecule are the model inputs and entry 8 is the target.
        min_samples: Training-set size of the first division.
        max_samples: Upper end of the range used to derive the step size.
        divisions: Number of steps; ``divisions + 1`` models are trained.
        save_dir: Directory for the model checkpoints (created if missing).
        dataset_name: Tag embedded in the checkpoint file names.

    Returns:
        Dict mapping ``"d<current_size>"`` to ``{"avg_rmse": [...], "avg_mse": [...]}``,
        the running-average losses recorded after every training step.
    """
    # Create the directory if it does not exist
    os.makedirs(save_dir, exist_ok=True)

    # Pretrain the encoders once and reuse them for every division; they were
    # previously re-created untrained inside the division loop, which threw the
    # pretraining away and fed the GNN randomly initialised encodings.
    atom_autoencoder, bond_autoencoder = _pretrain_autoencoders(train_samples)

    # Determine step size for each division
    sample_step_size = int((max_samples - min_samples) / divisions) + 1

    # Dictionary to store RMSE and MSE for each division
    gnn_rmse_dict = {}
    mse_loss_fn = torch.nn.MSELoss()

    # Loop through divisions
    for division_i in range(divisions + 1):
        # Fresh GNN and optimizer so divisions do not share learned weights
        gnn3d = GNN3D(atomic_vector_size=10, bond_vector_size=3, number_of_molecular_features=200, number_of_targets=1).to(device)
        gnn_optimizer = torch.optim.Adam(gnn3d.parameters())

        # Average losses initialization
        avg_rmse = 0
        avg_mse = 0

        # Current (nominal) size of the training set for this division
        current_size = min_samples + sample_step_size * division_i
        division_key = "d" + str(current_size)
        gnn_rmse_dict[division_key] = {"avg_rmse": [], "avg_mse": []}

        # Never index past the data; this also gives tqdm the true total
        steps_in_division = min(current_size, len(train_samples))

        # Training loop for current division: one optimisation step per molecule
        for epoch_i in tqdm(range(steps_in_division), desc=f"Division {division_i + 1}/{divisions + 1}"):
            molecule = train_samples[epoch_i]
            target = molecule[8].to(device)

            # The encoders are frozen here (only the GNN has an optimizer), so
            # skip building an autograd graph through them.
            with torch.no_grad():
                atom_encoding = atom_autoencoder.encode(molecule[0].to(device))
                bond_encoding = bond_autoencoder.encode(molecule[1].to(device))

            input_representation = [
                atom_encoding,
                bond_encoding,
                molecule[2].to(device),
                molecule[3].to(device),
                molecule[4].to(device),
                molecule[5].to(device),
                molecule[6].to(device),
                molecule[7].to(device)
            ]

            # Forward pass
            prediction = gnn3d(input_representation)

            # Compute loss
            loss = mse_loss_fn(prediction, target)

            # Backward pass and optimization step
            gnn_optimizer.zero_grad()
            loss.backward()
            gnn_optimizer.step()

            # Update running-average RMSE and MSE
            avg_rmse = (avg_rmse * epoch_i + torch.sqrt(loss).item()) / (epoch_i + 1)
            avg_mse = (avg_mse * epoch_i + loss.item()) / (epoch_i + 1)
            gnn_rmse_dict[division_key]["avg_rmse"].append(avg_rmse)
            gnn_rmse_dict[division_key]["avg_mse"].append(avg_mse)

            # Print progress every 10 steps; the MSE field now shows the running
            # average its label promises (it used to print the current-step loss)
            if (epoch_i % 10 == 0):
                tqdm.write(f"Epoch: {epoch_i + 1:>4}/{current_size:>3} | Avg. RMSE Loss: {avg_rmse:.4f} | Avg. MSE Loss: {avg_mse:.4f} | target: {target.item():.4f} | pred: {prediction.item():.4f}")

        # Save model state after each division; os.path.join works whether or
        # not save_dir ends with a path separator
        torch.save(gnn3d.state_dict(), os.path.join(save_dir, f"gnn3d_{dataset_name}_div{current_size}.pth"))

    return gnn_rmse_dict

## Know your training loss for varying dataset size

In [None]:
# Train one GNN3D per training-set size and keep the per-step loss curves
gnn_rmse_dict = train_gnn3d_vary_dataset_size(
    train_samples,
    min_samples=278,
    max_samples=1178,
    divisions=5,
    save_dir="./models/logs/vary_train_size/",
    dataset_name="logs",
)

## Know your plots for training loss vs. training set size

In [None]:
# Make sure the output folder for the figure exists
plot_dir = "./models/logs/vary_train_size/plots/"
os.makedirs(plot_dir, exist_ok=True)
dataset_name="logs"

# One shared axes holding a curve per training-set size
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust figure size as needed

for key, curves in gnn_rmse_dict.items():
    # Keys look like "d<size>"; drop the leading "d" for the legend entry
    ax.plot(curves["avg_rmse"], label=f"dataset size = {key[1:]}")

ax.legend()
ax.set_title("GNN3D RMSE Loss for Different Dataset Sizes")
ax.set_xlabel("Epoch")
ax.set_ylabel("RMSE Loss")
ax.grid(True)
fig.tight_layout()

# Write the figure to disk as SVG, then display it
fig.savefig(f"{plot_dir}gnn3d_{dataset_name}_all_divisions.svg", format='svg')
plt.show()