In [None]:
!pip install torch pandas numpy h5py tqdm

In [11]:
%load_ext autoreload
%autoreload 2

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import h5py
import numpy as np
from sklearn.model_selection import train_test_split
from src.dataset import ProteinDataset
from src.model import ChemicalShiftsPredictor
from src.utils import train_model, test_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# --- Input locations -------------------------------------------------------
csv_file = 'data/disorder/strict.csv'
prott5_file = 'data/disorder/embeddings/unfiltered_all_prott5.h5'
prott5_res_file = 'data/disorder/embeddings/unfiltered_all_prott5_res.h5'
prostt5_file = 'data/disorder/embeddings/prostt5.h5'

# --- Chemical-shift table --------------------------------------------------
# Columns whose missing entries are replaced by 0 before building the datasets.
# NOTE(review): after this step a missing shift is indistinguishable from a real
# value of 0 — confirm the dataset/loss treats 0 as "not measured".
_shift_columns = ['C', 'CA', 'CB', 'HA', 'H', 'N', 'HB']
chemical_shifts_df = pd.read_csv(csv_file)
chemical_shifts_df[_shift_columns] = chemical_shifts_df[_shift_columns].fillna(0)

# --- 60 / 20 / 20 split ----------------------------------------------------
# First hold out 20% of the rows for testing, then take 25% of the remaining
# 80% (i.e. 20% of the total) for validation. Both splits use a fixed seed.
# NOTE(review): the split is over rows of the CSV; if several rows belong to the
# same protein they can end up in different splits — confirm this is intended.
train_df, test_df = train_test_split(
    chemical_shifts_df, test_size=0.2, random_state=42
)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

# Give every split a fresh 0..n-1 index, discarding the original row labels.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

# --- Datasets --------------------------------------------------------------
# One ProteinDataset per split, all backed by the same three embedding files.
train_dataset, val_dataset, test_dataset = (
    ProteinDataset(split, prott5_file, prott5_res_file, prostt5_file)
    for split in (train_df, val_df, test_df)
)

In [None]:
# Display the first training sample as a quick sanity check of what ProteinDataset yields.
train_dataset[0]

In [12]:
# Report the size of each split as given by len() on the corresponding dataset.
print('Training dataset length:', len(train_dataset))
print('Validation dataset length:', len(val_dataset))
print('Test dataset length:', len(test_dataset))

Trainng dataset length: 130495
Validation dataset length: 43499
Test dataset length: 43499


In [13]:
# Training hyperparameters. They are passed to train_model by name below, so
# editing a value here is what actually changes the run.
learning_rate = 0.001
weight_decay = 1e-5
patience = 10
batch_size = 2048

# Train on the training split with the validation split supplied alongside it;
# the use_prostt5 and use_protein_mean options are both disabled for this run.
# NOTE(review): the recorded progress bar shows 4078 batches per epoch, which for
# 130495 training samples corresponds to a batch size of 32 rather than 2048 —
# confirm that train_model honors batch_size (or that the output is simply stale).
trained_model = train_model(
    train_dataset,
    val_dataset,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    patience=patience,
    batch_size=batch_size,
    use_prostt5=False,
    use_protein_mean=False,
)

# Evaluate the trained model on the held-out test split with the same batch size.
test_model(trained_model, test_dataset, batch_size=batch_size)

Epoch 1/100:   0%|          | 0/4078 [00:00<?, ?batch/s]

Epoch 1/100:  12%|█▏        | 477/4078 [00:10<01:21, 44.08batch/s]


KeyboardInterrupt: 