In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from src.dataset import ProteinDataset
from src.utils import train_model, test_model
import torch
from src.model import ChemicalShiftsPredictor, ChemicalShiftsPredictorAttention
from src.utils import packed_padded_collate

from tqdm.notebook import tqdm
from catboost import CatBoostRegressor

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import h5py

In [16]:
# Input locations, all relative to the notebook's working directory.
# Residue-level chemical-shift table:
csv_file = "data/strict.csv"

# ProtT5 embeddings (the `_res` file is presumably the per-residue variant — confirm).
prott5_res_file = "data/embeddings/unfiltered_all_prott5_res.h5"
prott5_file = "data/embeddings/unfiltered_all_prott5.h5"

# ESM-2 3B embeddings (same naming scheme as the ProtT5 files).
esm_res_file = "data/embeddings/unfiltered_all_esm2_3b_res.h5"
esm_file = "data/embeddings/unfiltered_all_esm2_3b.h5"

# ProstT5 embeddings.
prostt5_file = "data/embeddings/prostt5.h5"
# Read the full chemical-shift table from `csv_file` into a DataFrame.
chemical_shifts_df = pd.read_csv(csv_file)
# Optional sanity check while exploring: chemical_shifts_df.describe()

In [3]:
# Read the held-out test protein IDs, one ID per line.
# The list is built inside the `with` block so that a failed read raises without
# leaving an empty `test_ids` behind (an empty list would make the exclusion
# filter in the next cell silently keep every test protein). Blank lines are skipped
# so they do not turn into '' IDs.
with open("pdb_matched/final_test_ids.txt", "r") as f:
    test_ids = [line.strip() for line in f if line.strip()]

In [4]:
# Drop every row whose protein ID is in the held-out test list.
is_held_out = chemical_shifts_df['ID'].isin(test_ids)
chemical_shifts_df = chemical_shifts_df.loc[~is_held_out]

In [5]:
# Keep only residues that have a value in the "N" column (the regression target).
# Reassign instead of using inplace=True: the frame here is the result of a boolean
# filter, and mutating such a slice in place can trigger SettingWithCopyWarning
# and makes the cell's effect harder to follow on re-runs.
chemical_shifts_df = chemical_shifts_df.dropna(subset=["N"])

In [6]:
# Split into train / validation by unique protein ID, so all residues of a given
# protein land on the same side of the split.
ids = chemical_shifts_df['ID'].unique()
# Use `val_ids` rather than reusing `test_ids`: that name already holds the held-out
# test IDs read from file, and overwriting it here would make a re-run of the
# test-exclusion cell silently filter on validation IDs instead.
train_ids, val_ids = train_test_split(ids, test_size=0.2, random_state=42)
train_df = chemical_shifts_df[chemical_shifts_df['ID'].isin(train_ids)]
val_df = chemical_shifts_df[chemical_shifts_df['ID'].isin(val_ids)]

In [7]:
# Open both embedding stores read-only (ProstT5 first, then ProtT5). The handles stay
# open because later cells index into them by protein ID.
# NOTE(review): nothing in this notebook closes these files — confirm that is acceptable.
prostt5_embs, prott5_embs = (
    h5py.File(h5_path, "r") for h5_path in (prostt5_file, prott5_file)
)

In [8]:
# Build training features: for every row of train_df, concatenate the ProstT5
# embedding of that residue with the flattened content of prott5_embs[protein_id]
# (presumably a single per-protein ProtT5 vector — confirm against the HDF5 layout).
train_X = []
# protein ID -> flattened prott5_embs entry. The entry is the same for every residue
# of a protein, so it is read from HDF5 and converted once instead of once per row.
prott5_cache = {}
for row in tqdm(train_df.itertuples(), total=len(train_df)):
    protein_id = row.ID
    amino_acid_index = row.seq_index - 1  # Adjust for zero-based indexing
    amino_acid_prostt5_emb = prostt5_embs[protein_id][amino_acid_index]

    if protein_id not in prott5_cache:
        prott5_cache[protein_id] = np.array(prott5_embs[protein_id]).flatten()
    prot_emb = prott5_cache[protein_id]

    train_X.append(np.concatenate([amino_acid_prostt5_emb.flatten(), prot_emb], axis=0))

  0%|          | 0/144973 [00:00<?, ?it/s]

In [9]:
# Build validation features the same way as the training features: per-residue
# ProstT5 embedding followed by the flattened content of prott5_embs[protein_id].
val_X = []
# protein ID -> flattened prott5_embs entry. The entry is the same for every residue
# of a protein, so it is read from HDF5 and converted once instead of once per row.
prott5_cache = {}
for row in tqdm(val_df.itertuples(), total=len(val_df)):
    protein_id = row.ID
    amino_acid_index = row.seq_index - 1  # Adjust for zero-based indexing
    amino_acid_prostt5_emb = prostt5_embs[protein_id][amino_acid_index]

    if protein_id not in prott5_cache:
        prott5_cache[protein_id] = np.array(prott5_embs[protein_id]).flatten()
    prot_emb = prott5_cache[protein_id]

    val_X.append(np.concatenate([amino_acid_prostt5_emb.flatten(), prot_emb], axis=0))


  0%|          | 0/35552 [00:00<?, ?it/s]

In [10]:
# Regression targets: the "N" column of each split, as plain Python lists so they
# can be concatenated with `+` later.
train_y, val_y = (list(frame['N'].values) for frame in (train_df, val_df))

In [11]:
# Fit the CatBoost regressor on the training and validation rows combined.
# NOTE(review): because the validation split is merged back in, nothing is held out
# during fitting — confirm this is intended for the final model.
# Alternative (disabled): iterations=2000 with loss_function='RMSEWithUncertainty'
# and posterior_sampling=True.
fit_X = train_X + val_X
fit_y = train_y + val_y
model = CatBoostRegressor(task_type="GPU", iterations=5000)
model.fit(fit_X, fit_y, verbose=100)

Learning rate set to 0.030523
0:	learn: 5.2975633	total: 73.3ms	remaining: 6m 6s
100:	learn: 4.1637325	total: 5.71s	remaining: 4m 37s
200:	learn: 3.9110293	total: 11.1s	remaining: 4m 25s
300:	learn: 3.7702555	total: 16.4s	remaining: 4m 15s
400:	learn: 3.6732836	total: 21.6s	remaining: 4m 7s
500:	learn: 3.5970768	total: 26.6s	remaining: 3m 59s
600:	learn: 3.5351912	total: 31.8s	remaining: 3m 52s
700:	learn: 3.4825807	total: 36.7s	remaining: 3m 45s
800:	learn: 3.4366652	total: 41.7s	remaining: 3m 38s
900:	learn: 3.3953289	total: 46.6s	remaining: 3m 31s
1000:	learn: 3.3588755	total: 51.4s	remaining: 3m 25s
1100:	learn: 3.3235910	total: 56.2s	remaining: 3m 19s
1200:	learn: 3.2919361	total: 1m 1s	remaining: 3m 13s
1300:	learn: 3.2614805	total: 1m 6s	remaining: 3m 7s
1400:	learn: 3.2341506	total: 1m 11s	remaining: 3m 2s
1500:	learn: 3.2084596	total: 1m 15s	remaining: 2m 56s
1600:	learn: 3.1834049	total: 1m 20s	remaining: 2m 51s
1700:	learn: 3.1594728	total: 1m 25s	remaining: 2m 46s
1800:	lea

<catboost.core.CatBoostRegressor at 0x7f5c85ae18a0>

In [12]:
# Persist the fitted model; the same path is read back by the load cell further down.
# NOTE(review): assumes the models/ directory already exists — confirm.
model.save_model("models/n_catboost_prostt5_5000iter.cbm")

In [17]:
# Load the CatBoost model back from disk. This rebinds `model`, presumably so the
# evaluation cells below can run without repeating the training cell.
model = CatBoostRegressor()
model.load_model("models/n_catboost_prostt5_5000iter.cbm")

<catboost.core.CatBoostRegressor at 0x7f5c84529ea0>

In [18]:
# Re-read the held-out test protein IDs, one ID per line.
# The list is built inside the `with` block so that a failed read raises without
# leaving an empty `test_ids` behind (an empty list would make the test-set
# selection below return no rows). Blank lines are skipped so they do not turn
# into '' IDs.
with open("pdb_matched/final_test_ids.txt", "r") as f:
    test_ids = [line.strip() for line in f if line.strip()]

In [19]:
# Reload the full table and keep only the held-out test proteins.
# Note: this rebinds `chemical_shifts_df`, which earlier cells used for the
# training data.
chemical_shifts_df = pd.read_csv(csv_file)
# take test ids
chemical_shifts_df = chemical_shifts_df[chemical_shifts_df['ID'].isin(test_ids)]

# Build test features the same way as for training: per-residue ProstT5 embedding
# followed by the flattened content of prott5_embs[protein_id].
test_X = []
# protein ID -> flattened prott5_embs entry. The entry is the same for every residue
# of a protein, so it is read from HDF5 and converted once instead of once per row.
prott5_cache = {}
for row in tqdm(chemical_shifts_df.itertuples(), total=len(chemical_shifts_df)):
    protein_id = row.ID
    amino_acid_index = row.seq_index - 1  # Adjust for zero-based indexing
    amino_acid_prostt5_emb = prostt5_embs[protein_id][amino_acid_index]

    if protein_id not in prott5_cache:
        prott5_cache[protein_id] = np.array(prott5_embs[protein_id]).flatten()
    prot_emb = prott5_cache[protein_id]

    test_X.append(np.concatenate([amino_acid_prostt5_emb.flatten(), prot_emb], axis=0))

# Rows with a missing "N" are not dropped here (unlike for training), so test_y may
# contain NaN; predictions are still produced for every row.
test_y = list(chemical_shifts_df['N'].values)

preds = model.predict(test_X)

  0%|          | 0/12892 [00:00<?, ?it/s]

In [None]:
# Write the CatBoost predictions (column N_cat) next to the identifying columns
# ID, entryID, seq_index and seq, one row per test residue.
df = chemical_shifts_df[['ID', 'entryID', 'seq_index', 'seq']].copy()
df.insert(0, 'N_cat', preds)  # predictions go first, matched to rows by position
df.to_csv("test_n_catboost_prostt5_5000.csv", index=False)