In [2]:
#!pip install torch pandas numpy h5py tqdm scikit-learn tensorboard

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting protobuf>=3.20 (from tensorboardX)
  Downloading protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading protobuf-4.25.2-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: protobuf, tensorboardX
Successfully installed protobuf-4.25.2 tensorboardX-2.6.2.2


In [13]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from src.dataset import ProteinDataset
from src.utils import train_model, test_model
import torch
from src.model import ChemicalShiftsPredictor, ChemicalShiftsPredictorAttention

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# Input locations for this notebook: the chemical-shift table plus the
# embedding stores that are later handed to ProteinDataset.
csv_file = 'data/strict.csv'

# ProtT5 / ProstT5 embedding files
prott5_file = 'data/embeddings/unfiltered_all_prott5.h5'
prott5_res_file = 'data/embeddings/unfiltered_all_prott5_res.h5'
prostt5_file = 'data/embeddings/prostt5.h5'

# ESM-2 embedding files
esm_file = 'data/embeddings/unfiltered_all_esm2_3b.h5'
esm_res_file = 'data/embeddings/unfiltered_all_esm2_3b_res.h5'

# Chemical-shift table used by the cells below.
chemical_shifts_df = pd.read_csv(csv_file)

In [15]:
# One identifier per line; surrounding whitespace (including the newline) is stripped.
with open("pdb_matched/final_test_ids.txt", "r") as ids_file:
    test_ids = [raw_line.strip() for raw_line in ids_file]

In [16]:
# Load the combined UCB prediction table. 'all_ucb_predictions.csv' is written by the
# to_csv cell further down this notebook, so that cell must have been run at least once
# before this one works on a fresh checkout.
all_ucb_predictions = pd.read_csv("all_ucb_predictions.csv")

In [39]:
# Display-only preview of the rows whose ID is listed in test_ids (nothing is assigned here).
chemical_shifts_df[chemical_shifts_df['ID'].isin(test_ids)]

Unnamed: 0.1,Unnamed: 0,ID,entryID,stID,entity_assemID,entityID,seq_index,seq,k,zscores,pscores,C,CA,CB,HA,H,N,HB
447,6,30161_1_1_1,30161,1,1,1,1,M,7,,,175.986,55.600,33.138,4.401,,121.816,1.9350
448,6,30161_1_1_1,30161,1,1,1,2,I,14,11.1609,0.0947,174.316,61.588,38.193,3.830,8.107,123.431,1.7160
449,6,30161_1_1_1,30161,1,1,1,3,R,21,14.1291,0.0736,176.156,57.343,32.755,4.716,9.806,127.627,1.8845
450,6,30161_1_1_1,30161,1,1,1,4,T,21,15.1705,0.0332,173.925,58.964,71.096,5.291,8.631,110.835,4.3560
451,6,30161_1_1_1,30161,1,1,1,5,I,21,14.7280,0.0497,173.317,59.542,43.057,4.875,8.368,120.009,1.2950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215543,1893,36334_1_1_1,36334,1,1,1,106,H,1,3.8761,0.0000,,,,,,,
215544,1893,36334_1_1_1,36334,1,1,1,107,H,2,2.3427,0.4918,,,,,,,
215545,1893,36334_1_1_1,36334,1,1,1,108,H,2,2.3427,0.4918,,,30.053,,8.212,,
215546,1893,36334_1_1_1,36334,1,1,1,109,H,2,,,,,,,,,


In [33]:
# Pair every measured residue with its UCB prediction. The inner join keeps only
# (ID, seq_index) keys present in both tables; clashing column names coming from
# the prediction table get the '_ucb' suffix.
joined = chemical_shifts_df.merge(
    all_ucb_predictions,
    on=['ID', 'seq_index'],
    how='inner',
    suffixes=('', '_ucb'),
)

In [35]:
# Keep only rows where both the measured value (N) and the UCB prediction (N_ucb) are present.
joined = joined.dropna(subset=['N', 'N_ucb'], how='any')

In [36]:
# Number of rows currently held in `joined`.
len(joined)

11341

In [19]:
# Root-mean-square error between the measured N values and the UCB predictions.
# Run this before the dataset-building cell: that cell overwrites joined['N'] with
# scaler-transformed values, after which this difference is no longer meaningful.
residuals = joined['N'] - joined['N_ucb']
rmse = (residuals ** 2).mean() ** 0.5
rmse


2.7436470325525435

In [20]:
# Restrict the chemical-shift table to the entries listed in test_ids.
in_test_ids = chemical_shifts_df['ID'].isin(test_ids)
chemical_shifts_df = chemical_shifts_df[in_test_ids]

In [29]:
# Guard flag read by the dataset-building cell: targets are scaled only while this is
# False, and that cell then sets it to True so a re-run does not scale the same columns
# twice. Re-running THIS cell resets the guard.
scaler_applied = False

In [30]:
# Targets evaluated in this notebook. The full list of shift columns in the table is
# ['C', 'CA', 'CB', 'HA', 'H', 'N', 'HB']; only 'N' is used here.
target_columns = ['N']

# Rebind to an explicit copy instead of dropna(inplace=True): chemical_shifts_df is
# produced by boolean filtering in an earlier cell, and mutating / assigning into such
# a frame in place is what triggers pandas' SettingWithCopyWarning.
chemical_shifts_df = chemical_shifts_df.dropna(subset=target_columns).copy()

# Previously fitted scaler; its statistics are kept in `means` / `stds` for later use.
scaler = joblib.load('scaler.joblib')
means = scaler.mean_
stds = scaler.scale_

# Scale the targets exactly once per session (see scaler_applied).
# NOTE(review): the guard is not reset when `joined` is rebuilt by re-running the merge
# cell; in that case reset scaler_applied to False before re-running this cell,
# otherwise the dataset is built from unscaled targets.
if not scaler_applied:
    chemical_shifts_df[target_columns] = scaler.transform(chemical_shifts_df[target_columns])
    joined[target_columns] = scaler.transform(joined[target_columns])
    scaler_applied = True

# Build the evaluation dataset from the joined (measurement + UCB prediction) table.
test_dataset = ProteinDataset(target_columns, joined, prott5_file, prott5_res_file, prostt5_file, esm_res_file, esm_file)

In [23]:
# Sanity check: number of samples exposed by test_dataset.
print('Test dataset length:', len(test_dataset))

Test dataset length: 11341


In [24]:
# Training / evaluation settings. learning_rate, weight_decay and patience are forwarded
# to train_model in the final cell; batch_size is passed to test_model.
learning_rate = 0.001
weight_decay = 1e-5
patience = 10
batch_size = 128
num_epochs = 5  # NOTE(review): not referenced by any visible cell (the train_model call hard-codes num_epochs=50)

# Feature switches passed both to the ChemicalShiftsPredictor constructor and to test_model.
use_prostt5 = True
use_protein_mean = True
use_attention = True

In [25]:
# Instantiate the network with the same feature switches that are later passed to test_model.
model = ChemicalShiftsPredictor(use_prostt5=use_prostt5, use_protein_mean=use_protein_mean, use_attention=use_attention)

# map_location='cpu' deserialises the checkpoint tensors on the CPU regardless of the
# device they were saved from, so loading does not depend on the saving machine's GPU
# layout and does not allocate a second copy of the weights on the GPU.
state_dict = torch.load('Full_1e-4.pth', map_location='cpu')
model.load_state_dict(state_dict)

# Move the fully initialised model to the GPU in one step.
model = model.cuda()

In [31]:
# Evaluate the loaded checkpoint on test_dataset. The scaler is handed over as well;
# reinit_model=False is passed so the already-loaded model object is the one evaluated
# (presumably — confirm against src.utils.test_model).
test_model(
    model,
    test_dataset,
    batch_size=batch_size,
    use_prostt5=use_prostt5,
    use_protein_mean=use_protein_mean,
    use_attention=use_attention,
    reinit_model=False,
    scaler=scaler,
)

  0%|          | 0/89 [00:00<?, ?it/s]

100%|██████████| 89/89 [00:10<00:00,  8.39it/s]

Test Loss: 0.3460
Test RMSE: 2.9873





In [29]:
import os

# Raw UCB output tables keyed by entry ID. The ID is the part of the file name that
# precedes the first ';'.
ucb_predictions = {}

# sorted(): os.listdir returns entries in arbitrary order, which would otherwise make
# the row order of the combined table (and which file wins on a duplicate ID) vary
# between runs.
for filename in sorted(os.listdir("output_ucb")):
    file_path = os.path.join("output_ucb", filename)
    # Skip sub-directories and hidden files (e.g. .ipynb_checkpoints, .DS_Store);
    # passing them to read_csv would abort the whole load.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue
    entry_id = filename.split(';')[0]
    ucb_predictions[entry_id] = pd.read_csv(file_path)

In [27]:
# Three-letter -> one-letter codes for the 20 standard amino acids.
three_to_one = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
    'GLU': 'E', 'GLN': 'Q', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
    'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
}


def process_ucb_output(dataframe, id):
    """Reshape one raw UCB output table into the notebook's per-residue layout.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw table with the columns RESNUM, RESNAME, H_UCBShift and N_UCBShift.
    id : str
        Entry identifier stored in the ``ID`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``seq_index``, ``seq``, ``H`` and ``N``, one row per input
        row, carrying the input's index.

    Notes
    -----
    ``seq_index`` is RESNUM shifted so that the smallest residue number becomes 1.
    This assumes the first residue in the table corresponds to sequence position 1
    -- TODO confirm against the chemical-shift table.

    Residue names that are not in ``three_to_one`` are reported as ``'X'`` instead
    of raising ``KeyError``, so one non-standard residue does not abort a batch.
    """
    residue_numbers = dataframe["RESNUM"]
    # Series.map leaves unknown names as NaN; 'X' is the conventional "unknown" code.
    one_letter = dataframe["RESNAME"].map(three_to_one).fillna("X")

    return pd.DataFrame(
        {
            "ID": id,
            "seq_index": residue_numbers - residue_numbers.min() + 1,
            "seq": one_letter,
            "H": dataframe["H_UCBShift"],
            "N": dataframe["N_UCBShift"],
        },
        index=dataframe.index,
    )

In [32]:
# Convert every raw UCB table and stack the results into a single frame.
# (A former hard-coded call for one specific entry was dropped: its result was
# discarded and it raised KeyError whenever that entry was missing.)
processed_frames = [
    process_ucb_output(dataframe, entry_id)
    for entry_id, dataframe in ucb_predictions.items()
]

all_ucb_predictions = pd.concat(processed_frames)

In [34]:
# Persist the combined prediction table without the index column. This is the file that
# the pd.read_csv("all_ucb_predictions.csv") cell near the top of the notebook reads.
all_ucb_predictions.to_csv('all_ucb_predictions.csv', index=False)

In [None]:
# NOTE(review): this cell has no execution count and refers to train_dataset / val_dataset,
# which are not defined in any cell visible in this notebook — confirm before running.
# NOTE(review): it passes use_esm2=..., whereas the evaluation call earlier passes
# use_attention=... — presumably written against a different version of src.utils; verify.
torch.cuda.empty_cache()
trained_model = train_model(train_dataset, val_dataset, learning_rate=learning_rate, num_epochs=50, weight_decay=weight_decay, patience=patience, batch_size=2048, use_prostt5=True, use_protein_mean=True, use_esm2=False)
test_model(trained_model, test_dataset, batch_size=batch_size, use_prostt5=True, use_protein_mean=True, use_esm2=False)