In [7]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from src.dataset import ProteinDataset
from src.utils import train_model, test_model
import torch
from src.model import ChemicalShiftsPredictor, ChemicalShiftsPredictorAttention
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Chemical-shift table; its columns keep their original names after the join.
csv_file = 'data/strict.csv'
chemical_shifts_df = pd.read_csv(csv_file)

# UCB predictions that will be compared against the table above.
all_ucb_predictions = pd.read_csv("all_ucb_predictions.csv")

# Inner join on (ID, seq_index): only rows present in both tables survive.
# Column names that exist in both tables get a `_ucb` suffix on the
# predictions side and no suffix on the chemical-shifts side.
joined = chemical_shifts_df.merge(
    all_ucb_predictions,
    how='inner',
    on=['ID', 'seq_index'],
    suffixes=('', '_ucb'),
)


In [11]:
# Column of the chemical-shifts table that the baselines below are computed for.
target_col = 'N'

In [16]:
# Baseline statistics are fitted only on entries whose ID does not occur in
# `joined`, i.e. on rows that are not part of the table the baselines are
# attached to below.
baselines_train = chemical_shifts_df.loc[
    ~chemical_shifts_df['ID'].isin(joined['ID'])
]

# Baseline 1: one constant, the mean of the target over the training rows.
mean_baseline = baselines_train[target_col].mean()

# Baseline 2: mean of the target for each distinct `seq` value in the training
# rows (the output column name suggests `seq` is the residue type - TODO confirm).
baseline2 = baselines_train.groupby('seq')[target_col].mean()

# Attach both baselines to `joined` as new columns. `seq` values that never
# occur in the training rows have no entry in `baseline2` and map to NaN.
joined[f'mean_baseline_{target_col}'] = mean_baseline
joined[f"mean_per_res_baseline_{target_col}"] = joined['seq'].map(baseline2)

In [18]:
# Keep only rows that have both a value in the target column and a UCB
# prediction for it (the `_ucb`-suffixed column produced by the merge).
# The column names are derived from `target_col` instead of being hard-coded
# to 'N', so this filter follows the target selected earlier in the notebook.
# NOTE(review): rows whose per-`seq` baseline is NaN are not removed here.
n_filtered = joined.dropna(subset=[target_col, f'{target_col}_ucb'], axis=0)

In [None]:
def _bootstrap_rmse(squared_errors, n_boot=1000, ci_level=0.95, seed=0):
    """Return an RMSE together with a percentile-bootstrap confidence interval.

    Parameters
    ----------
    squared_errors : pandas.Series
        Per-row squared errors. NaN entries are dropped before anything is
        computed, so they influence neither the estimate nor the resamples.
    n_boot : int
        Number of bootstrap resamples.
    ci_level : float
        Coverage of the interval; 0.95 uses the 2.5th and 97.5th percentiles.
    seed : int
        Seed of the resampling generator, so re-running the cell reproduces
        the same interval.

    Returns
    -------
    rmse : float
        Square root of the mean of the non-NaN squared errors.
    ci : numpy.ndarray
        Lower and upper percentile bounds of the bootstrap RMSE values.
    boot : numpy.ndarray
        The `n_boot` bootstrap RMSE values.

    Raises
    ------
    ValueError
        If no non-NaN squared errors remain.
    """
    values = squared_errors.dropna().to_numpy(dtype=float)
    if values.size == 0:
        raise ValueError("no non-NaN squared errors to bootstrap")

    rng = np.random.default_rng(seed)
    # Each resample has the same size as the data. Drawing only half of the
    # rows (the previous frac=0.5) widens the interval by roughly sqrt(2)
    # relative to the full-sample RMSE it is reported next to.
    boot = np.array([
        np.sqrt(rng.choice(values, size=values.size, replace=True).mean())
        for _ in range(n_boot)
    ])

    tail = (1.0 - ci_level) / 2.0
    return np.sqrt(values.mean()), np.quantile(boot, [tail, 1.0 - tail]), boot


# All three methods are scored on `n_filtered`, and columns are looked up via
# the `target_col` variable (indexing with the literal string 'target_col'
# raises KeyError).

# Baseline 1 (global mean): RMSE and 0.95 bootstrap confidence interval.
errors = (n_filtered[target_col] - n_filtered[f'mean_baseline_{target_col}']) ** 2
mean_baseline_rmse, mean_baseline_rmse_ci, bootstrap_rmse = _bootstrap_rmse(errors)

# Baseline 2 (mean per `seq` value): RMSE and 0.95 bootstrap confidence interval.
errors = (n_filtered[target_col] - n_filtered[f'mean_per_res_baseline_{target_col}']) ** 2
mean_per_res_baseline_rmse, mean_per_res_baseline_rmse_ci, bootstrap_rmse = _bootstrap_rmse(errors)

# UCB predictions: RMSE and 0.95 bootstrap confidence interval.
errors = (n_filtered[target_col] - n_filtered[f'{target_col}_ucb']) ** 2
ucb_rmse, ucb_rmse_ci, bootstrap_rmse = _bootstrap_rmse(errors)