# Model Evaluation

![eval summary](../assets/evaluation_summary.png)

Frankly, this project is not actually ready to dust off the test set (saved to disk apart from the training and validation sets used during development). I would normally undergo a few more rounds of development, making an effort to optimize over available hyperparameters, before letting myself or the models see the test set. This notebook does give an idea of how I would approach final evaluation, but like the rest of the project it should be thought of as a prototype rather than a formal report.

In any case I thought it was important to break out the performance of the whole system by measurement type and enzyme, for a finer level of granularity than the sparse loss used by the models during training. 



## xformer_x003 and ensemble_004_seed13


|       |pKi:JAK1|pKi:JAK2|pKi:JAK3|pKi:TYK2|
|-------|-------|-------|-------|-------|
| test  |0.712+/-0.809|0.639+/-0.552|1.28+/-1.16|0.862+/-0.772|
| train |0.618+/-0.591|0.619+/-0.535|1.02+/-0.873|0.757+/-0.595|

<div align="center">
    Evaluation, mean absolute error on train and test set
    </div>


Training a bootstrap ensemble of models to estimate kinase inhibition measurements gives some idea of the uncertainty of a given estimate, via the variance across the population of estimates generated by the models. Here we can get an idea of how well the bootstrap ensemble captures that uncertainty by counting the number of measurements that fall within 1, 2, or 3 ensemble standard deviations of the ensemble mean.



```
evaluation on test set
353.0 of 1047 within 1 boostrap ensemble standard deviations 33.715377268385865%
606.0 of 1047 within 2 boostrap ensemble standard deviations 57.87965616045845%
801.0 of 1047 within 3 boostrap ensemble standard deviations 76.50429799426934%

evaluation on train set
3384.0 of 9678 within 1 boostrap ensemble standard deviations 34.96590204587725%
5932.0 of 9678 within 2 boostrap ensemble standard deviations 61.293655713990496%
7703.0 of 9678 within 3 boostrap ensemble standard deviations 79.59289109320108%
```

## xformer_x_002_seed42 and ensemble_004_seed42

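It's evident from the loss seen in the training curves, and from the evaluation below, that this pairing performs worse than the one above.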


|       |pKi:JAK1|pKi:JAK2|pKi:JAK3|pKi:TYK2|
|-------|-------|-------|-------|-------|
| test  |0.999+/-0.916|1.07+/-0.822|1.95+/-1.24|0.740+/-0.486|
| train |0.969+/-0.812|1.06+/-0.795|1.72+/-1.13|0.897+/-0.611|

<div align="center">
    Evaluation, mean absolute error on train and test set
    </div>
    

```
evaluation on test set
199.0 of 1047 within 1 boostrap ensemble standard deviations 19.00668576886342%
363.0 of 1047 within 2 boostrap ensemble standard deviations 34.67048710601719%
495.0 of 1047 within 3 boostrap ensemble standard deviations 47.277936962750715%

evaluation on train set
1585.0 of 9678 within 1 boostrap ensemble standard deviations 16.377350692291795%
3132.0 of 9678 within 2 boostrap ensemble standard deviations 32.36205827650341%
4514.0 of 9678 within 3 boostrap ensemble standard deviations 46.641868154577395%

```

In [None]:
import os
import json

import pandas as pd

import torch
import numpy as np

import jak
from jak.transformer import XFormer
import matplotlib.pyplot as plt

# Shared colormap; the plotting cells below call it with integer indices
# (e.g. my_cmap(10), my_cmap(200)) to pick line and fill colours.
my_cmap = plt.get_cmap("viridis")

In [None]:
# Load one ensemble training log as CSV and preview its first rows.
df = pd.read_csv("../logs/ensemble_003.txt")
df.head()

In [None]:
# Show the column names of the log loaded above. The plotting cells index
# columns such as " train_loss" and " val_loss" with a leading space.
df.columns

In [None]:
# Plot training and validation loss curves for every transformer run logged
# in ../logs (files whose name contains "xformer" and ends with "txt").
# One of the runs plotted here had too high a learning rate, which shows up
# as a loss spike.

log_dir = "../logs"
my_extension = "txt"

# os.listdir returns entries in arbitrary order; sort them so the figures
# come out in the same order on every run and every machine.
log_listdir = sorted(os.listdir(log_dir))

for filename in log_listdir:

    if "xformer" not in filename or not filename.endswith(my_extension):
        continue

    df = pd.read_csv(os.path.join(log_dir, filename))
    print(df.columns)

    # The loss columns are addressed with a leading space in their names,
    # presumably because the log header is written with ", " separators.
    x = df["epoch"].to_numpy()
    train_loss = df[" train_loss"].to_numpy()
    val_loss = df[" val_loss"].to_numpy()

    # one figure per log file
    plt.figure()
    plt.plot(x, train_loss, label="training loss", lw=3, color=my_cmap(200), alpha=0.5)
    plt.plot(x, val_loss, label="val. loss", lw=3, color=my_cmap(10), alpha=0.5)
    plt.title(f"Transformer training run {filename}")
    plt.legend()

    plt.show()

In [None]:
# Plot training and validation loss curves for every MLP-ensemble run logged
# in ../logs (files whose name contains "ensemble" and ends with "txt"),
# with a shaded band of +/- one logged standard deviation around each curve.

log_dir = "../logs"
my_extension = "txt"

# os.listdir returns entries in arbitrary order; sort them so the figures
# come out in the same order on every run and every machine.
log_listdir = sorted(os.listdir(log_dir))

for filename in log_listdir:

    if "ensemble" not in filename or not filename.endswith(my_extension):
        continue

    df = pd.read_csv(os.path.join(log_dir, filename))

    # The metric columns are addressed with a leading space in their names,
    # presumably because the log header is written with ", " separators.
    x = df["epoch"].to_numpy()
    train_loss = df[" train_loss"].to_numpy()
    train_std_dev = df[" train_std_dev"].to_numpy()
    val_loss = df[" val_loss"].to_numpy()
    val_std_dev = df[" val_std_dev"].to_numpy()

    # one figure per log file
    plt.figure()
    plt.plot(x, train_loss, label="training loss", color=my_cmap(256), alpha=0.5)
    plt.plot(x, val_loss, label="val. loss", color=my_cmap(10), alpha=0.5)
    # legend is built before the bands are drawn, so only the two curves are listed
    plt.legend()

    plt.fill_between(x, val_loss - val_std_dev, val_loss + val_std_dev, color=my_cmap(10), alpha=0.25)
    plt.fill_between(x, train_loss - train_std_dev, train_loss + train_std_dev, color=my_cmap(256), alpha=0.25)
    plt.title(f"MLP ensemble {filename}")
    plt.show()

In [None]:
# set up feature extractor model (pre-trained autoregressive transformer)

seq_length = 100
token_dim = 33
# NOTE(review): encoder_size and decoder_size are defined here but are not
# passed to XFormer below; kept in case other cells read them.
encoder_size = 1
decoder_size = 1
smiles_vocab = "#()+-1234567=BCFHINOPS[]cilnors"
parameters_fp = "../parameters/xformer_x003_seed42/tag_xformer_x003_seed42_epoch299.pt"

xformer = XFormer(
    vocab=smiles_vocab,
    token_dim=token_dim,
    seq_length=seq_length,
    lr=1e-3,
    device="cpu",
    tag="inference",
)

# deserialize the checkpoint straight onto the device the model lives on
model_state_dict = torch.load(parameters_fp, map_location=xformer.my_device)
xformer.load_state_dict(model_state_dict)

# set up regression model ensemble (multiple MLPs)

kwarg_filepath = "../parameters/ensemble_004_seed13/exp_tag_ensemble_004_seed13.json"
parameters_filepath = "../parameters/ensemble_004_seed13/tag_ensemble_004_seed13_epoch4999.pt"

# hyperparameters the ensemble was trained with (cohort_size, depth, lr are read below)
with open(kwarg_filepath, "r") as f:
    kwargs = json.load(f)

# 3300 == seq_length * token_dim (100 * 33); presumably the size of the
# flattened transformer encoding fed to the ensemble -- TODO confirm.
in_dim = 3300
# one output per (measurement type, enzyme) pair: the evaluation cell reads
# columns 0..7 as measurement_type index * 4 + enzyme index.
out_dim = 8

ensemble = jak.mlp.MLPCohort(
    in_dim,
    out_dim,
    cohort_size=kwargs["cohort_size"],
    depth=kwargs["depth"],
    lr=kwargs["lr"],
)

# Map the checkpoint to CPU while deserializing, as is done for the
# transformer above: without map_location a checkpoint saved from a GPU
# cannot be loaded on a CPU-only machine. The evaluation cell calls
# .numpy() on the ensemble outputs, so inference runs on CPU anyway.
ensemble_state_dict = torch.load(parameters_filepath, map_location="cpu")
ensemble.load_state_dict(ensemble_state_dict)
                             

In [None]:
# load the held-out test split and the training split for evaluation
test_df = pd.read_csv("../data/test_JAK.csv")
train_df = pd.read_csv("../data/train_JAK.csv")
# distinct SMILES strings present in the test split
unique_smiles = test_df["SMILES"].unique()

In [None]:
# Evaluate the whole pipeline (transformer encoding -> MLP ensemble) on the
# test and train splits, broken out by measurement type and enzyme.
#
# For each split this cell:
#   * prints how many measurements lie within 1, 2 and 3 ensemble standard
#     deviations of the ensemble mean,
#   * draws box plots of the absolute error and of the absolute error divided
#     by the ensemble standard deviation,
#   * appends per-target summary lines to `loss_summary` (printed at the end).

measurement_types = ["pIC50", "pKi"]
enzymes = ["JAK1", "JAK2", "JAK3", "TYK2"]

# (measurement_type, enzyme) pairs in ensemble-output order: the column read
# for a pair is measurement_type index * 4 + enzyme index.
targets = [(m_type, enz) for m_type in measurement_types for enz in enzymes]
x_ticklabels = [f"{m_type}:{enz}" for m_type, enz in targets]

# inference only: switch both models to eval mode once, not once per molecule
xformer.eval()
ensemble.eval()

loss_summary = ""
for label, df in zip(["test", "train"], [test_df, train_df]):
    # Fresh accumulators for every split. Sharing them across splits would
    # mix the test-set values into the train-set plots and summary.
    losses = {f"{enz}_{m_type}": [] for m_type, enz in targets}
    std_dev_diff = {f"{enz}_{m_type}": [] for m_type, enz in targets}

    within_1sd = 0.0
    within_2sd = 0.0
    within_3sd = 0.0
    total_count = 0

    with torch.no_grad():
        # one pass over the frame; sort=False keeps first-appearance order
        for smile, s_df in df.groupby("SMILES", sort=False):
            encoded = xformer.encode(smile).reshape(1, -1)
            mean_estimates, std_estimates = ensemble.forward(encoded)

            for idx, (measurement_type, enzyme) in enumerate(targets):
                is_target = (s_df["measurement_type"] == measurement_type) \
                        & (s_df["Kinase_name"] == enzyme)
                e_df = s_df.loc[is_target]
                if e_df.empty:
                    continue

                key = f"{enzyme}_{measurement_type}"
                estimate = mean_estimates[0, idx].cpu().numpy()
                sigma = std_estimates[0, idx].cpu().numpy()

                # Score every matching row separately, so a molecule with
                # repeated measurements of one target does not raise.
                for value in e_df["measurement_value"].to_numpy():
                    loss = float(np.abs(value - estimate))
                    losses[key].append(loss)

                    # absolute error in units of ensemble standard deviations
                    std_devs = loss / sigma
                    std_dev_diff[key].append(std_devs)

                    within_1sd += 1.0 * (std_devs <= 1.0)
                    within_2sd += 1.0 * (std_devs <= 2.0)
                    within_3sd += 1.0 * (std_devs <= 3.0)
                    total_count += 1

    print(f"evaluation on {label} set")
    if total_count == 0:
        # nothing to report or plot; also avoids dividing by zero below
        print("no measurements found")
        continue

    print(f"{within_1sd} of {total_count} within 1 boostrap ensemble standard deviations {100.0 * within_1sd / total_count}%")
    print(f"{within_2sd} of {total_count} within 2 boostrap ensemble standard deviations {100.0 * within_2sd / total_count}%")
    print(f"{within_3sd} of {total_count} within 3 boostrap ensemble standard deviations {100.0 * within_3sd / total_count}%")

    fig, ax = plt.subplots(2, 2, figsize=(12, 12))

    # top row: absolute error per target (first four targets are pIC50, last four pKi)
    all_losses = []
    for measurement_type, enzyme in targets:
        target_losses = losses[f"{enzyme}_{measurement_type}"]
        all_losses.append(target_losses)

        mean_loss = np.mean(target_losses)
        std_dev_of_loss = np.std(target_losses)
        # "\\pm" keeps the literal LaTeX \pm in the text without relying on
        # an invalid "\p" escape sequence
        loss_summary += f"\n{label} set MAE {measurement_type}:{enzyme}: {mean_loss}$\\pm${std_dev_of_loss} s.d."

    ax[0,0].boxplot(all_losses[:4])
    ax[0,0].set_xticklabels(x_ticklabels[:4])
    ax[0,0].set_title(f"{label} set losses, pIC50")

    ax[0,1].boxplot(all_losses[4:])
    ax[0,1].set_xticklabels(x_ticklabels[4:])
    ax[0,1].set_title(f"{label} set losses, pKi")

    # bottom row: absolute error divided by the ensemble standard deviation
    all_std_devs = []
    for measurement_type, enzyme in targets:
        target_sigmas = std_dev_diff[f"{enzyme}_{measurement_type}"]
        all_std_devs.append(target_sigmas)

        # summarise the loss/sigma ratios themselves, not the raw losses
        mean_sigma = np.mean(target_sigmas)
        std_dev_of_sigma = np.std(target_sigmas)
        loss_summary += f"\n{label} set loss/$\\sigma$ {measurement_type}:{enzyme}: {mean_sigma}$\\pm${std_dev_of_sigma}"

    ax[1,0].boxplot(all_std_devs[:4])
    ax[1,0].set_xticklabels(x_ticklabels[:4])
    ax[1,0].set_title(f"{label} set loss/ ensemble std.dev pIC50")

    ax[1,1].boxplot(all_std_devs[4:])
    ax[1,1].set_xticklabels(x_ticklabels[4:])
    ax[1,1].set_title(f"{label} set loss/ ensemble std.dev pKi")

    # Keep one figure per split on disk. The un-suffixed file is still
    # written because the report embeds it, but each split overwrites it,
    # so it ends up holding the last split's figure.
    plt.savefig(f"../assets/evaluation_summary_{label}.png")
    plt.savefig("../assets/evaluation_summary.png")
    plt.show()

print(loss_summary)