In [1]:
import pandas as pd
import numpy as np
import pickle
import torch
from module.RNN import *
from module.DNN import *
from module.helpers import *

In [2]:
with open("./model/Sequence_voca.pkl", "rb") as f:
    Protein_voca = pickle.load(f)

with open("./model/SMILES_voca.pkl", "rb") as f:
    SMILES_voca = pickle.load(f)

In [3]:
USE_CUDA = True
GPU_NUM = 2

if USE_CUDA:
    device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
    torch.cuda.set_device(device)
    print ('Current cuda device ', torch.cuda.current_device())

Current cuda device  2


In [4]:
def log_transform(val):
    return - np.log10(val * 1e-9)

def check_afordable_range(input):

    for idx, i in enumerate(input):
        if i > 1e+7: 
            input[idx] = 1e+7

        elif i < 1e-3:
            input[idx] = 1e-3
            
    return input

#### 1. Davis model

In [5]:
model_path = "./model/train_davis.pth"  

In [6]:
davis_regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

#### 2. KIBA model

In [7]:
model_path = "./model/train_kiba.pth"  

In [8]:
kiba_regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

#### 3. GLASS model

In [9]:
model_path = "./model/train_glass.pth"  

In [10]:
glass_regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

#### 4. BindingDB model

In [11]:
model_path = "./model/train_bindingdb.pth"  

In [12]:
bindingdb_regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

#### 5. Merged model

In [13]:
model_path = "./model/train_merged.pth"  

In [14]:
merged_regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

### 1. Davis data load

In [15]:
davis_braf = pd.read_csv("/home/ssm/seq_aff/code/BA_module/data/braf_bcl2/davis_braf.tsv", sep = "\t")
davis_braf

Unnamed: 0,Protein,PubChem CID,Sequence,SMILES,Kd (nM)
0,BRAF,11314340,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,10000.0
1,BRAF,24889392,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCC...,10000.0
2,BRAF,11409972,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)...,1700.0
3,BRAF,11338033,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=C(NC1CCNCC1)c1[nH]ncc1NC(=O)c1c(Cl)cccc1Cl,10000.0
4,BRAF,10184653,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,10000.0
...,...,...,...,...,...
63,BRAF,5494449,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1cc(Nc2cc(N3CCN(C)CC3)nc(Sc3ccc(NC(=O)C4CC4)...,10000.0
64,BRAF,3038525,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=c1ncn2nc(Sc3ccc(F)cc3F)ccc2c1-c1c(Cl)cccc1Cl,10000.0
65,BRAF,3081361,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,10000.0
66,BRAF,9809715,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COC(=O)c1ccc2c(c1)NC(=O)C2=C(Nc1ccc(N(C)C(=O)C...,10000.0


In [16]:
davis_braf["Parameter"] = "Kd (nM)"
davis_braf.columns = ["Protein", "PubChem CID", "Sequence", "SMILES", "Value", "Parameter"]
davis_braf

Unnamed: 0,Protein,PubChem CID,Sequence,SMILES,Value,Parameter
0,BRAF,11314340,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,10000.0,Kd (nM)
1,BRAF,24889392,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCC...,10000.0,Kd (nM)
2,BRAF,11409972,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)...,1700.0,Kd (nM)
3,BRAF,11338033,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=C(NC1CCNCC1)c1[nH]ncc1NC(=O)c1c(Cl)cccc1Cl,10000.0,Kd (nM)
4,BRAF,10184653,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,10000.0,Kd (nM)
...,...,...,...,...,...,...
63,BRAF,5494449,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1cc(Nc2cc(N3CCN(C)CC3)nc(Sc3ccc(NC(=O)C4CC4)...,10000.0,Kd (nM)
64,BRAF,3038525,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=c1ncn2nc(Sc3ccc(F)cc3F)ccc2c1-c1c(Cl)cccc1Cl,10000.0,Kd (nM)
65,BRAF,3081361,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,10000.0,Kd (nM)
66,BRAF,9809715,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COC(=O)c1ccc2c(c1)NC(=O)C2=C(Nc1ccc(N(C)C(=O)C...,10000.0,Kd (nM)


In [17]:
test_data = davis_braf.iloc[:, [2, 3]].values
test_labels = log_transform(davis_braf.iloc[:, 4].values)
print("test data", test_data.shape)
print("test labels", test_labels.shape)

test data (68, 2)
test labels (68,)


In [18]:
test_dataset = test_Dataset(test_data)
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = Mycall(Protein_voca, SMILES_voca, USE_CUDA))

#### 1.1 train davis test

In [19]:
Test_module = Test(davis_regressor, test_loader)
y_pred_davis = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_davis)

print(f"\t[Val] total samples: {len(y_pred_davis)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 68, RMSE: 0.350, PCC: 0.944, CI: 0.948, r2: 0.801


#### 1.2 train kiba test

In [20]:
Test_module = Test(kiba_regressor, test_loader)
y_pred_kiba = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_kiba)

print(f"\t[Val] total samples: {len(y_pred_kiba)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 68, RMSE: 6.706, PCC: 0.430, CI: 0.688, r2: -72.074


#### 1.3 train glass test

In [21]:
Test_module = Test(glass_regressor, test_loader)
y_pred_glass = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_glass)

print(f"\t[Val] total samples: {len(y_pred_glass)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 68, RMSE: 1.954, PCC: -0.006, CI: 0.581, r2: -5.202


#### 1.4 train bindingdb test

In [22]:
Test_module = Test(bindingdb_regressor, test_loader)
y_pred_bindingdb = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_bindingdb)

print(f"\t[Val] total samples: {len(y_pred_bindingdb)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 68, RMSE: 1.209, PCC: 0.505, CI: 0.649, r2: -1.374


#### 1.5 train merged test

In [23]:
Test_module = Test(merged_regressor, test_loader)
y_pred_merged = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_merged)

print(f"\t[Val] total samples: {len(y_pred_merged)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 68, RMSE: 0.547, PCC: 0.772, CI: 0.889, r2: 0.515


### 2. KIBA data load

In [24]:
kiba_braf = pd.read_csv("/home/ssm/seq_aff/code/BA_module/data/braf_bcl2/kiba_braf.tsv", sep = "\t")
kiba_braf

Unnamed: 0,Protein,ChEMBL ID of Ligand,Sequence,SMILES,KIBA score
0,P15056,CHEMBL1087421,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,11.1
1,P15056,CHEMBL1088633,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2,11.1
2,P15056,CHEMBL1967878,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1cc(Nc2ncc(F)c(NC3C4C=CC(C4)C3C(N)=O)n2)ccc1...,11.3
3,P15056,CHEMBL1999321,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=c1c(NCCc2ccc(Oc3ccccc3)cc2)c(Nc2ccncc2)c1=O,11.2
4,P15056,CHEMBL2205766,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(C)NS(=O)(=O)c1cncc(-c2ccn3nc(N)nc3c2)c1,11.1
5,P15056,CHEMBL1288582,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1ccc2c(c1)=NC(=C1NNc3cccnc31)C=2,10.622879
6,P15056,CHEMBL1957190,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)n1c2c(c3c1-c1ccccc1C3=O)C(=O)CCC2,10.555932
7,P15056,CHEMBL513846,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1nc2ccccn2c1-c1csc(Nc2ccc(O)cc2)n1,11.2
8,P15056,CHEMBL6246,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=c1oc2c(O)c(O)cc3c(=O)oc4c(O)c(O)cc1c4c23,11.854487
9,P15056,CHEMBL1087650,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN1CC2CC1CN2c1ccc(-c2ccnc3c(-c4cccc(O)c4)c(-c4...,15.520216


In [25]:
kiba_braf["Parameter"] = "KIBA score"
kiba_braf.columns = ["Protein", "ChEMBL ID of Ligand", "Sequence", "SMILES", "Value", "KIBA score"]
kiba_braf

Unnamed: 0,Protein,ChEMBL ID of Ligand,Sequence,SMILES,Value,KIBA score
0,P15056,CHEMBL1087421,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,11.1,KIBA score
1,P15056,CHEMBL1088633,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2,11.1,KIBA score
2,P15056,CHEMBL1967878,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1cc(Nc2ncc(F)c(NC3C4C=CC(C4)C3C(N)=O)n2)ccc1...,11.3,KIBA score
3,P15056,CHEMBL1999321,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=c1c(NCCc2ccc(Oc3ccccc3)cc2)c(Nc2ccncc2)c1=O,11.2,KIBA score
4,P15056,CHEMBL2205766,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(C)NS(=O)(=O)c1cncc(-c2ccn3nc(N)nc3c2)c1,11.1,KIBA score
5,P15056,CHEMBL1288582,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1ccc2c(c1)=NC(=C1NNc3cccnc31)C=2,10.622879,KIBA score
6,P15056,CHEMBL1957190,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)n1c2c(c3c1-c1ccccc1C3=O)C(=O)CCC2,10.555932,KIBA score
7,P15056,CHEMBL513846,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1nc2ccccn2c1-c1csc(Nc2ccc(O)cc2)n1,11.2,KIBA score
8,P15056,CHEMBL6246,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=c1oc2c(O)c(O)cc3c(=O)oc4c(O)c(O)cc1c4c23,11.854487,KIBA score
9,P15056,CHEMBL1087650,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN1CC2CC1CN2c1ccc(-c2ccnc3c(-c4cccc(O)c4)c(-c4...,15.520216,KIBA score


In [26]:
test_data = kiba_braf.iloc[:, [2, 3]].values
test_labels = kiba_braf.iloc[:, 4].values
print("test data", test_data.shape)
print("test labels", test_labels.shape)

test data (36, 2)
test labels (36,)


In [27]:
test_dataset = test_Dataset(test_data)
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = Mycall(Protein_voca, SMILES_voca, USE_CUDA))

#### 2.1 train davis test

In [28]:
Test_module = Test(davis_regressor, test_loader)
y_pred_davis = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_davis)

print(f"\t[Val] total samples: {len(y_pred_davis)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 36, RMSE: 5.872, PCC: -0.304, CI: 0.417, r2: -23.575


#### 2.2 train kiba test

In [29]:
Test_module = Test(kiba_regressor, test_loader)
y_pred_kiba = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_kiba)

print(f"\t[Val] total samples: {len(y_pred_kiba)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 36, RMSE: 0.240, PCC: 0.980, CI: 0.938, r2: 0.959


#### 2.3 train glass test

In [30]:
Test_module = Test(glass_regressor, test_loader)
y_pred_glass = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_glass)

print(f"\t[Val] total samples: {len(y_pred_glass)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 36, RMSE: 5.080, PCC: 0.138, CI: 0.544, r2: -17.394


#### 2.4 train bindingdb test

In [31]:
Test_module = Test(bindingdb_regressor, test_loader)
y_pred_bindingdb = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_bindingdb)

print(f"\t[Val] total samples: {len(y_pred_bindingdb)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 36, RMSE: 5.342, PCC: 0.203, CI: 0.579, r2: -19.337


#### 2.5 train merged test

In [32]:
Test_module = Test(merged_regressor, test_loader)
y_pred_merged = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_merged)

print(f"\t[Val] total samples: {len(y_pred_merged)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 36, RMSE: 5.440, PCC: 0.183, CI: 0.586, r2: -20.090


### 3. Bindingdb braf ki data load

In [33]:
bindingdb_braf_ki = pd.read_csv("/home/ssm/seq_aff/code/BA_module/data/braf_bcl2/bindingdb_braf_ki.tsv", sep = "\t")
bindingdb_braf_ki

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Ki (nM)
0,P15056,3106064,,,,,,ZINC02361683,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cn1c(=O)n(Cc2ccccc2)c(=O)c2c1nc(SCCO)n2Cc1ccccc1,106.0
1,P15056,216239,50924.0,CHEMBL1336,DB00398,5711.0,,ZINC01493878,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,38.0
2,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.16
3,P15056,44223999,,,,,,ZINC39300628,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1ccc2c(Nc3ccc(Cl)cc3)nccc2c1Nc1ncccc1-c1ncnc...,1.0
4,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.3
5,P15056,11717001,,CHEMBL525191,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,OCCn1cc(-c2ccc3c(c2)CCC3=NO)c(-c2ccncc2)n1,0.13
6,P15056,216239,50924.0,CHEMBL1336,DB00398,5711.0,,ZINC01493878,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,22.0
7,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.16
8,P15056,24180719,,,DB06999,5703.0,,ZINC39059267,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(C(=O)c2c[nH]c3ncc(Cl)cc...,2.6
9,P15056,91448975,,CHEMBL525191,,,,ZINC34640412,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=NC1CCc2cc(-c3cn(CCO)nc3-c3ccncc3)ccc21,0.17


In [34]:
bindingdb_braf_ki["Parameter"] = "Ki (nM)"
bindingdb_braf_ki.columns = ["Protein", "PubChem CID", "ChEBI ID of Ligand", "ChEMBL ID of Ligand", "DrugBank ID of Ligand", "IUPHAR_GRAC ID of Ligand", 
                             "KEGG ID of Ligand", "ZINC ID of Ligand", "Sequence", "SMILES", "Value", "Parameter"]
bindingdb_braf_ki

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Value,Parameter
0,P15056,3106064,,,,,,ZINC02361683,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cn1c(=O)n(Cc2ccccc2)c(=O)c2c1nc(SCCO)n2Cc1ccccc1,106.0,Ki (nM)
1,P15056,216239,50924.0,CHEMBL1336,DB00398,5711.0,,ZINC01493878,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,38.0,Ki (nM)
2,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.16,Ki (nM)
3,P15056,44223999,,,,,,ZINC39300628,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1ccc2c(Nc3ccc(Cl)cc3)nccc2c1Nc1ncccc1-c1ncnc...,1.0,Ki (nM)
4,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.3,Ki (nM)
5,P15056,11717001,,CHEMBL525191,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,OCCn1cc(-c2ccc3c(c2)CCC3=NO)c(-c2ccncc2)n1,0.13,Ki (nM)
6,P15056,216239,50924.0,CHEMBL1336,DB00398,5711.0,,ZINC01493878,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,22.0,Ki (nM)
7,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.16,Ki (nM)
8,P15056,24180719,,,DB06999,5703.0,,ZINC39059267,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(C(=O)c2c[nH]c3ncc(Cl)cc...,2.6,Ki (nM)
9,P15056,91448975,,CHEMBL525191,,,,ZINC34640412,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=NC1CCc2cc(-c3cn(CCO)nc3-c3ccncc3)ccc21,0.17,Ki (nM)


In [35]:
test_data = bindingdb_braf_ki.iloc[:, [8, 9]].values
test_labels = log_transform(check_afordable_range(bindingdb_braf_ki.iloc[:, 10].values))
print("test data", test_data.shape)
print("test labels", test_labels.shape)

test data (11, 2)
test labels (11,)


In [36]:
test_dataset = test_Dataset(test_data)
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = Mycall(Protein_voca, SMILES_voca, USE_CUDA))

#### 3.1 train davis test

In [37]:
Test_module = Test(davis_regressor, test_loader)
y_pred_davis = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_davis)

print(f"\t[Val] total samples: {len(y_pred_davis)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 11, RMSE: 2.564, PCC: 0.088, CI: 0.423, r2: -4.995


#### 3.2 train kiba test

In [38]:
Test_module = Test(kiba_regressor, test_loader)
y_pred_kiba = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_kiba)

print(f"\t[Val] total samples: {len(y_pred_kiba)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 11, RMSE: 3.487, PCC: -0.340, CI: 0.538, r2: -10.083


#### 3.3 train glass test

In [39]:
Test_module = Test(glass_regressor, test_loader)
y_pred_glass = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_glass)

print(f"\t[Val] total samples: {len(y_pred_glass)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 11, RMSE: 2.357, PCC: 0.536, CI: 0.596, r2: -4.064


#### 3.4 train bindingdb test

In [40]:
Test_module = Test(bindingdb_regressor, test_loader)
y_pred_bindingdb = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_bindingdb)

print(f"\t[Val] total samples: {len(y_pred_bindingdb)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 11, RMSE: 1.108, PCC: 0.887, CI: 0.808, r2: -0.120


#### 3.5 train merged test

In [41]:
Test_module = Test(merged_regressor, test_loader)
y_pred_merged = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_merged)

print(f"\t[Val] total samples: {len(y_pred_merged)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 11, RMSE: 1.847, PCC: 0.736, CI: 0.788, r2: -2.110


### 4. Bindingdb braf kd load

In [64]:
bindingdb_braf_kd = pd.read_csv("/home/ssm/seq_aff/code/BA_module/data/braf_bcl2/bindingdb_braf_kd.tsv", sep = "\t")
bindingdb_braf_kd

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Kd (nM)
0,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.30
1,P15056,9950176,,CHEMBL373011,,,,ZINC28563889,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Oc1cc(-c2[nH]c(-c3ccccc3)nc2-c2ccncc2)ccc1Cl,2.40
2,P15056,91896047,,,,,,ZINC40407755,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(COc1nccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CCC4...,0.50
3,P15056,91501740,,,,,,ZINC40918549,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(CN)c1nc(-c2ccncc2)c(-c2ccc3c(c2)CCC3N=O)...,4.90
4,P15056,73010396,,,,,,ZINC03943386,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=NC1CCc2cc(-c3[nH]c(C4CCNCC4)nc3-c3ccncc3)ccc21,1.70
...,...,...,...,...,...,...,...,...,...,...,...
327,P15056,44243260,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(Oc3cccc(NC(=O)Nc4cc(C(C)(C)C)on4)c3)n...,34.00
328,P15056,60152613,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(-n2cc(-c3cncnc3)c3nc(N(...,0.33
329,P15056,42611257,63637.0,,DB08881,,,ZINC52509366,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(C(=O)c2c[nH]c3ncc(-c4cc...,51.00
330,P15056,60152613,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(-n2cc(-c3cncnc3)c3nc(N(...,0.37


In [65]:
bindingdb_braf_kd["Parameter"] = "Kd (nM)"
bindingdb_braf_kd.columns = ["Protein", "PubChem CID", "ChEBI ID of Ligand", "ChEMBL ID of Ligand", "DrugBank ID of Ligand", "IUPHAR_GRAC ID of Ligand", 
                             "KEGG ID of Ligand", "ZINC ID of Ligand", "Sequence", "SMILES", "Value", "Parameter"]
bindingdb_braf_kd

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Value,Parameter
0,P15056,91369525,,CHEMBL200622,,,,ZINC89224159,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CCOc1ccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CC...,0.30,Kd (nM)
1,P15056,9950176,,CHEMBL373011,,,,ZINC28563889,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Oc1cc(-c2[nH]c(-c3ccccc3)nc2-c2ccncc2)ccc1Cl,2.40,Kd (nM)
2,P15056,91896047,,,,,,ZINC40407755,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(COc1nccc(-c2nc(-c3ccncc3)c(-c3ccc4c(c3)CCC4...,0.50,Kd (nM)
3,P15056,91501740,,,,,,ZINC40918549,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(CN)c1nc(-c2ccncc2)c(-c2ccc3c(c2)CCC3N=O)...,4.90,Kd (nM)
4,P15056,73010396,,,,,,ZINC03943386,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=NC1CCc2cc(-c3[nH]c(C4CCNCC4)nc3-c3ccncc3)ccc21,1.70,Kd (nM)
...,...,...,...,...,...,...,...,...,...,...,...,...
327,P15056,44243260,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,COc1cc2c(Oc3cccc(NC(=O)Nc4cc(C(C)(C)C)on4)c3)n...,34.00,Kd (nM)
328,P15056,60152613,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(-n2cc(-c3cncnc3)c3nc(N(...,0.33,Kd (nM)
329,P15056,42611257,63637.0,,DB08881,,,ZINC52509366,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(C(=O)c2c[nH]c3ncc(-c4cc...,51.00,Kd (nM)
330,P15056,60152613,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCCS(=O)(=O)Nc1ccc(F)c(-n2cc(-c3cncnc3)c3nc(N(...,0.37,Kd (nM)


In [67]:
test_data = bindingdb_braf_kd.iloc[:, [8, 9]].values
test_labels = log_transform(check_afordable_range(bindingdb_braf_kd.iloc[:, 10].values))
print("test data", test_data.shape)
print("test labels", test_labels.shape)

test data (332, 2)
test labels (332,)


In [68]:
test_dataset = test_Dataset(test_data)
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = Mycall(Protein_voca, SMILES_voca, USE_CUDA))

#### 4.1 train davis test

In [69]:
Test_module = Test(davis_regressor, test_loader)
y_pred_davis = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_davis)

print(f"\t[Val] total samples: {len(y_pred_davis)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 332, RMSE: 1.278, PCC: 0.261, CI: 0.564, r2: -0.784


#### 4.2 train kiba test

In [70]:
Test_module = Test(kiba_regressor, test_loader)
y_pred_kiba = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_kiba)

print(f"\t[Val] total samples: {len(y_pred_kiba)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 332, RMSE: 5.912, PCC: -0.016, CI: 0.539, r2: -37.174


#### 4.3 train glass test

In [71]:
Test_module = Test(glass_regressor, test_loader)
y_pred_glass = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_glass)

print(f"\t[Val] total samples: {len(y_pred_glass)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 332, RMSE: 1.165, PCC: 0.072, CI: 0.506, r2: -0.483


#### 4.4 train bindingdb test

In [72]:
Test_module = Test(bindingdb_regressor, test_loader)
y_pred_bindingdb = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_bindingdb)

print(f"\t[Val] total samples: {len(y_pred_bindingdb)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 332, RMSE: 0.552, PCC: 0.824, CI: 0.779, r2: 0.667


#### 4.5 train merged test

In [73]:
Test_module = Test(merged_regressor, test_loader)
y_pred_merged = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_merged)

print(f"\t[Val] total samples: {len(y_pred_merged)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 332, RMSE: 0.587, PCC: 0.814, CI: 0.780, r2: 0.624


### 5. BindingDB blc2 ki data load

In [74]:
bindingdb_blc2_ki = pd.read_csv("/home/ssm/seq_aff/code/BA_module/data/braf_bcl2/bindingdb_blc2_ki.tsv", sep = "\t")
bindingdb_blc2_ki

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Ki (nM)
0,P10415,11556440,,CHEMBL192571,,,,ZINC14976048,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,67.0
1,P10415,16109064,,,,,,ZINC29464760,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,COC1(Cc2ccccc2)CCN(c2ccc(C(=O)NS(=O)(=O)c3ccc(...,8.1
2,P10415,91895979,,,,,,ZINC95612739,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,35.2
3,P10415,91895980,,,,,,ZINC95612906,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,6.5
4,P10415,16109115,,,,,,ZINC95612691,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CC1OC2(CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC(CCN(C...,9.6
...,...,...,...,...,...,...,...,...,...,...,...
1615,P10415,91933575,,CHEMBL1269107,,,,ZINC64527171,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,C=c1c(-c2c(O)c3cc(O)c(O)c(=Cc4ccccc4)c3c(O)c2=...,31.0
1616,P10415,71654876,,,,,,,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(c1cc(-c2cc3c(cc2C(=O)N2Cc4ccccc4CC2CN2CCOC...,1.3
1617,P10415,3503,,CHEMBL51483,,4204.0,C07667,ZINC03775575,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,Cc1cc2c(C(C)C)c(O)c(O)c(C=O)c2c(O)c1-c1c(C)cc2...,320.0
1618,P10415,57338853,,,,,,,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CC(C)Oc1cc(C(=O)Nc2ccc(C(=O)Nc3ccc(C(=O)O)cc3O...,179.0


In [75]:
bindingdb_blc2_ki["Parameter"] = "Ki (nM)"
bindingdb_blc2_ki.columns = ["Protein", "PubChem CID", "ChEBI ID of Ligand", "ChEMBL ID of Ligand", "DrugBank ID of Ligand", "IUPHAR_GRAC ID of Ligand", 
                             "KEGG ID of Ligand", "ZINC ID of Ligand", "Sequence", "SMILES", "Value", "Parameter"]
bindingdb_blc2_ki

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Value,Parameter
0,P10415,11556440,,CHEMBL192571,,,,ZINC14976048,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,67.0,Ki (nM)
1,P10415,16109064,,,,,,ZINC29464760,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,COC1(Cc2ccccc2)CCN(c2ccc(C(=O)NS(=O)(=O)c3ccc(...,8.1,Ki (nM)
2,P10415,91895979,,,,,,ZINC95612739,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,35.2,Ki (nM)
3,P10415,91895980,,,,,,ZINC95612906,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,6.5,Ki (nM)
4,P10415,16109115,,,,,,ZINC95612691,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CC1OC2(CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC(CCN(C...,9.6,Ki (nM)
...,...,...,...,...,...,...,...,...,...,...,...,...
1615,P10415,91933575,,CHEMBL1269107,,,,ZINC64527171,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,C=c1c(-c2c(O)c3cc(O)c(O)c(=Cc4ccccc4)c3c(O)c2=...,31.0,Ki (nM)
1616,P10415,71654876,,,,,,,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(c1cc(-c2cc3c(cc2C(=O)N2Cc4ccccc4CC2CN2CCOC...,1.3,Ki (nM)
1617,P10415,3503,,CHEMBL51483,,4204.0,C07667,ZINC03775575,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,Cc1cc2c(C(C)C)c(O)c(O)c(C=O)c2c(O)c1-c1c(C)cc2...,320.0,Ki (nM)
1618,P10415,57338853,,,,,,,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CC(C)Oc1cc(C(=O)Nc2ccc(C(=O)Nc3ccc(C(=O)O)cc3O...,179.0,Ki (nM)


In [76]:
test_data = bindingdb_blc2_ki.iloc[:, [8, 9]].values
test_labels = log_transform(check_afordable_range(bindingdb_blc2_ki.iloc[:, 10].values))
print("test data", test_data.shape)
print("test labels", test_labels.shape)

test data (1620, 2)
test labels (1620,)


In [77]:
test_dataset = test_Dataset(test_data)
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = Mycall(Protein_voca, SMILES_voca, USE_CUDA))

#### 5.1 train davis test

In [78]:
Test_module = Test(davis_regressor, test_loader)
y_pred_davis = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_davis)

print(f"\t[Val] total samples: {len(y_pred_davis)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 1620, RMSE: 2.555, PCC: 0.206, CI: 0.537, r2: -0.830


#### 5.2 train kiba test

In [79]:
Test_module = Test(kiba_regressor, test_loader)
y_pred_kiba = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_kiba)

print(f"\t[Val] total samples: {len(y_pred_kiba)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 1620, RMSE: 4.057, PCC: -0.142, CI: 0.457, r2: -3.615


#### 5.3 train galss test

In [80]:
Test_module = Test(glass_regressor, test_loader)
y_pred_glass = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_glass)

print(f"\t[Val] total samples: {len(y_pred_glass)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 1620, RMSE: 2.492, PCC: 0.153, CI: 0.510, r2: -0.742


#### 5.4 train bindingdb test

In [81]:
Test_module = Test(bindingdb_regressor, test_loader)
y_pred_bindingdb = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_bindingdb)

print(f"\t[Val] total samples: {len(y_pred_bindingdb)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 1620, RMSE: 0.575, PCC: 0.960, CI: 0.891, r2: 0.907


#### 5.5 train merged test

In [82]:
Test_module = Test(merged_regressor, test_loader)
y_pred_merged = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_merged)

print(f"\t[Val] total samples: {len(y_pred_merged)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 1620, RMSE: 0.480, PCC: 0.968, CI: 0.898, r2: 0.935


### 6. BindingDB bcl2 kd data load

In [83]:
bindingdb_blc2_kd = pd.read_csv("/home/ssm/seq_aff/code/BA_module/data/braf_bcl2/bindingdb_blc2_kd.tsv", sep = "\t")
bindingdb_blc2_kd

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Kd (nM)
0,P10415,46836567,,CHEMBL3287293,,,,ZINC28014085,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(O)c1csc(N2CCc3cccc(C(=O)Nc4nc5ccccc5s4)c3C...,9200.0
1,P10415,46836568,,CHEMBL3287301,,,,ZINC28010479,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(Nc1nc2ccccc2s1)c1cccc2c1CN(c1nc(C(=O)O)c(C...,4400.0
2,P10415,52946911,,CHEMBL1270616,,,,ZINC64512357,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCCC(O)(c1ccc(Cl)cc1)c1ccc(Cl)cc1,20000.0
3,P10415,52946912,,CHEMBL1270617,,,,ZINC64513836,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCCC(O)(c1ccccc1)c1ccc(Cl)cc1,80000.0
4,P10415,845175,,CHEMBL1269479,,,,ZINC04785567,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCCC(O)(c1ccccc1)c1ccccc1,200000.0
5,P10415,2710,,CHEMBL22150,DB08936,,,ZINC19362737,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1,60000.0
6,P10415,52946715,,CHEMBL1269480,,,,ZINC64539135,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,NCC(O)(c1ccccc1)c1ccc(F)cc1Cl,250000.0
7,P10415,2782689,,CHEMBL106708,DB07108,,,ZINC02382451,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(O)c1ccc(-c2ccc(F)cc2)cc1,400000.0
8,P10415,2782689,,CHEMBL106708,DB07108,,,ZINC02382451,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(O)c1ccc(-c2ccc(F)cc2)cc1,430000.0
9,P10415,20679058,,CHEMBL1269503,,,,ZINC64512768,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CCc1cccc(-c2ccc(C(=O)O)cc2)c1,100000.0


In [84]:
bindingdb_blc2_kd["Parameter"] = "Kd (nM)"
bindingdb_blc2_kd.columns = ["Protein", "PubChem CID", "ChEBI ID of Ligand", "ChEMBL ID of Ligand", "DrugBank ID of Ligand", "IUPHAR_GRAC ID of Ligand", 
                             "KEGG ID of Ligand", "ZINC ID of Ligand", "Sequence", "SMILES", "Value", "Parameter"]
bindingdb_blc2_kd

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Value,Parameter
0,P10415,46836567,,CHEMBL3287293,,,,ZINC28014085,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(O)c1csc(N2CCc3cccc(C(=O)Nc4nc5ccccc5s4)c3C...,9200.0,Kd (nM)
1,P10415,46836568,,CHEMBL3287301,,,,ZINC28010479,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(Nc1nc2ccccc2s1)c1cccc2c1CN(c1nc(C(=O)O)c(C...,4400.0,Kd (nM)
2,P10415,52946911,,CHEMBL1270616,,,,ZINC64512357,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCCC(O)(c1ccc(Cl)cc1)c1ccc(Cl)cc1,20000.0,Kd (nM)
3,P10415,52946912,,CHEMBL1270617,,,,ZINC64513836,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCCC(O)(c1ccccc1)c1ccc(Cl)cc1,80000.0,Kd (nM)
4,P10415,845175,,CHEMBL1269479,,,,ZINC04785567,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCCC(O)(c1ccccc1)c1ccccc1,200000.0,Kd (nM)
5,P10415,2710,,CHEMBL22150,DB08936,,,ZINC19362737,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1,60000.0,Kd (nM)
6,P10415,52946715,,CHEMBL1269480,,,,ZINC64539135,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,NCC(O)(c1ccccc1)c1ccc(F)cc1Cl,250000.0,Kd (nM)
7,P10415,2782689,,CHEMBL106708,DB07108,,,ZINC02382451,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(O)c1ccc(-c2ccc(F)cc2)cc1,400000.0,Kd (nM)
8,P10415,2782689,,CHEMBL106708,DB07108,,,ZINC02382451,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,O=C(O)c1ccc(-c2ccc(F)cc2)cc1,430000.0,Kd (nM)
9,P10415,20679058,,CHEMBL1269503,,,,ZINC64512768,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CCc1cccc(-c2ccc(C(=O)O)cc2)c1,100000.0,Kd (nM)


In [85]:
test_data = bindingdb_blc2_kd.iloc[:, [8, 9]].values
test_labels = log_transform(check_afordable_range(bindingdb_blc2_kd.iloc[:, 10].values))
print("test data", test_data.shape)
print("test labels", test_labels.shape)

test data (26, 2)
test labels (26,)


In [86]:
test_dataset = test_Dataset(test_data)
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = Mycall(Protein_voca, SMILES_voca, USE_CUDA))

#### 6.1 train davis test

In [87]:
Test_module = Test(davis_regressor, test_loader)
y_pred_davis = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_davis)

print(f"\t[Val] total samples: {len(y_pred_davis)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 26, RMSE: 2.240, PCC: -0.392, CI: 0.389, r2: -0.323


#### 6.2 train kiba test

In [88]:
Test_module = Test(kiba_regressor, test_loader)
y_pred_kiba = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_kiba)

print(f"\t[Val] total samples: {len(y_pred_kiba)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 26, RMSE: 6.553, PCC: 0.277, CI: 0.627, r2: -10.323


#### 6.3 train glass test

In [89]:
Test_module = Test(glass_regressor, test_loader)
y_pred_glass = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_glass)

print(f"\t[Val] total samples: {len(y_pred_glass)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 26, RMSE: 1.413, PCC: 0.761, CI: 0.772, r2: 0.474


#### 6.4 train bindingdb test

In [90]:
Test_module = Test(bindingdb_regressor, test_loader)
y_pred_bindingdb = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_bindingdb)

print(f"\t[Val] total samples: {len(y_pred_bindingdb)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 26, RMSE: 0.487, PCC: 0.970, CI: 0.907, r2: 0.937


#### 6.5 train merged test

In [91]:
Test_module = Test(merged_regressor, test_loader)
y_pred_merged = Test_module.predict()

RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred_merged)

print(f"\t[Val] total samples: {len(y_pred_merged)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}")

	[Val] total samples: 26, RMSE: 0.641, PCC: 0.946, CI: 0.929, r2: 0.892


### 2. Merged BRAF

In [34]:
merged_braf = pd.concat([davis_braf, bindingdb_braf_ki, bindingdb_braf_kd], ignore_index=True)
merged_braf = merged_braf[["Protein", "PubChem CID", "ChEBI ID of Ligand", "ChEMBL ID of Ligand", "DrugBank ID of Ligand", "IUPHAR_GRAC ID of Ligand", 
                             "KEGG ID of Ligand", "ZINC ID of Ligand", "Sequence", "SMILES", "Value", "Parameter"]]

merged_braf

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Value,Parameter
0,BRAF,11314340.0,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,10000.0,Kd (nM)
1,BRAF,24889392.0,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCC...,10000.0,Kd (nM)
2,BRAF,11409972.0,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)...,1700.0,Kd (nM)
3,BRAF,11338033.0,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,O=C(NC1CCNCC1)c1[nH]ncc1NC(=O)c1c(Cl)cccc1Cl,10000.0,Kd (nM)
4,BRAF,10184653.0,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,10000.0,Kd (nM)
...,...,...,...,...,...,...,...,...,...,...,...,...
452,P15056,,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCC(C)C(NC(=O)C(CC(N)=O)NC(=O)C(NC(=O)C(Cc1cnc...,5750.0,Kd (nM)
453,P15056,,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCC(C)C(NC(=O)C(CC(N)=O)NC(=O)C(NC(=O)C(Cc1cnc...,550.0,Kd (nM)
454,P15056,,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CCC(C)C(NC(=O)C(CC(C)C)NC(=O)C(NC(=O)C(CC(N)=O...,4100.0,Kd (nM)
455,P15056,,,,,,,,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,CSCCC1NC(=O)C(Cc2ccccc2)NC(=O)C(CC(C)C)NC(=O)C...,310.0,Kd (nM)


In [35]:
merged_braf_groups = merged_braf.groupby(["Sequence", "SMILES"])
print(f"\t> groupby: {len(merged_braf_groups.groups)}")

	> groupby: 360


### 3. Merged BCL2

In [19]:
merged_bcl2 = pd.concat([bindingdb_blc2_ki, bindingdb_blc2_kd], ignore_index=True)
merged_bcl2

Unnamed: 0,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand,Sequence,SMILES,Value,Parameter
0,P10415,11556440.0,,CHEMBL192571,,,,ZINC14976048,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,67.0,Ki (nM)
1,P10415,16109064.0,,,,,,ZINC29464760,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,COC1(Cc2ccccc2)CCN(c2ccc(C(=O)NS(=O)(=O)c3ccc(...,8.1,Ki (nM)
2,P10415,91895979.0,,,,,,ZINC95612739,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,35.2,Ki (nM)
3,P10415,91895980.0,,,,,,ZINC95612906,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,6.5,Ki (nM)
4,P10415,16109115.0,,,,,,ZINC95612691,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CC1OC2(CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC(CCN(C...,9.6,Ki (nM)
...,...,...,...,...,...,...,...,...,...,...,...,...
1719,P10415,78099415.0,,CHEMBL2322027,,,,,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)N[C+]2[N-]...,14.0,Kd (nM)
1720,P10415,71527840.0,,CHEMBL2322026,,,,ZINC95584296,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)Nc2nnc3cc(...,8.0,Kd (nM)
1721,P10415,11228183.0,,CHEMBL376408,,,,ZINC94303099,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CN(C)CCC(CSc1ccccc1)Nc1ccc(S(=O)(=O)NC(=O)c2cc...,0.6,Kd (nM)
1722,P10415,127045927.0,,,,,,,MAHAGRTGYDNREIVMKYIHYKLSQRGYEWDAGDVGAAPPGAAPAP...,CC(C)Oc1ccc(C(=O)Nc2c(C(=O)O)nn(-c3ccc(Oc4cccc...,290.0,Kd (nM)


In [24]:
merged_bcl2_groups = merged_bcl2.groupby(["Sequence", "SMILES"])
print(f"\t> groupby: {len(merged_bcl2_groups.groups)}")

for i in merged_bcl2_groups.groups:
    
    index = merged_bcl2_groups.groups[i]
    
    protein = i[0]
    compound = i[1]
    
    values = merged_bcl2.iloc[index, 10].values
    print(values)
    try:
        np.mean(values)
    except:
        print(values)

	> groupby: 1113
[4.53 4.53]
[1.157 1.16 ]
[3.484 3.484]
[5400.]
[4200.]
[3100.]
[0.04 0.04]
[192000.]
[260000.]
[250.  35.]
[170.]
[1530.]
[4470.]
[1450.]
[2080.]
[1730.]
[230.]
[190.  31.]
[21200.]
[176.183]
[743.178]
[9490.]
[3780.]
[830.]
[3390.]
[9.567 9.567]
[511.56 511.56]
[5.603 5.603]
[3.283 3.283]
[5.578 5.578]
[177.064]
[1820.]
[2920.]
[70000.]
[115000.]
[830.]
[670.]
[510.]
[930.]
[680.]
[930.]
[2880.]
[146.59 146.59]
[580.]
[620.]
[5200.]
[800.]
[700.]
[1500.]
[5.626 5.626]
[40.]
[110.]
[16100.]
[1200.]
[150.]
[470.]
[170.]
[1460.]
[ 2400. 43400.]
[1950.]
[57800.]
[7430.]
[920.]
[1400.]
[0.307]
[0.064]
[0.084]
[0.046 0.046]
[0.027 0.027]
[0.139 0.139]
[0.035 0.035]
[179.]
[5250.]
[932.]
[7000.]
[140. 290.]
[590.]
[3100.]
[24100. 31900.]
[4.551 4.551]
[0.391 0.391]
[385.]
[1113.]
[252.]
[24.]
[56.]
[290.]
[0.105 0.105]
[49000.]
[460.]
[2.]
[0.037 0.037]
[0.039 0.039]
[0.036 0.036]
[0.127 0.127]
[0.077 0.077]
[0.439 0.439]
[0.012 0.012]
[0.081 0.081]
[0.017 0.017]
[0.083 0.0

[90.]
[100.]
[2850.]
[14200. 14200.]
[21200.]
[4300.]
[1700.]
[4300.]
[21000.]
[320000.]
[6110.]
[8820.]
[23830.  2900.]
[7560.]
[7850.]
[23840.]
[54650.]
[960.]
[2700.]
[1100.]
[4400.]
[3700.]
[7300.]
[9900.]
[11700.]
[16000.]
[5700.]
[3100.]
[11700.]
[5600.]
[3600.]
[400000. 430000.]
[200.]
[220.]
[610.]
[4300.]
[9600.]
[3100.]
[9000.]
[9200.]
[4300.]
[960.]
[9700.]
[335. 235.]
[400. 230.]
[243000.]
[161000.]
[1.3]
[474760.]
[0.428 0.428]
[132.]
[87.]
[71.]
[1700.]
[ 583. 1325.]
[1150.]
[420.]
[13000.]
[9282.]
[690.]
[286.]
[959.]
[4600.]


In [12]:
tmp_ki = merged_bcl2.iloc[:, 10].values
tmp_kd = merged_bcl2.iloc[:, 11].values

for i, j in zip(tmp_ki, tmp_kd):
    print(i, j)

67.0 nan
8.1 nan
35.2 nan
6.5 nan
9.6 nan
3.4 nan
39.8 nan
61.9 nan
6.1 nan
300.0 nan
56.0 nan
16.0 nan
20.0 nan
6.6 nan
1.8 nan
2.7 nan
3.1 nan
1.4 nan
1.6 nan
1.5 nan
2.5 nan
0.7 nan
2.1 nan
2.0 nan
9.4 nan
2.0 nan
13.0 nan
46.0 nan
68.0 nan
4.5 nan
3.2 nan
4.6 nan
3.5 nan
1.0 nan
16.9 nan
0.03 nan
0.526 nan
28.343000000000004 nan
0.174 nan
11.834000000000001 nan
8.354 nan
31.467 nan
0.8270000000000001 nan
2.474 nan
0.746 nan
0.787 nan
2.592 nan
3.451 nan
0.754 nan
0.72 nan
0.171 nan
0.331 nan
1.621 nan
0.079 nan
0.586 nan
3.0389999999999997 nan
5.577999999999999 nan
2.487 nan
1.679 nan
3.965 nan
14.054 nan
5.455 nan
8.27 nan
14.984000000000002 nan
1.501 nan
0.511 nan
2.2119999999999997 nan
1.3259999999999998 nan
0.903 nan
0.071 nan
2.0069999999999997 nan
1.584 nan
7.172999999999999 nan
0.049 nan
0.022000000000000002 nan
0.07 nan
0.005 nan
0.013000000000000001 nan
0.019 nan
0.019 nan
0.027000000000000003 nan
0.03 nan
0.034 nan
0.036000000000000004 nan
0.047 nan
0.047 nan
0.05 nan
0.0

In [11]:
merged_bcl2_groups = merged_bcl2.groupby(["Sequence", "SMILES"])
print(f"\t> groupby: {len(merged_bcl2_groups.groups)}")

	> groupby: 1113
