In [2]:
import pandas as pd
import numpy as np
import pickle
import torch
from module.RNN import *
from module.DNN import *
from module.helpers import *

In [3]:
def split_data(data, frac, seed):
    """Randomly split ``data`` into train, validation and test frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split. Its index may contain duplicate labels; rows are
        tracked by position, so every row lands in exactly one subset.
    frac : sequence of three floats
        ``(train_frac, val_frac, test_frac)``. Only ``val_frac`` and
        ``test_frac`` drive the sampling; the training set receives all
        remaining rows, so ``train_frac`` itself is not used.
    seed : int
        ``random_state`` passed to both sampling calls, which makes the
        split reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, each with a fresh ``0..n-1`` index.
    """
    # Unpacking also enforces that exactly three fractions were given.
    _, val_frac, test_frac = frac

    # Work on a unique positional index: filtering with `index.isin(...)` on
    # duplicated labels would drop every row that shares a sampled label.
    data = data.reset_index(drop = True)

    test = data.sample(frac = test_frac, replace = False, random_state = seed)
    train_val = data[~data.index.isin(test.index)]

    # Rescale the validation fraction to the rows left once the test set is
    # removed. If the test set takes everything there is nothing to sample.
    val_share = val_frac / (1 - test_frac) if test_frac < 1 else 0
    val = train_val.sample(frac = val_share, replace = False, random_state = seed)
    train = train_val[~train_val.index.isin(val.index)]

    return train.reset_index(drop = True), val.reset_index(drop = True), test.reset_index(drop = True)

### 1. Davis dataset

#### 1.1 Data load

In [4]:
# Load the Davis interaction table (tab-separated), then show its shape and first rows.
interactions_file_path = "./data/davis_data.tsv"
interactions_data = pd.read_table(interactions_file_path)
print(interactions_data.shape)
interactions_data.head()

(30056, 5)


Unnamed: 0,Sequence,SMILES,pKa,Protein,Compound
0,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,Cc1[nH]nc2ccc(-c3cncc(OCC(N)Cc4ccccc4)c3)cc12,7.37,AAK1,11314340
1,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCC...,5.0,AAK1,24889392
2,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)...,5.0,AAK1,11409972
3,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,O=C(NC1CCNCC1)c1[nH]ncc1NC(=O)c1c(Cl)cccc1Cl,5.0,AAK1,11338033
4,MKKFFDSRREQGGSGLGSGSSGGGGSTSGLGSGYIGRVFGIGRQQV...,CN(C)CC=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc...,5.0,AAK1,10184653


In [5]:
# 70/10/20 train/validation/test split; the fixed seed keeps it reproducible across runs.
split_fractions = [0.7, 0.1, 0.2]
train_data, val_data, test_data = split_data(interactions_data, frac = split_fractions, seed = 0)

#### 1.2 Model load

In [16]:
USE_CUDA = True
GPU_NUM = 2  # index of the GPU to run on

# Fall back to the CPU when CUDA is requested but not available. This keeps
# `device` defined for the cells below in every case, and avoids handing a
# CPU device to `torch.cuda.set_device`, which raises.
USE_CUDA = USE_CUDA and torch.cuda.is_available()
device = torch.device(f'cuda:{GPU_NUM}' if USE_CUDA else 'cpu')

if USE_CUDA:
    torch.cuda.set_device(device)
    print ('Current cuda device ', torch.cuda.current_device())
else:
    print ('Current device ', device)

Current cuda device  2


In [17]:
# Checkpoint file evaluated in this section (the Davis model, going by its file name).
model_path = "./model/train_davis.pth"

In [18]:
# Build the regressor from the checkpoint at `model_path`. `load_checkpoint_eval`
# comes from one of the `module.*` star imports.
# NOTE(review): presumably it returns the model in eval mode on `device` — confirm in the helper.
regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

In [19]:
# Vocabularies for protein sequences and SMILES strings, later handed to the collate function.
# NOTE: pickle.load can execute arbitrary code; only open vocabulary files from a trusted source.
with open("./model/Sequence_voca.pkl", "rb") as sequence_voca_file:
    Protein_voca = pickle.load(sequence_voca_file)

with open("./model/SMILES_voca.pkl", "rb") as smiles_voca_file:
    SMILES_voca = pickle.load(smiles_voca_file)

#### 1.3 Run validation

In [20]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
val_inputs = val_data.iloc[:, [0, 1]].values
val_labels = val_data.iloc[:, 2].values

val_dataset = test_Dataset(val_inputs)
val_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with val_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
val_loader = DataLoader(dataset = val_dataset, batch_size = 400, collate_fn = val_collate)

In [21]:
# Run the regressor over the validation loader through the project's `Test` helper.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, val_loader)
y_pred = Test_module.predict()

In [22]:
# Score the validation predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2.
RMSE, PCC, CI, r2 = get_regression_result(val_labels, y_pred)

In [23]:
# One-line validation summary, metrics rounded to three decimals.
val_report = f"\t[Val] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(val_report)

	[Val] total samples: 3006, RMSE: 0.501, PCC: 0.832, CI: 0.880, r2: 0.687


#### 1.4 Run test

In [13]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
test_inputs = test_data.iloc[:, [0, 1]].values
test_labels = test_data.iloc[:, 2].values

test_dataset = test_Dataset(test_inputs)
test_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with test_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = test_collate)

In [14]:
# Run the regressor over the test loader through the project's `Test` helper.
# This rebinds `Test_module` and `y_pred` from the validation run.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, test_loader)
y_pred = Test_module.predict()

In [15]:
# Score the test predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2 (overwriting the validation metrics).
RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred)

In [16]:
# One-line test summary, metrics rounded to three decimals.
test_report = f"\t[Test] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(test_report)

	[Test] total samples: 6011, RMSE: 0.509, PCC: 0.824, CI: 0.877, r2: 0.669


### 2. KIBA dataset

#### 2.1 Data load

In [17]:
# Load the KIBA interaction table (tab-separated), then show its shape and first rows.
interactions_file_path = "./data/kiba_data.tsv"
interactions_data = pd.read_table(interactions_file_path)
print(interactions_data.shape)
interactions_data.head()

(118254, 5)


Unnamed: 0,Sequence,SMILES,pKa,Protein,Compound
0,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,11.1,O00141,CHEMBL1087421
1,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2,11.1,O00141,CHEMBL1088633
2,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,12.1,O00141,CHEMBL1090360
3,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,Nc1nccc(-c2ccc3c(N)n[nH]c3c2)n1,11.1,O00141,CHEMBL1688215
4,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,CNc1cncc(-c2c[nH]c(=O)c(NC(=O)c3ccc(N4CCCC4CN4...,12.1,O00141,CHEMBL1765781


In [18]:
# 70/10/20 train/validation/test split; the fixed seed keeps it reproducible across runs.
split_fractions = [0.7, 0.1, 0.2]
train_data, val_data, test_data = split_data(interactions_data, frac = split_fractions, seed = 0)

#### 2.2 Model load

In [19]:
USE_CUDA = True
GPU_NUM = 2  # index of the GPU to run on

# Fall back to the CPU when CUDA is requested but not available. This keeps
# `device` defined for the cells below in every case, and avoids handing a
# CPU device to `torch.cuda.set_device`, which raises.
USE_CUDA = USE_CUDA and torch.cuda.is_available()
device = torch.device(f'cuda:{GPU_NUM}' if USE_CUDA else 'cpu')

if USE_CUDA:
    torch.cuda.set_device(device)
    print ('Current cuda device ', torch.cuda.current_device())
else:
    print ('Current device ', device)

Current cuda device  2


In [20]:
# Checkpoint file evaluated in this section (the KIBA model, going by its file name).
model_path = "./model/train_kiba.pth"

In [21]:
# Build the regressor from the checkpoint at `model_path`. `load_checkpoint_eval`
# comes from one of the `module.*` star imports.
# NOTE(review): presumably it returns the model in eval mode on `device` — confirm in the helper.
regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

In [22]:
# Vocabularies for protein sequences and SMILES strings, later handed to the collate function.
# NOTE: pickle.load can execute arbitrary code; only open vocabulary files from a trusted source.
with open("./model/Sequence_voca.pkl", "rb") as sequence_voca_file:
    Protein_voca = pickle.load(sequence_voca_file)

with open("./model/SMILES_voca.pkl", "rb") as smiles_voca_file:
    SMILES_voca = pickle.load(smiles_voca_file)

#### 2.3 Run validation

In [23]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
val_inputs = val_data.iloc[:, [0, 1]].values
val_labels = val_data.iloc[:, 2].values

val_dataset = test_Dataset(val_inputs)
val_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with val_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
val_loader = DataLoader(dataset = val_dataset, batch_size = 400, collate_fn = val_collate)

In [24]:
# Run the regressor over the validation loader through the project's `Test` helper.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, val_loader)
y_pred = Test_module.predict()

In [25]:
# Score the validation predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2.
RMSE, PCC, CI, r2 = get_regression_result(val_labels, y_pred)

In [26]:
# One-line validation summary, metrics rounded to three decimals.
val_report = f"\t[Val] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(val_report)

	[Val] total samples: 11825, RMSE: 0.413, PCC: 0.873, CI: 0.880, r2: 0.761


#### 2.4 Run test

In [27]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
test_inputs = test_data.iloc[:, [0, 1]].values
test_labels = test_data.iloc[:, 2].values

test_dataset = test_Dataset(test_inputs)
test_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with test_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = test_collate)

In [28]:
# Run the regressor over the test loader through the project's `Test` helper.
# This rebinds `Test_module` and `y_pred` from the validation run.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, test_loader)
y_pred = Test_module.predict()

In [29]:
# Score the test predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2 (overwriting the validation metrics).
RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred)

In [30]:
# One-line test summary, metrics rounded to three decimals.
test_report = f"\t[Test] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(test_report)

	[Test] total samples: 23651, RMSE: 0.406, PCC: 0.873, CI: 0.879, r2: 0.762


### 3. GLASS dataset

#### 3.1 Data load

In [41]:
# Load the GLASS interaction table (tab-separated), then show its shape and first rows.
interactions_file_path = "./data/glass_data.tsv"
interactions_data = pd.read_table(interactions_file_path)
print(interactions_data.shape)
interactions_data.head()

(140011, 5)


Unnamed: 0,Sequence,SMILES,pKa,Protein,Compound
0,AIAAVITFLILFTIFGNALVILAVLTSRSLRAPQNLFLVSLAAADI...,Brc1c(NC2=NCCN2)ccc2nccnc12,7.66,O77715,2435
1,AIAAVITFLILFTIFGNALVILAVLTSRSLRAPQNLFLVSLAAADI...,CC(C)=CCOc1ccc(Cl)c2c1CCN(C)CC2,7.31,O77715,122295
2,AIAAVITFLILFTIFGNALVILAVLTSRSLRAPQNLFLVSLAAADI...,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,7.5,O77715,2726
3,AIAAVITFLILFTIFGNALVILAVLTSRSLRAPQNLFLVSLAAADI...,CN1CC(Cn2ccnc2-c2ccccc2)CC2c3cccc4[nH]c(Br)c(c...,9.37,O77715,10344262
4,AIAAVITFLILFTIFGNALVILAVLTSRSLRAPQNLFLVSLAAADI...,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,9.74,O77715,8969


In [42]:
# 70/10/20 train/validation/test split; the fixed seed keeps it reproducible across runs.
split_fractions = [0.7, 0.1, 0.2]
train_data, val_data, test_data = split_data(interactions_data, frac = split_fractions, seed = 0)

#### 3.2 Model load

In [43]:
USE_CUDA = True
GPU_NUM = 2  # index of the GPU to run on

# Fall back to the CPU when CUDA is requested but not available. This keeps
# `device` defined for the cells below in every case, and avoids handing a
# CPU device to `torch.cuda.set_device`, which raises.
USE_CUDA = USE_CUDA and torch.cuda.is_available()
device = torch.device(f'cuda:{GPU_NUM}' if USE_CUDA else 'cpu')

if USE_CUDA:
    torch.cuda.set_device(device)
    print ('Current cuda device ', torch.cuda.current_device())
else:
    print ('Current device ', device)

Current cuda device  2


In [44]:
# Checkpoint file evaluated in this section (the GLASS model, going by its file name).
model_path = "./model/train_glass.pth"

In [45]:
# Build the regressor from the checkpoint at `model_path`. `load_checkpoint_eval`
# comes from one of the `module.*` star imports.
# NOTE(review): presumably it returns the model in eval mode on `device` — confirm in the helper.
regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

In [46]:
# Vocabularies for protein sequences and SMILES strings, later handed to the collate function.
# NOTE: pickle.load can execute arbitrary code; only open vocabulary files from a trusted source.
with open("./model/Sequence_voca.pkl", "rb") as sequence_voca_file:
    Protein_voca = pickle.load(sequence_voca_file)

with open("./model/SMILES_voca.pkl", "rb") as smiles_voca_file:
    SMILES_voca = pickle.load(smiles_voca_file)

#### 3.3 Run validation

In [47]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
val_inputs = val_data.iloc[:, [0, 1]].values
val_labels = val_data.iloc[:, 2].values

val_dataset = test_Dataset(val_inputs)
val_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with val_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
val_loader = DataLoader(dataset = val_dataset, batch_size = 400, collate_fn = val_collate)

In [48]:
# Run the regressor over the validation loader through the project's `Test` helper.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, val_loader)
y_pred = Test_module.predict()

In [49]:
# Score the validation predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2.
RMSE, PCC, CI, r2 = get_regression_result(val_labels, y_pred)

In [50]:
# One-line validation summary, metrics rounded to three decimals.
val_report = f"\t[Val] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(val_report)

	[Val] total samples: 14001, RMSE: 0.769, PCC: 0.807, CI: 0.807, r2: 0.649


#### 3.4 Run test

In [51]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
test_inputs = test_data.iloc[:, [0, 1]].values
test_labels = test_data.iloc[:, 2].values

test_dataset = test_Dataset(test_inputs)
test_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with test_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = test_collate)

In [52]:
# Run the regressor over the test loader through the project's `Test` helper.
# This rebinds `Test_module` and `y_pred` from the validation run.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, test_loader)
y_pred = Test_module.predict()

In [53]:
# Score the test predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2 (overwriting the validation metrics).
RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred)

In [54]:
# One-line test summary, metrics rounded to three decimals.
test_report = f"\t[Test] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(test_report)

	[Test] total samples: 28002, RMSE: 0.779, PCC: 0.805, CI: 0.806, r2: 0.645


### 4. BindingDB dataset

#### 4.1 Data load

In [55]:
# Load the BindingDB interaction table (tab-separated), then show its shape and first rows.
interactions_file_path = "./data/bindingdb_data.tsv"
interactions_data = pd.read_table(interactions_file_path)
print(interactions_data.shape)
interactions_data.head()

(281215, 11)


Unnamed: 0,Sequence,SMILES,pKa,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand
0,AASGVATNTPTANDEEYITPVTIGGTTLNLNFDTGSADLWVFSTEL...,CCOC(=O)CP(=O)(O)C(CC(C)C)NC(=O)C(NC(=O)C(NC(=...,4.7,P00798,44461503,,CHEMBL82364,,,,ZINC26490045
1,AASGVATNTPTANDEEYITPVTIGGTTLNLNFDTGSADLWVFSTEL...,CNC(=O)C(F)(F)C(O)(O)C(CC(C)C)NC(=O)C(NC(=O)C(...,6.0,P00798,44461580,,CHEMBL309445,,,,ZINC26498109
2,AASGVATNTPTANDEEYITPVTIGGTTLNLNFDTGSADLWVFSTEL...,CNC(=O)C(F)(F)C(O)C(CC(C)C)NC(=O)C(NC(=O)C(NC(...,5.0,P00798,44461589,,CHEMBL313988,,,,ZINC36177642
3,AASGVATNTPTANDEEYITPVTIGGTTLNLNFDTGSADLWVFSTEL...,COC(=O)C(Cc1ccccc1)OP(=O)(O)C(CC(C)C)NC(=O)C(C...,4.38,P00798,44461743,,CHEMBL309930,,,,
4,AASGVATNTPTANDEEYITPVTIGGTTLNLNFDTGSADLWVFSTEL...,COC(=O)C(Cc1ccccc1)OP(=O)(O)C(CC(C)C)NC(=O)C(N...,5.55,P00798,44461788,,CHEMBL314391,,,,ZINC26501082


In [56]:
# 70/10/20 train/validation/test split; the fixed seed keeps it reproducible across runs.
split_fractions = [0.7, 0.1, 0.2]
train_data, val_data, test_data = split_data(interactions_data, frac = split_fractions, seed = 0)

#### 4.2 Model load

In [57]:
USE_CUDA = True
GPU_NUM = 2  # index of the GPU to run on

# Fall back to the CPU when CUDA is requested but not available. This keeps
# `device` defined for the cells below in every case, and avoids handing a
# CPU device to `torch.cuda.set_device`, which raises.
USE_CUDA = USE_CUDA and torch.cuda.is_available()
device = torch.device(f'cuda:{GPU_NUM}' if USE_CUDA else 'cpu')

if USE_CUDA:
    torch.cuda.set_device(device)
    print ('Current cuda device ', torch.cuda.current_device())
else:
    print ('Current device ', device)

Current cuda device  2


In [58]:
# Checkpoint file evaluated in this section (the BindingDB model, going by its file name).
model_path = "./model/train_bindingdb.pth"

In [59]:
# Build the regressor from the checkpoint at `model_path`. `load_checkpoint_eval`
# comes from one of the `module.*` star imports.
# NOTE(review): presumably it returns the model in eval mode on `device` — confirm in the helper.
regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

In [60]:
# Vocabularies for protein sequences and SMILES strings, later handed to the collate function.
# NOTE: pickle.load can execute arbitrary code; only open vocabulary files from a trusted source.
with open("./model/Sequence_voca.pkl", "rb") as sequence_voca_file:
    Protein_voca = pickle.load(sequence_voca_file)

with open("./model/SMILES_voca.pkl", "rb") as smiles_voca_file:
    SMILES_voca = pickle.load(smiles_voca_file)

#### 4.3 Run validation

In [61]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
val_inputs = val_data.iloc[:, [0, 1]].values
val_labels = val_data.iloc[:, 2].values

val_dataset = test_Dataset(val_inputs)
val_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with val_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
val_loader = DataLoader(dataset = val_dataset, batch_size = 400, collate_fn = val_collate)

In [62]:
# Run the regressor over the validation loader through the project's `Test` helper.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, val_loader)
y_pred = Test_module.predict()

In [63]:
# Score the validation predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2.
RMSE, PCC, CI, r2 = get_regression_result(val_labels, y_pred)

In [64]:
# One-line validation summary, metrics rounded to three decimals.
val_report = f"\t[Val] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(val_report)

	[Val] total samples: 28122, RMSE: 0.830, PCC: 0.829, CI: 0.816, r2: 0.685


#### 4.4 Run test

In [65]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
test_inputs = test_data.iloc[:, [0, 1]].values
test_labels = test_data.iloc[:, 2].values

test_dataset = test_Dataset(test_inputs)
test_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with test_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = test_collate)

In [66]:
# Run the regressor over the test loader through the project's `Test` helper.
# This rebinds `Test_module` and `y_pred` from the validation run.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, test_loader)
y_pred = Test_module.predict()

In [67]:
# Score the test predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2 (overwriting the validation metrics).
RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred)

In [68]:
# One-line test summary, metrics rounded to three decimals.
test_report = f"\t[Test] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(test_report)

	[Test] total samples: 56243, RMSE: 0.827, PCC: 0.828, CI: 0.816, r2: 0.684


### 5. Merged dataset

#### 5.1 Data load

In [3]:
# Load the merged interaction table (tab-separated), then show its shape and first rows.
interactions_file_path = "./data/merged_data.tsv"
interactions_data = pd.read_table(interactions_file_path)
print(interactions_data.shape)
interactions_data.head()

(331187, 11)


Unnamed: 0,Sequence,SMILES,pKa,Protein,PubChem CID,ChEBI ID of Ligand,ChEMBL ID of Ligand,DrugBank ID of Ligand,IUPHAR_GRAC ID of Ligand,KEGG ID of Ligand,ZINC ID of Ligand
0,AARDSGTGGGSEKMRGSGPRGAGRRRPPSGGGDTPITPASLAGCYS...,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1,6.15,EPHA5,176870,,,,,,
1,AARDSGTGGGSEKMRGSGPRGAGRRRPPSGGGDTPITPASLAGCYS...,C=CC(=O)Nc1cc2c(Nc3ccc(F)c(Cl)c3)ncnc2cc1OCCCN...,5.92,EPHA5,156414,,,,,,
2,AARDSGTGGGSEKMRGSGPRGAGRRRPPSGGGDTPITPASLAGCYS...,CC(C)(C)c1cc(NC(=O)Nc2ccc(-c3cn4c(n3)sc3cc(OCC...,5.0,EPHA5,24889392,,,,,,
3,AARDSGTGGGSEKMRGSGPRGAGRRRPPSGGGDTPITPASLAGCYS...,CC(C)(C)c1cnc(CSc2cnc(NC(=O)C3CCNCC3)s2)o1,5.0,EPHA5,3025986,,,,,,
4,AARDSGTGGGSEKMRGSGPRGAGRRRPPSGGGDTPITPASLAGCYS...,CC(C)N1NC(=C2C=c3cc(O)ccc3=N2)c2c(N)ncnc21,5.51,EPHA5,25243800,,,,,,


In [4]:
# 70/10/20 train/validation/test split; the fixed seed keeps it reproducible across runs.
split_fractions = [0.7, 0.1, 0.2]
train_data, val_data, test_data = split_data(interactions_data, frac = split_fractions, seed = 0)

#### 5.2 Model load

In [5]:
USE_CUDA = True
GPU_NUM = 2  # index of the GPU to run on

# Fall back to the CPU when CUDA is requested but not available. This keeps
# `device` defined for the cells below in every case, and avoids handing a
# CPU device to `torch.cuda.set_device`, which raises.
USE_CUDA = USE_CUDA and torch.cuda.is_available()
device = torch.device(f'cuda:{GPU_NUM}' if USE_CUDA else 'cpu')

if USE_CUDA:
    torch.cuda.set_device(device)
    print ('Current cuda device ', torch.cuda.current_device())
else:
    print ('Current device ', device)

Current cuda device  2


In [6]:
# Checkpoint file evaluated in this section (the merged-data model, going by its file name).
model_path = "./model/train_merged.pth"

In [7]:
# Build the regressor from the checkpoint at `model_path`. `load_checkpoint_eval`
# comes from one of the `module.*` star imports.
# NOTE(review): presumably it returns the model in eval mode on `device` — confirm in the helper.
regressor = load_checkpoint_eval(model_path, USE_CUDA, device)

In [8]:
# Vocabularies for protein sequences and SMILES strings, later handed to the collate function.
# NOTE: pickle.load can execute arbitrary code; only open vocabulary files from a trusted source.
with open("./model/Sequence_voca.pkl", "rb") as sequence_voca_file:
    Protein_voca = pickle.load(sequence_voca_file)

with open("./model/SMILES_voca.pkl", "rb") as smiles_voca_file:
    SMILES_voca = pickle.load(smiles_voca_file)

#### 5.3 Run validation

In [9]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
val_inputs = val_data.iloc[:, [0, 1]].values
val_labels = val_data.iloc[:, 2].values

val_dataset = test_Dataset(val_inputs)
val_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with val_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
val_loader = DataLoader(dataset = val_dataset, batch_size = 400, collate_fn = val_collate)

In [10]:
# Run the regressor over the validation loader through the project's `Test` helper.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, val_loader)
y_pred = Test_module.predict()

In [11]:
# Score the validation predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2.
RMSE, PCC, CI, r2 = get_regression_result(val_labels, y_pred)

In [12]:
# One-line validation summary, metrics rounded to three decimals.
val_report = f"\t[Val] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(val_report)

	[Val] total samples: 33119, RMSE: 0.799, PCC: 0.848, CI: 0.831, r2: 0.716


#### 5.4 Run test

In [13]:
# Model inputs are the first two columns (Sequence, SMILES); the target is the third (pKa).
test_inputs = test_data.iloc[:, [0, 1]].values
test_labels = test_data.iloc[:, 2].values

test_dataset = test_Dataset(test_inputs)
test_collate = Mycall(Protein_voca, SMILES_voca, USE_CUDA)
# NOTE(review): no shuffling is requested, so predictions should line up with test_labels —
# assumes DataLoader is torch.utils.data.DataLoader (shuffle defaults to False); confirm.
test_loader = DataLoader(dataset = test_dataset, batch_size = 400, collate_fn = test_collate)

In [14]:
# Run the regressor over the test loader through the project's `Test` helper.
# This rebinds `Test_module` and `y_pred` from the validation run.
# NOTE(review): assumes predict() returns one value per row, in loader order — confirm.
Test_module = Test(regressor, test_loader)
y_pred = Test_module.predict()

In [15]:
# Score the test predictions against the labels; the helper's four return
# values are unpacked as RMSE, PCC, CI and r2 (overwriting the validation metrics).
RMSE, PCC, CI, r2 = get_regression_result(test_labels, y_pred)

In [16]:
# One-line test summary, metrics rounded to three decimals.
test_report = f"\t[Test] total samples: {len(y_pred)}, RMSE: {RMSE:.3f}, PCC: {PCC:.3f}, CI: {CI:.3f}, r2: {r2:.3f}"
print(test_report)

	[Test] total samples: 66237, RMSE: 0.805, PCC: 0.844, CI: 0.831, r2: 0.710
