In [1]:
import pandas as pd
from json import load, dump
from dataset import process_BindingDB, get_cliffs, random_split_data, compound_based_split

import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing

In [None]:
# Preview the raw BindingDB dump. Uses the same relative location that the
# process_BindingDB call in this notebook reads from, so the cell does not
# depend on a machine-specific absolute path.
df = pd.read_csv('../data/BindingDB_All.tsv', sep='\t', on_bad_lines='skip')
df.head()

In [None]:
# Parse the raw dump with the project helper, asking for continuous
# (non-binary), log-converted Ki labels. The four returned sequences become
# the drug / SMILES / target / Ki columns of the table built next.
X_drug, X_SMILES, X_target, y = process_BindingDB(
    '../data/BindingDB_All.tsv',
    y='Ki',
    binary=False,
    convert_to_log=True,
)

In [None]:
# Assemble the four outputs of process_BindingDB into one table, one column each.
bindingdb_d_t_ki = pd.DataFrame({'drug': X_drug, 'SMILES': X_SMILES, 'target': X_target, 'Ki': y})

In [None]:
# Persist the table; index=False keeps the row index out of the CSV.
bindingdb_d_t_ki.to_csv('../analysis/bindingdb_d_t_ki.csv', index=False)

In [None]:
# Reload the table just written; later cells operate on this `data_ki` frame.
data_ki= pd.read_csv('../analysis/bindingdb_d_t_ki.csv')

In [None]:
# Rename by label rather than overwriting the whole header positionally, so a
# change in column order or count cannot silently mislabel columns. This
# matches how the other sections of the notebook expose 'Ki' as 'affinity'.
data_ki = data_ki.rename(columns={'Ki': 'affinity'})

In [None]:
# Histogram (with KDE overlay) of the affinity labels.
# NOTE(review): the stored values (roughly 5-10 in the previews below) look
# like pKi, i.e. -log10 of Ki in molar, rather than log(Ki); the x label
# reflects that. Confirm against the convert_to_log branch of process_BindingDB.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_ki['affinity'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of affinity values')
ax.set_xlabel('pKi (-log10 Ki [M])')
ax.set_ylabel('Frequency')
plt.show()

# Random split

## DDC

In [None]:
# Build the cliff-pair table with the project helper, using an affinity
# threshold of 1 and a similarity threshold of 0.9.
# NOTE(review): the exact meaning of both thresholds is defined in
# dataset.get_cliffs — confirm there.
cliff_pairs = get_cliffs(data_ki, threshold_affinity=1, threshold_similarity=0.9)

In [None]:
# Inspect the resulting pairs.
cliff_pairs

In [None]:
# Drop rows that are exact duplicates across all columns.
cliff_pairs_no_dup = cliff_pairs.drop_duplicates()

In [None]:
# Shape before de-duplication ...
cliff_pairs.shape

In [None]:
# ... and after, to see how many duplicate rows were removed.
cliff_pairs_no_dup.shape

In [None]:
#cliff_pairs_no_dup.to_csv('../analysis/bindindb_ki_cliff_pairs_ta1_ts0.9.csv', index=False)

In [None]:
# Load the cached cliff pairs. The export of this file is commented out in
# the notebook, so on a fresh checkout it may not exist yet: in that case
# write the de-duplicated pairs computed earlier and read them back, so both
# paths go through the same CSV round-trip. The 'bindindb' spelling is kept
# deliberately so an already existing cache file is still found.
cliff_pairs_path = '../analysis/bindindb_ki_cliff_pairs_ta1_ts0.9.csv'
try:
    cliff_pairs_no_dup = pd.read_csv(cliff_pairs_path)
except FileNotFoundError:
    cliff_pairs_no_dup.to_csv(cliff_pairs_path, index=False)
    cliff_pairs_no_dup = pd.read_csv(cliff_pairs_path)

In [None]:
# Shape of the pairs as loaded from disk.
cliff_pairs_no_dup.shape

In [None]:
# Give every distinct target an integer ID and persist the mapping.
# Sorting makes the numbering reproducible between kernel sessions (iteration
# order of a set of strings is not stable across interpreter runs), and the
# context manager guarantees the JSON file is flushed and closed before any
# later cell reads it back.
target_ids = {t: i for i, t in enumerate(sorted(set(cliff_pairs_no_dup.target)))}
with open('../analysis/target_mapping_bdb.json', 'w') as f:
    dump(target_ids, f)

In [None]:
# Reload the persisted mapping (closing the file promptly) and encode targets.
# __getitem__ is kept on purpose: an unmapped target raises KeyError instead
# of silently turning into NaN. Plain column assignment replaces the column,
# so it takes the integer dtype of the IDs instead of having the integers
# written into the existing string column in place (which is what
# `.loc[:, col] = ...` attempts in recent pandas).
with open('../analysis/target_mapping_bdb.json') as f:
    target_ids = load(f)
cliff_pairs_no_dup['target'] = cliff_pairs_no_dup['target'].apply(target_ids.__getitem__)

In [None]:
# Split the encoded cliff pairs with the project helper and save the result.
# NOTE(review): confirm random_split_data is seeded; otherwise the saved
# split differs on every run.
cliff_pairs_split_random = random_split_data(cliff_pairs_no_dup)
cliff_pairs_split_random.to_csv('../analysis/bindingdb_ki_cliff_pairs_ta1_ts0.9_r_wt.csv', index=False)

In [None]:
# Counts per value of the 'cliff' column.
cliff_pairs_split_random['cliff'].value_counts()

## DTI 

In [2]:
# Start the DTI section from the saved drug/target/Ki table again, discarding
# any in-memory changes made to `data_ki` in earlier sections.
data_ki= pd.read_csv('../analysis/bindingdb_d_t_ki.csv')

In [3]:
# Preview the first rows.
data_ki.head()

Unnamed: 0,drug,SMILES,target,Ki
0,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,Dimer of Gag-Pol polyprotein [501-599],9.60206
1,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,Dimer of Gag-Pol polyprotein [501-599],9.387216
2,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,Dimer of Gag-Pol polyprotein [501-599],9.09691
3,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,Dimer of Gag-Pol polyprotein [501-599],9.004365
4,"(4R,5S,6S,7R)-4,7-dibenzyl-1-butyl-3-(cyclopro...",CCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H](...,Dimer of Gag-Pol polyprotein [501-599],8.958607


In [None]:
# Expose the 'Ki' label column under the name 'affinity'.
data_ki = data_ki.rename(columns={'Ki': 'affinity'})

In [None]:
# Quick look at the affinity distribution, with a title and axis labels so
# the figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(data_ki['affinity'])
ax.set_title('Distribution of affinity values')
ax.set_xlabel('affinity')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Check that the label column now appears as 'affinity'.
data_ki.head()

In [None]:
# Load the target -> integer ID mapping saved earlier in the notebook.
with open('../analysis/target_mapping_bdb.json', 'r') as f:
    target_ids = load(f)

# Highest ID already in use; new IDs are allocated above it so they stay
# unique. default=-1 keeps this working for an empty mapping (the first new
# ID is then 0) instead of raising ValueError.
current_max_id = max(target_ids.values(), default=-1)


def get_or_create_target_id(target):
    """Return the integer ID for ``target``, allocating one if it is unseen.

    Looks ``target`` up in the module-level ``target_ids`` mapping. If it is
    missing, the module-level counter ``current_max_id`` is incremented, the
    new ID is stored in ``target_ids`` and then returned.

    Args:
        target: Key to look up; must be hashable.

    Returns:
        The existing or newly allocated integer ID.

    Side effects:
        May mutate the globals ``target_ids`` and ``current_max_id``.
    """
    global current_max_id
    if target in target_ids:
        return target_ids[target]
    current_max_id += 1
    target_ids[target] = current_max_id
    return current_max_id

In [None]:
# Encode targets as integer IDs, minting new IDs for targets missing from the
# loaded mapping. Guarded so that re-running the cell is a no-op: applied to
# an already-encoded integer column, the function would treat every integer
# as an unseen target and allocate a fresh ID for it.
# NOTE(review): the extended `target_ids` only lives in memory here; persist
# it if the IDs in the saved split must be decodable later.
if not pd.api.types.is_integer_dtype(data_ki['target']):
    data_ki['target'] = data_ki['target'].apply(get_or_create_target_id)

In [None]:
# Number of distinct target IDs after encoding.
data_ki['target'].nunique()

In [None]:
# Split the encoded DTI table with the project helper.
# NOTE(review): confirm random_split_data is seeded so the split is reproducible.
data_ki_aff_split = random_split_data(data_ki)

In [None]:
# Inspect the split table.
data_ki_aff_split

In [18]:
# Save the split without the row index.
data_ki_aff_split.to_csv('../analysis/bdb_dti_r_split.csv', index=False)

In [15]:
# Reload the random DTI split saved by this notebook. The path previously had
# a doubled '.csv.csv' suffix and therefore pointed at a file the notebook
# never writes.
data_ki_aff_split = pd.read_csv('../analysis/bdb_dti_r_split.csv')

In [12]:
# Normalise a lowercase 'smiles' header to 'SMILES'. rename() ignores labels
# that are absent, so this is a no-op when the column is already 'SMILES'.
# NOTE(review): the preview below already shows 'SMILES'; this cell looks like
# a leftover from an earlier file layout — confirm it is still needed.
data_ki_aff_split = data_ki_aff_split.rename(columns={'smiles': 'SMILES'})

In [16]:
# Preview the reloaded split.
data_ki_aff_split.head()

Unnamed: 0,drug,SMILES,target,affinity,split
0,(risperidone)3-{2-[4-(6-Fluoro-benzo[d]isoxazo...,Cc1nc2CCCCn2c(=O)c1CCN1CCC(CC1)c1noc2cc(F)ccc12,620,9.522879,0
1,4-Amino-3-fluorobenzenesulfonamide::4-amino-3-...,Nc1ccc(cc1F)S(N)(=O)=O,182,7.221849,0
2,5-(4-chloro-3-methylphenyl)-1-[(4-methylphenyl...,Cc1ccc(Cn2nc(cc2-c2ccc(Cl)c(C)c2)C(=O)NC2[C@@]...,821,5.0,0
3,"2-{[(2-chlorophenyl)methyl](3,5-dichloro-2-hyd...",CC(N(Cc1ccccc1Cl)S(=O)(=O)c1cc(Cl)cc(Cl)c1O)C(...,89,5.619789,0
4,AcNH-4-NO2-Phe-c[D-Cys-Tyr-D-Trp-Lys-Thr-Cys]-...,CC(C)[C@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@@H](Cc2c...,332,7.0,0


# Compound-based split

In [None]:
# Start the compound-based section from the saved drug/target/Ki table again.
data_ki = pd.read_csv('../analysis/bindingdb_d_t_ki.csv')

In [None]:
# Expose the 'Ki' label column under the name 'affinity'.
data_ki = data_ki.rename(columns={'Ki': 'affinity'})

In [None]:
# Drop rows that are exact duplicates across all columns.
data_ki = data_ki.drop_duplicates()

In [None]:
# Give every distinct target in the full table an integer ID and persist it.
# Sorting makes the numbering reproducible between kernel sessions, and the
# context manager guarantees the JSON file is flushed and closed before it is
# read back.
# NOTE(review): this rewrites the same mapping file that the random-split
# section of this notebook creates and reads, so IDs stored in files saved by
# that section no longer match the file on disk afterwards — consider a
# separate file name.
target_ids = {t: i for i, t in enumerate(sorted(set(data_ki.target)))}
with open('../analysis/target_mapping_bdb.json', 'w') as f:
    dump(target_ids, f)

In [None]:
# Reload the persisted mapping (closing the file promptly) and encode targets.
# __getitem__ is kept on purpose: an unmapped target raises KeyError instead
# of silently turning into NaN. Plain column assignment replaces the column,
# so it takes the integer dtype of the IDs instead of having the integers
# written into the existing string column in place.
with open('../analysis/target_mapping_bdb.json') as f:
    target_ids = load(f)
data_ki['target'] = data_ki['target'].apply(target_ids.__getitem__)

## DTI

In [None]:
# Split the encoded table with the project's compound-based splitter.
# The cells below rely on the result having a 'split' column with codes 0/1/2.
bdb_drug_split = compound_based_split(data_ki)

In [None]:
# Save the split without the row index.
bdb_drug_split.to_csv('../analysis/bdb_dti_cb_split.csv', index=False)

In [None]:
# Preview the first rows.
bdb_drug_split.head()

In [None]:
# Slice the table into its three partitions by split code:
# 0 -> train, 1 -> validation, 2 -> test.
split_codes = bdb_drug_split['split']
train = bdb_drug_split[split_codes == 0]
validation = bdb_drug_split[split_codes == 1]
test = bdb_drug_split[split_codes == 2]

In [None]:
# Sanity check for the compound-based split: report any drug that appears in
# more than one of the train / validation / test partitions.
train_drugs = set(train['drug'].unique())
validation_drugs = set(validation['drug'].unique())
test_drugs = set(test['drug'].unique())

# Pairwise intersections, kept as named variables so they can be inspected.
train_validation_overlap = train_drugs & validation_drugs
train_test_overlap = train_drugs & test_drugs
validation_test_overlap = validation_drugs & test_drugs

# One reporting loop instead of three copy-pasted if/else blocks; the printed
# text is the same as before.
for first, second, overlap in (
    ('train', 'validation', train_validation_overlap),
    ('train', 'test', train_test_overlap),
    ('validation', 'test', validation_test_overlap),
):
    if overlap:
        print(f"There are overlaps between {first} and {second} datasets.")
        print("Overlapping drugs:", overlap)
    else:
        print(f"No overlaps between {first} and {second} datasets.")


## DDC

In [None]:
# Build cliff pairs inside each partition separately, so a pair never spans
# two partitions. All three calls share one set of thresholds.
cliff_thresholds = {'threshold_affinity': 1, 'threshold_similarity': 0.9}
cliff_pairs_tr = get_cliffs(train, **cliff_thresholds)
cliff_pairs_val = get_cliffs(validation, **cliff_thresholds)
cliff_pairs_test = get_cliffs(test, **cliff_thresholds)

In [None]:
# Tag each per-partition table with its split code (0 = train,
# 1 = validation, 2 = test), then stack them row-wise into one frame.
for pairs_frame, split_code in (
    (cliff_pairs_tr, 0),
    (cliff_pairs_val, 1),
    (cliff_pairs_test, 2),
):
    pairs_frame['split'] = split_code

cliff_pairs_all = pd.concat([cliff_pairs_tr, cliff_pairs_val, cliff_pairs_test], axis=0)

In [None]:
# Save the combined compound-based cliff pairs without the row index.
cliff_pairs_all.to_csv('../analysis/bdb_ddc_cb_ta1_ts0.9.csv', index=False)