In [None]:
import pandas as pd
from json import load, dump
from dataset import process_BindingDB, get_cliffs, split_data

In [None]:
# Peek at the raw BindingDB export. Only the first rows are parsed: `df` is used
# solely for this preview, and process_BindingDB is handed the file path itself,
# so loading the whole TSV here would only keep a second full copy in memory.
df = pd.read_csv('/mnt/data/BindingDB_All.tsv', sep='\t', on_bad_lines='skip', nrows=5)
df.head()

In [None]:
# Pull the drug / SMILES / target / label columns out of the raw export,
# using Ki as a continuous (non-binary) label.
# NOTE(review): convert_to_log=True presumably log-transforms Ki -- confirm in dataset.process_BindingDB.
X_drug, X_SMILES, X_target, y = process_BindingDB(
    '/mnt/data/BindingDB_All.tsv',
    y='Ki',
    binary=False,
    convert_to_log=True,
)

In [None]:
# Assemble the four parallel outputs into one table, one row per measurement.
bindingdb_d_t_ki = pd.DataFrame(
    {
        'drug': X_drug,
        'SMILES': X_SMILES,
        'target': X_target,
        'Ki': y,
    }
)

In [None]:
# Save the table as CSV (index omitted); later cells reload it from this path.
bindingdb_d_t_ki.to_csv('../analysis/bindingdb_d_t_ki.csv', index=False)

In [None]:
# Reload the saved drug/target/Ki table from CSV.
data_ki= pd.read_csv('../analysis/bindingdb_d_t_ki.csv')

In [None]:
# Preview the reloaded table.
data_ki.head()

In [None]:
# Overwrite the header positionally; the fourth column is named 'affinity' from here on.
data_ki.columns = ['drug', 'SMILES', 'target', 'affinity']

In [None]:
# Check the header after the column rename.
data_ki.head()

In [None]:
# Compute cliff pairs from the affinity table. The two thresholds (1 and 0.9) are the
# same values encoded as "ta1_ts0.9" in the output file names used later.
# NOTE(review): exact threshold semantics are defined in dataset.get_cliffs -- confirm there.
cliff_pairs = get_cliffs(data_ki, threshold_affinity=1, threshold_similarity=0.9)

In [None]:
# Display the frame returned by get_cliffs.
cliff_pairs

In [None]:
# Drop rows that are exact duplicates across all columns.
cliff_pairs_no_dup = cliff_pairs.drop_duplicates()

In [None]:
# (rows, columns) before de-duplication.
cliff_pairs.shape

In [None]:
# (rows, columns) after de-duplication.
cliff_pairs_no_dup.shape

In [None]:
# Persist the de-duplicated cliff pairs. The DDC split section reloads exactly this
# path, so the write must be active for the notebook to run top-to-bottom on a clean
# checkout. The file name spelling ("bindindb") is kept as-is to match that reader.
cliff_pairs_no_dup.to_csv('../analysis/bindindb_ki_cliff_pairs_ta1_ts0.9.csv', index=False)

# Split data for DDC task

In [None]:
# Start the DDC section from the cliff-pair CSV; this rebinds cliff_pairs_no_dup
# to the contents of that file.
cliff_pairs_no_dup =  pd.read_csv('../analysis/bindindb_ki_cliff_pairs_ta1_ts0.9.csv')

In [None]:
# Give every distinct target an integer ID, in order of first appearance.
# Series.unique() preserves that order, so the mapping is identical on every run;
# iterating a set of strings is not, because string hashing is randomised per process.
# .tolist() yields plain Python values, which json can serialise as keys.
target_ids = {t: i for i, t in enumerate(cliff_pairs_no_dup['target'].unique().tolist())}

# Write the mapping through a context manager so the file is flushed and closed.
with open('../analysis/target_mapping_bdb.json', 'w') as f:
    dump(target_ids, f)

In [None]:
# Read the saved target -> ID mapping; the context manager closes the file handle,
# which a bare load(open(...)) leaves open.
with open('../analysis/target_mapping_bdb.json') as f:
    target_ids = load(f)

# Replace each target with its integer ID. __getitem__ (rather than .get) makes an
# unmapped target raise KeyError instead of silently becoming None.
cliff_pairs_no_dup.loc[:, 'target'] = cliff_pairs_no_dup['target'].apply(target_ids.__getitem__)

In [None]:
# Random split of the ID-encoded cliff pairs, saved to CSV without the index.
cliff_pairs_split_random = split_data(cliff_pairs_no_dup, split='random')
cliff_pairs_split_random.to_csv(
    '../analysis/bindingdb_ki_cliff_pairs_ta1_ts0.9_r_wt.csv',
    index=False,
)

In [None]:
# Row count per value of the 'cliff' column in the random split.
cliff_pairs_split_random['cliff'].value_counts()

In [None]:
# Compound-based split of the same cliff pairs, saved to CSV without the index.
cliff_pairs_split_cb = split_data(cliff_pairs_no_dup, split='compound-based')
cliff_pairs_split_cb.to_csv(
    '../analysis/bindingdb_ki_cliff_pairs_ta1_ts0.9_cb_wt.csv',
    index=False,
)

In [None]:
# Row count per value of the 'cliff' column in the compound-based split.
cliff_pairs_split_cb['cliff'].value_counts()

In [None]:
# Preview the random split.
cliff_pairs_split_random.head()

In [None]:
# Preview the compound-based split.
cliff_pairs_split_cb.head()

In [None]:
# Reload the random split from the CSV it was saved to.
cliff_pairs_split_random = pd.read_csv('../analysis/bindingdb_ki_cliff_pairs_ta1_ts0.9_r_wt.csv')

In [None]:
# Number of distinct target IDs present in the random split.
cliff_pairs_split_random['target'].nunique()

In [None]:
# Display the reloaded random split.
cliff_pairs_split_random

# Split data for DTI task

In [None]:
# Reload the drug/target/Ki table from CSV for the DTI task; this rebinds data_ki,
# so the columns are back to the names stored in the file.
data_ki= pd.read_csv('../analysis/bindingdb_d_t_ki.csv')

In [None]:
# Preview the freshly reloaded table.
data_ki.head()

In [None]:
# Start from the target -> ID mapping stored in the JSON file so targets already
# listed there keep the same IDs.
with open('../analysis/target_mapping_bdb.json', 'r') as f:
    target_ids = load(f)

# Highest ID already in use; new targets are numbered upwards from here.
# default=-1 keeps this working for an empty mapping (the first new ID is then 0)
# instead of max() raising ValueError on an empty sequence.
current_max_id = max(target_ids.values(), default=-1)

# Function to get or create target ID
def get_or_create_target_id(target):
    """Return the integer ID for ``target``, allocating a new one if needed.

    Reads and mutates two module-level names: the ``target_ids`` dict and the
    ``current_max_id`` counter. A target already present in ``target_ids`` gets
    its stored ID back. Otherwise the counter is incremented, the new ID is
    recorded in ``target_ids`` under ``target``, and that ID is returned.
    """
    global current_max_id
    try:
        # Known target: hand back the stored ID and leave the counter alone.
        return target_ids[target]
    except KeyError:
        pass
    # Unknown target: take the next free ID and remember it for later lookups.
    current_max_id += 1
    target_ids[target] = current_max_id
    return current_max_id

In [None]:
# Replace each target with its integer ID; targets missing from the loaded mapping
# get new IDs (and are added to the in-memory target_ids) by get_or_create_target_id.
# NOTE(review): this overwrites the column in place, so running the cell a second time
# would feed integer IDs (not the original target values) into the lookup.
# NOTE(review): the extended target_ids is not written back to the JSON file in this
# notebook -- confirm whether it should be.
data_ki['target'] = data_ki['target'].apply(get_or_create_target_id)


In [None]:
# Rename the SMILES header to lower case and the Ki label to 'affinity';
# all other columns keep their names.
data_ki = data_ki.rename(
    columns={
        'SMILES': 'smiles',
        'Ki': 'affinity',
    }
)

In [None]:
# Check the table after ID encoding and the column rename.
data_ki.head()

In [None]:
# Number of distinct target IDs in the full table.
data_ki['target'].nunique()

In [None]:
# Split the full affinity table. 'random' is passed positionally here, whereas the
# cliff-pair cells pass it as split='random'.
# NOTE(review): presumably the same parameter -- confirm against split_data's signature.
data_ki_aff_split = split_data(data_ki, 'random')

In [None]:
# Display the frame returned by split_data.
data_ki_aff_split

In [None]:
# Save the split table as CSV (index omitted).
data_ki_aff_split.to_csv('../analysis/bindingdb_ki_d_t_aff_smiles_split-2.csv', index=False)

In [None]:
# Row count per value of the 'split' column.
data_ki_aff_split.split.value_counts()