In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from json import load
from dataset import get_cliffs, random_split_data, compound_based_split

In [None]:
# Load the target-sequence and drug-SMILES tables, which are stored as JSON
# despite the .txt extension. Context managers close each handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open('../data/KIBA/target_seq.txt') as targets_file:
    targets = load(targets_file)
with open('../data/KIBA/SMILES.txt') as drugs_file:
    drugs = load(drugs_file)

# Load the headerless, tab-separated drug-target affinity data into a DataFrame
affinity = pd.read_csv('../data/KIBA/affinity.txt', sep='\\t', header=None, engine='python')

In [None]:
# Label the affinity matrix: rows by drug id, columns by target id.
# list() over each mapping yields its keys in insertion order.
affinity.index = list(drugs)
affinity.columns = list(targets)

In [None]:
# Reshape the drug x target matrix into one row per (drug, target) pair
unpivoted = (
    affinity.stack()
    .rename_axis(['drug', 'target'])
    .reset_index(name='affinity')
)
# Attach the SMILES entry for each drug id (None when the id is not in `drugs`)
unpivoted['SMILES'] = unpivoted['drug'].apply(drugs.get)

In [None]:
#unpivoted.to_csv('../analysis/kiba_d_t_affinity.csv', index=False)
#unpivoted = pd.read_csv('../analysis/kiba_d_t_affinity.csv')

In [2]:
unpivoted = pd.read_csv('../analysis/kiba_d_t_affinity.csv')
unpivoted.head()

Unnamed: 0,drug,target,affinity,SMILES
0,CHEMBL1087421,O00141,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
1,CHEMBL1087421,O14920,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
2,CHEMBL1087421,O15111,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
3,CHEMBL1087421,P00533,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
4,CHEMBL1087421,P04626,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl


In [None]:
# Histogram (with KDE overlay) of the KIBA affinity values
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(unpivoted['affinity'], bins=50, kde=True, ax=ax)
ax.set(title='Distribution of KIBA Values', xlabel='KIBA Value', ylabel='Frequency')
plt.show()

# Random split

## DDC

In [None]:
cliff_pairs = get_cliffs(unpivoted, threshold_affinity=1, threshold_similarity=0.9)

In [None]:
cliff_pairs.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9.csv', index=False)

In [None]:
# Displays a de-duplicated view only: the result is not assigned, so
# `cliff_pairs` itself is left unchanged by this cell.
# NOTE(review): if duplicates are meant to be removed before the split (and
# before the CSV export in the previous cell), the result must be assigned —
# confirm intent.
cliff_pairs.drop_duplicates()

In [None]:
# Creating IDs for targets
#target_ids = {t:i for i, t in enumerate(set(cliff_pairs.target))}

# Save the target_ids dictionary to a JSON file for later use
#json.dump(target_ids, open('../analysis/target_mapping_kiba.json', 'w'))

# Load the target_ids dictionary from the JSON file; the context manager
# closes the handle instead of leaking it for the life of the kernel
with open('../analysis/target_mapping_kiba.json') as mapping_file:
    target_ids = load(mapping_file)

# Map each target to its corresponding ID. __getitem__ (rather than .get)
# raises KeyError for a target missing from the mapping instead of silently
# producing None.
cliff_pairs['target'] = cliff_pairs['target'].apply(target_ids.__getitem__)

In [None]:
# Split data randomly
# NOTE(review): no seed is passed here — confirm random_split_data is seeded
# internally, otherwise the split differs on every run.
cliff_pairs_split_random = random_split_data(cliff_pairs)

#cliff_pairs_split_random.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_r_wt.csv', index=False)

In [None]:
#cliff_pairs_split_random = pd.read_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_r_wt.csv')

## DTI

In [4]:
aff = pd.read_csv('../analysis/kiba_d_t_affinity.csv')

In [None]:
aff.head()

In [None]:
aff.columns

In [5]:
# Assign or create IDs for targets: start from the persisted target -> ID
# mapping stored as JSON
with open('../analysis/target_mapping_kiba.json', 'r') as f:
    target_ids = load(f)

# Get the highest current ID to ensure new IDs are unique.
# max() raises ValueError if the mapping is empty.
current_max_id = max(target_ids.values())

def get_or_create_target_id(target):
    """Return the ID mapped to ``target``, registering a new one if needed.

    Reads and updates two module-level names: the ``target_ids`` mapping and
    the ``current_max_id`` counter. A target that is already a key of
    ``target_ids`` gets its stored ID back. Any other target bumps
    ``current_max_id`` by one, records that value under ``target`` in
    ``target_ids`` and returns it.
    """
    global current_max_id
    try:
        return target_ids[target]
    except KeyError:
        # Unseen target: fall through and allocate the next free ID.
        pass
    current_max_id += 1
    target_ids[target] = current_max_id
    return current_max_id

In [6]:
# Replace each target with its ID, creating IDs for targets not yet in
# `target_ids`.
# Not idempotent: `target_ids` is keyed by the strings loaded from JSON, so
# running this cell again on the already-mapped numeric column finds no
# matching keys and registers a fresh ID for every value. Reload `aff` before
# re-running.
aff['target'] = aff['target'].apply(get_or_create_target_id)

In [7]:
# random split
# NOTE(review): no seed is visible here — confirm random_split_data is
# deterministic if the exported split needs to be reproducible.
aff_split = random_split_data(aff)

In [None]:
aff_split.to_csv('../analysis/kiba_dti_r_split.csv', index=False)

In [2]:
aff_split = pd.read_csv('../analysis/kiba_dti_r_split.csv')

In [8]:
aff_split.head()

Unnamed: 0,drug,target,affinity,SMILES,split
69964,CHEMBL592030,5,11.5,C1=CC=C2C(=C1)C=C(S2)C3=C4C(=CC(=C3)C5=CN(C6=C...,0
90427,CHEMBL1965033,61,11.699999,CC1=CC(=O)NN=C1C2=CC=C(C=C2)NC(=O)NC3=C(C=CC(=...,0
16899,CHEMBL1970314,114,12.399998,C1=CC(=CC(=C1)N)C2=CC(=C3C(=C2)C=CC=N3)C(=O)N,0
41492,CHEMBL1991188,199,12.499997,CC(C1=CC(=CC=C1)OC)NC(=O)C2=C(C=C(C=C2)C3=C(C=...,0
80248,CHEMBL1970083,111,14.700326,C1=CC(=CC=C1C2=CSC3=C2C(=NC=C3C4=CC=NC=C4)N)NC...,0


# Compound-based split

In [None]:
unpivoted = pd.read_csv('../analysis/kiba_d_t_affinity.csv')

In [None]:
unpivoted.head()

In [None]:
# Creating IDs for targets
#target_ids = {t:i for i, t in enumerate(set(cliff_pairs.target))}

# Save the target_ids dictionary to a JSON file for later use
#json.dump(target_ids, open('../analysis/target_mapping_kiba.json', 'w'))

In [None]:
# Load the target_ids dictionary from the JSON file; the context manager
# closes the handle instead of leaking it for the life of the kernel
with open('../analysis/target_mapping_kiba.json') as mapping_file:
    target_ids = load(mapping_file)

# Map each target to its corresponding ID. __getitem__ (rather than .get)
# raises KeyError for a target missing from the mapping instead of silently
# producing None.
unpivoted['target'] = unpivoted['target'].apply(target_ids.__getitem__)

## DTI

In [None]:
drugs_split = compound_based_split(unpivoted)

In [None]:
drugs_split_nd = drugs_split.drop_duplicates()

In [None]:
drugs_split_nd.to_csv('../analysis/kiba_dti_cb_split.csv', index=False)

In [36]:
drugs_split_nd.head()

Unnamed: 0,drug,target,affinity,SMILES,split
0,CHEMBL1087421,86,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,0
1,CHEMBL1087421,174,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,0
2,CHEMBL1087421,95,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,0
3,CHEMBL1087421,183,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,0
4,CHEMBL1087421,202,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,0


In [None]:
# Partition the de-duplicated frame by its `split` label:
# 0 -> train, 1 -> validation, 2 -> test
train, validation, test = (
    drugs_split_nd[drugs_split_nd['split'] == split_label]
    for split_label in (0, 1, 2)
)

## DDC

In [None]:
# Run get_cliffs on each partition separately, with the same thresholds,
# in train -> validation -> test order
cliff_pairs_tr, cliff_pairs_val, cliff_pairs_test = (
    get_cliffs(partition, threshold_affinity=1, threshold_similarity=0.9)
    for partition in (train, validation, test)
)

In [None]:
# Tag each per-partition result with its split label (0 = train,
# 1 = validation, 2 = test), then stack them row-wise into one frame
cliff_pairs_tr['split'], cliff_pairs_val['split'], cliff_pairs_test['split'] = 0, 1, 2

cliff_pairs_all = pd.concat([cliff_pairs_tr, cliff_pairs_val, cliff_pairs_test])

In [26]:
cliff_pairs_all.to_csv('../analysis/kiba_ddc_cb_ta1_ts0.9.csv', index=False)

In [34]:
cliff_pairs_all['target'].nunique()

219

In [35]:
cliff_pairs_all

Unnamed: 0,drug1,drug2,smiles1,smiles2,cliff,target,split
0,CHEMBL1087421,CHEMBL1088633,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=CC=C3)Cl)Cl,0,86,0
1,CHEMBL206783,CHEMBL207037,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,0,86,0
2,CHEMBL206783,CHEMBL207246,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,0,86,0
3,CHEMBL206783,CHEMBL207584,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,0,86,0
4,CHEMBL206783,CHEMBL207674,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,COC1=C(C=C2C(=C1)N=CN=C2NC3=C(C(=CC=C3)Cl)F)CN...,0,86,0
...,...,...,...,...,...,...,...
4352,CHEMBL338449,CHEMBL444337,CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=...,CC12C(C(CC(O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=...,0,27,2
4353,CHEMBL338967,CHEMBL405405,C1C(C(C(C(O1)N2C3=C(C=CC(=C3)O)C4=C5C(=C6C7=C(...,C1C(C(C(OC1N2C3=C(C=CC(=C3)O)C4=C5C(=C6C7=C(C=...,0,27,2
4354,CHEMBL347195,CHEMBL347684,COC1C(OC(C(C1OC(=O)CBr)O)N2C3=CC=CC=C3C4=C5C(=...,COC1C(OC(C(C1O)OC(=O)CBr)N2C3=CC=CC=C3C4=C5C(=...,0,27,2
4355,CHEMBL53606,CHEMBL53826,CCCCCCCCCCCCCCCCOCC(COCCCCCC[N+](C)(C)C)OC.[Br-],CCCCCCCCCCCCCCCCOCC(COCCCC[N+](C)(C)C)OC.[Br-],0,27,2


In [33]:
cliff_pairs_all.isnull().values.any()

False