In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from json import load
from dataset import get_cliffs, random_split_data, compound_based_split

In [None]:
# Load target sequences and drug SMILES from their respective JSON files.
# Context managers close each file handle as soon as it has been parsed;
# the bare `load(open(...))` form left both handles open.
with open('../data/KIBA/target_seq.txt') as targets_file:
    targets = load(targets_file)
with open('../data/KIBA/SMILES.txt') as drugs_file:
    drugs = load(drugs_file)

# Load the drug-target affinity matrix (tab-separated, no header row) into a DataFrame.
# The separator is given as a regex, which is why the python parsing engine is requested.
affinity = pd.read_csv('../data/KIBA/affinity.txt', sep='\\t', header=None, engine='python')

In [None]:
# Label the matrix axes: rows take the drug ids, columns take the target ids.
# list(...) makes explicit that the labels are the keys of the loaded JSON
# objects, which is what assigning the objects directly resolves to.
affinity.index, affinity.columns = list(drugs), list(targets)

In [None]:
# Reshape the drug x target matrix into long format: one row per
# (drug, target, affinity) triple.
unpivoted = (
    affinity
    .stack()
    .rename_axis(['drug', 'target'])
    .reset_index(name='affinity')
)

# Attach the SMILES string for each drug id (None when the id is not in `drugs`).
unpivoted['SMILES'] = unpivoted['drug'].apply(drugs.get)

In [None]:
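# Optional cache of the long-format table. Later cells read
# '../analysis/kiba_d_t_affinity.csv' from disk, so that file must already exist.
# Uncomment the first line to (re)generate it, or the second to reload a saved copy.
#unpivoted.to_csv('../analysis/kiba_d_t_affinity.csv', index=False)
#unpivoted = pd.read_csv('../analysis/kiba_d_t_affinity.csv')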

In [None]:
# Histogram (50 bins, with a KDE overlay) of the KIBA affinity values
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(unpivoted['affinity'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of KIBA Values')
ax.set_xlabel('KIBA Value')
ax.set_ylabel('Frequency')
plt.show()

# Random split

## DDC

In [None]:
# Build the cliff-pair table from the full long-format data.
# NOTE(review): presumably pairs of compounds with similarity >= 0.9 whose
# affinities differ by >= 1 -- confirm against dataset.get_cliffs.
cliff_pairs = get_cliffs(
    unpivoted,
    threshold_affinity=1,
    threshold_similarity=0.9,
    task='classification',
)

In [None]:
# Persist the cliff pairs without the index column.
# The file name presumably encodes the thresholds (ta1 = threshold_affinity 1,
# ts0.9 = threshold_similarity 0.9).
cliff_pairs.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9.csv', index=False)

In [None]:
# drop_duplicates() returns a new DataFrame. Without the assignment the
# de-duplicated rows were only displayed and later cells kept using the
# original frame. Rebind the name, then show the result as the cell output.
# NOTE(review): the CSV written in the previous cell was saved before this
# step, so it still contains any duplicate rows.
cliff_pairs = cliff_pairs.drop_duplicates()
cliff_pairs

In [None]:
# Target ids come from a mapping persisted on disk, so every section of the
# notebook that loads this file uses the same integer id per target.
# The two disabled lines show how the mapping was first created and saved.
#target_ids = {t:i for i, t in enumerate(set(cliff_pairs.target))}
#json.dump(target_ids, open('../analysis/target_mapping.json', 'w'))

# Load the target_ids dictionary from the JSON file; the context manager
# closes the file handle, which the bare `load(open(...))` form left open.
with open('../analysis/target_mapping.json') as mapping_file:
    target_ids = load(mapping_file)

# Map each target to its corresponding ID. Indexing (rather than dict.get)
# raises KeyError for a target that is missing from the mapping.
# NOTE: the column is overwritten in place, so re-running this cell on
# already-mapped data raises KeyError (the ids are not keys of the mapping).
cliff_pairs['target'] = cliff_pairs['target'].apply(target_ids.__getitem__)

In [None]:
# Split data randomly
# NOTE(review): no seed is passed at this call site -- confirm that
# random_split_data is deterministic if the split must be reproducible.
cliff_pairs_split_random = random_split_data(cliff_pairs)

# Saving is disabled; the following cell reads this same path from disk.
#cliff_pairs_split_random.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_r_wt.csv', index=False)

In [None]:
# Reload the saved random split. This rebinds cliff_pairs_split_random to
# whatever is stored on disk, replacing the frame computed in the previous cell.
cliff_pairs_split_random = pd.read_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_r_wt.csv')

## DTI

In [None]:
# Load the long-format drug-target affinity table from its cached CSV.
# The file must already exist on disk; the line that writes it earlier in the
# notebook is commented out.
aff = pd.read_csv('../analysis/kiba_d_t_affinity.csv')

In [None]:
# Preview the first rows of the loaded table.
aff.head()

In [None]:
# Check the column names; the cells below rely on a 'target' column.
aff.columns

In [None]:
# Assign or create IDs for targets
with open('../analysis/target_mapping.json', 'r') as f:
    target_ids = load(f)

# Highest id currently in use; new ids continue from here so they stay unique.
# default=-1 keeps an empty mapping from raising ValueError and makes the
# first id handed out 0.
current_max_id = max(target_ids.values(), default=-1)


def get_or_create_target_id(target):
    """Return the integer id for ``target``, allocating a new one if needed.

    Known targets are looked up in the module-level ``target_ids`` mapping.
    An unknown target receives ``current_max_id + 1``; both ``target_ids``
    and ``current_max_id`` are updated, so later calls with the same target
    return the same id.

    Newly created ids exist only in memory: this cell does not write the
    mapping back to the JSON file.

    Args:
        target: Key to look up in ``target_ids``.

    Returns:
        The existing or newly allocated id.
    """
    global current_max_id

    # Guard clause: the target already has an id.
    if target in target_ids:
        return target_ids[target]

    # Unknown target: take the next free id and remember it.
    current_max_id += 1
    target_ids[target] = current_max_id
    return current_max_id

In [None]:
# Replace every target with its integer id, allocating ids for unseen targets.
# NOTE: the column is overwritten in place. Re-running this cell feeds the ids
# back into get_or_create_target_id, which treats them as unknown targets and
# allocates fresh ids for them.
aff['target'] = aff['target'].apply(get_or_create_target_id)

In [None]:
# random split
aff_split = random_split_data(aff)

In [None]:
# Persist the randomly split affinity table without the index column.
aff_split.to_csv('../analysis/kiba_d_t_aff_smiles_split.csv', index=False)

In [None]:
# Read the split back from the file written in the previous cell.
aff_split = pd.read_csv('../analysis/kiba_d_t_aff_smiles_split.csv')

# Compound-based split

In [None]:
# Reload the long-format affinity table from its cached CSV. This rebinds
# `unpivoted`, so the section does not depend on the frame built earlier in
# the notebook.
unpivoted = pd.read_csv('../analysis/kiba_d_t_affinity.csv')

In [None]:
# Preview the table before the targets are mapped to ids.
unpivoted.head()

In [None]:
# Load the persisted target -> id mapping; the context manager closes the file
# handle, which the bare `load(open(...))` form left open.
with open('../analysis/target_mapping.json') as mapping_file:
    target_ids = load(mapping_file)

# Replace each target with its integer id. Indexing raises KeyError for a
# target that is missing from the mapping.
# NOTE: the column is overwritten in place, so re-running this cell on
# already-mapped data raises KeyError (the ids are not keys of the mapping).
unpivoted['target'] = unpivoted['target'].apply(target_ids.__getitem__)

## DTI

In [27]:
# Preview the table after mapping; 'target' now holds integer ids.
unpivoted.head()

Unnamed: 0,drug,target,affinity,SMILES
0,CHEMBL1087421,86,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
1,CHEMBL1087421,174,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
2,CHEMBL1087421,95,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
3,CHEMBL1087421,183,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl
4,CHEMBL1087421,202,11.1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl


In [None]:
# Compound-based split of the long-format table. The result carries a 'split'
# column, which later cells filter on with the values 0, 1 and 2.
# NOTE(review): presumably all rows of a given drug land in the same split --
# confirm against dataset.compound_based_split.
drugs_split = compound_based_split(unpivoted)

In [None]:
# Remove exact duplicate rows; the "_nd" (no-duplicates) frame is what gets
# saved and used from here on.
drugs_split_nd = drugs_split.drop_duplicates()

In [None]:
# Persist the de-duplicated compound-based split without the index column.
drugs_split_nd.to_csv('../analysis/kiba_dti_cb_split.csv', index=False)

## DDC

In [None]:
# Partition the de-duplicated compound-based split by its 'split' label:
# 0 -> train, 1 -> validation, 2 -> test.
train, validation, test = (
    drugs_split_nd.loc[drugs_split_nd['split'] == label]
    for label in (0, 1, 2)
)

In [None]:
# Build cliff pairs separately inside each partition, using the same
# thresholds for train, validation and test.
cliff_pairs_tr, cliff_pairs_val, cliff_pairs_test = (
    get_cliffs(
        partition,
        threshold_affinity=1,
        threshold_similarity=0.9,
        task='classification',
    )
    for partition in (train, validation, test)
)

In [None]:
# Tag each per-partition cliff table with its split label
# (0 = train, 1 = validation, 2 = test), then stack them row-wise.
per_split_pairs = [cliff_pairs_tr, cliff_pairs_val, cliff_pairs_test]
for split_label, pairs in enumerate(per_split_pairs):
    pairs['split'] = split_label

cliff_pairs_all = pd.concat(per_split_pairs, axis=0)