In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from json import load
from dataset import get_cliffs, split_data
from graphics import get_hist_prop

In [None]:
# Load the target-sequence and drug (SMILES) mappings from their JSON files.
# Context managers close the file handles instead of leaking them.
with open('../data/KIBA/target_seq.txt') as targets_file:
    targets = load(targets_file)
with open('../data/KIBA/SMILES.txt') as drugs_file:
    drugs = load(drugs_file)

# Load the drug-target affinity matrix (tab-separated, no header row).
# A real tab character keeps pandas on its default C parser; a two-character
# backslash + 't' separator is interpreted as a regular expression and forces
# the slower Python engine.
affinity = pd.read_csv('../data/KIBA/affinity.txt', sep='\t', header=None)

In [None]:
# Label the matrix axes: rows take the drug ids, columns take the target ids
affinity.index, affinity.columns = list(drugs), list(targets)

In [None]:
# Reshape the drug x target matrix to long format: one row per (drug, target) entry
unpivoted = (
    affinity
    .stack()
    .reset_index()
)
unpivoted.columns = ['drug', 'target', 'affinity']
# Look up each drug id in the `drugs` mapping and store the result as 'SMILES'
unpivoted['SMILES'] = unpivoted['drug'].apply(drugs.get)

In [None]:
# Display the long-format drug-target affinity table
unpivoted

In [None]:
# The export below is kept commented out; the table is read from the saved CSV instead.
# NOTE(review): if that CSV was written by the commented line (no index=False), the
# frame read back here will carry an extra 'Unnamed: 0' column holding the old index — confirm.
#unpivoted.to_csv('../analysis/kiba_d_t_affinity.csv')
unpivoted = pd.read_csv('../analysis/kiba_d_t_affinity.csv')

In [None]:
# Preview the first rows of the table read back from CSV
unpivoted.head()

In [None]:
# Histogram of the KIBA affinity values (50 bins) with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(unpivoted['affinity'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of KIBA Values')
ax.set_xlabel('KIBA Value')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Build the pair table with the project helper get_cliffs (imported from dataset),
# passing an affinity threshold of 1 and a similarity threshold of 0.9.
# NOTE(review): the exact meaning of both thresholds is defined in dataset.get_cliffs — confirm there.
cliff_pairs = get_cliffs(unpivoted, threshold_affinity=1, threshold_similarity=0.9)

In [None]:
# Display the pair table returned by get_cliffs
cliff_pairs

In [None]:
# Persist the pair table; index=False keeps the row index out of the CSV
cliff_pairs.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9.csv', index=False)

# Visualization

In [None]:
# Reload the pair table from the saved CSV (rebinds cliff_pairs)
cliff_pairs = pd.read_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9.csv')

In [None]:
# Count rows per (drug1, drug2) pair for each value of 'cliff';
# combinations that never occur are filled with 0
pair_keys = ['drug1', 'drug2']
cliff_pivot = cliff_pairs.pivot_table(
    index=pair_keys,
    columns='cliff',
    aggfunc='size',
    fill_value=0,
)
cliff_pivot

In [None]:
# Per drug pair: rows with cliff == 0, rows with cliff == 1, and their sum
count_of_0, count_of_1 = cliff_pivot[0], cliff_pivot[1]
cliff_pivot['total_count'] = count_of_0 + count_of_1

# Share of each pair's rows that have cliff == 1
cliff_pivot['proportion_of_1'] = count_of_1 / cliff_pivot['total_count']
cliff_pivot

In [None]:
# Flatten the (drug1, drug2) index back into ordinary columns
cliff_pivot_reset = cliff_pivot.reset_index()

# Histogram of the per-pair proportion of cliff == 1 rows
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(cliff_pivot_reset['proportion_of_1'].to_numpy(), color='blue', alpha=0.5, bins=50)
ax.set_xlabel('Proportion of 1s')
ax.set_ylabel('Pairs');

In [None]:
# Call the project helper get_hist_prop (imported from graphics) on the 'proportion_of_1' column.
# NOTE(review): presumably draws a histogram of that column — confirm in graphics.get_hist_prop.
get_hist_prop(cliff_pivot, 'proportion_of_1')

In [None]:
# (all pairs, pairs with proportion exactly 0, pairs with proportion exactly 1)
is_never_cliff = cliff_pivot_reset['proportion_of_1'] == 0
is_always_cliff = cliff_pivot_reset['proportion_of_1'] == 1
len(cliff_pivot_reset), int(is_never_cliff.sum()), int(is_always_cliff.sum())

In [None]:
# Pairs whose proportion is neither exactly 0 nor exactly 1
is_pure = (cliff_pivot_reset['proportion_of_1'] == 0) | (cliff_pivot_reset['proportion_of_1'] == 1)
len(cliff_pivot_reset[~is_pure])

Out of 2627 drug pairs, 795 do not exhibit ACs (as expected), and 72 pairs exhibit ACs exclusively. The remaining 1760 pairs exhibit both non-AC and AC behavior.


# Split data for DDC task

In [None]:
# Display the pair table before preparing the splits
cliff_pairs

In [None]:
# Show the pair table with exact duplicate rows removed.
# NOTE(review): the result is only displayed, not assigned, so cliff_pairs itself
# still contains any duplicates — confirm whether deduplication was meant to persist.
cliff_pairs.drop_duplicates()

In [None]:
#target_ids = {t:i for i, t in enumerate(set(cliff_pairs.target))}
#json.dump(target_ids, open('../analysis/target_mapping.json', 'w'))

In [None]:
# Load the target -> id mapping; the context manager closes the file handle
# instead of leaving it open.
with open('../analysis/target_mapping.json') as mapping_file:
    target_ids = load(mapping_file)

# Replace every target with its mapped id (raises KeyError for an unmapped target).
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-mapped values raises KeyError — reload cliff_pairs before re-running.
cliff_pairs['target'] = cliff_pairs['target'].apply(target_ids.__getitem__)

In [None]:
# Run the project helper split_data (imported from dataset) in 'random' mode.
# NOTE(review): the export below is commented out and the following cell rebinds this
# name from a previously saved CSV, so this in-memory result is not what is analysed later.
cliff_pairs_split_random = split_data(cliff_pairs, split='random')
# cliff_pairs_split_random.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_r_wt.csv', index=False)

In [None]:
# Load the saved random-split table from CSV (rebinds cliff_pairs_split_random)
cliff_pairs_split_random = pd.read_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_r_wt.csv')

In [None]:
# Display the random-split table
cliff_pairs_split_random

In [None]:
# Row counts per 'cliff' value in the random-split table
cliff_pairs_split_random['cliff'].value_counts()

In [None]:
# Run the project helper split_data (imported from dataset) in 'compound-based' mode.
# NOTE(review): the export below is commented out and the following cell rebinds this
# name from a previously saved CSV, so this in-memory result is not what is analysed later.
cliff_pairs_split_cb = split_data(cliff_pairs, split='compound-based')
#cliff_pairs_split_cb.to_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_cb_wt.csv', index=False)

In [None]:
# Load the saved compound-based split table from CSV (rebinds cliff_pairs_split_cb)
cliff_pairs_split_cb = pd.read_csv('../analysis/kiba_cliff_pairs_ta1_ts0.9_cb_wt.csv')

In [None]:
# Fraction of all rows falling into each 'cliff' value
cliff_counts = cliff_pairs_split_cb['cliff'].value_counts()
cliff_counts.div(len(cliff_pairs_split_cb))


In [None]:
# (rows, columns) of the compound-based split table
cliff_pairs_split_cb.shape

In [None]:
# Row counts per 'cliff' value in the compound-based split table
cliff_pairs_split_cb['cliff'].value_counts()

# Split data for DTI task

In [None]:
# Load the table used for the DTI task from CSV
aff = pd.read_csv('../analysis/kiba_d_t_aff_smiles.csv')

In [None]:
# Number of distinct values in the 'target' column
aff['target'].nunique()

In [None]:
# Display the DTI table
aff

In [None]:
# (rows, columns) of the DTI table
aff.shape

In [None]:
# Read the existing target -> id mapping from disk
with open('../analysis/target_mapping.json', 'r') as mapping_file:
    target_ids = load(mapping_file)

# Largest id already in use; ids handed out later are allocated above it
# so they cannot collide with existing ones
current_max_id = max(target_ids.values())

# Function to get or create target ID
def get_or_create_target_id(target):
    """Return the id registered for ``target``, allocating a new one if needed.

    Known targets are looked up in the module-level ``target_ids`` mapping.
    An unknown target receives ``current_max_id + 1``; both ``target_ids``
    and the module-level ``current_max_id`` counter are updated as a side
    effect.
    """
    global current_max_id
    try:
        return target_ids[target]
    except KeyError:
        pass
    # First time this target is seen: hand out the next free id and remember it
    current_max_id += 1
    target_ids[target] = current_max_id
    return current_max_id

In [None]:
# Replace each target with its id, allocating fresh ids for targets missing from target_ids.
# NOTE(review): the column is overwritten in place; re-running this cell would pass the
# already-mapped ids through get_or_create_target_id again — reload aff before re-running.
# NOTE(review): ids added to target_ids here are not written back to target_mapping.json
# in the visible cells — confirm whether the mapping should be persisted.
aff['target'] = aff['target'].apply(get_or_create_target_id)

In [None]:
# Run the project helper split_data on the DTI table, passing 'random' positionally
aff_split = split_data(aff, 'random')

In [None]:
# Save the split DTI table; index=False keeps the row index out of the CSV
aff_split.to_csv('../analysis/kiba_d_t_aff_smiles_split.csv', index=False)

In [None]:
# Row counts per value of the 'split' column
aff_split.split.value_counts()