In [None]:
import numpy as np
import pandas as pd
from cliffs import get_similarity_matrix
from json import load
from sklearn import metrics

In [None]:
# Load target sequences and drugs from their respective JSON files
# (the files carry a .txt extension but are parsed with json.load).
# Context managers guarantee both file handles are closed once parsing is done.
with open('../data/KIBA/target_seq.txt') as target_file:
    targets = load(target_file)
with open('../data/KIBA/SMILES.txt') as drug_file:
    drugs = load(drug_file)

In [None]:
# Predictions to analyse -- point the path at another CSV to process a different file.
# Whatever header the CSV carries is replaced by the four column names used by the
# cells below.
unpivoted = pd.read_csv('../analysis/morgan_cnn_kiba_predictions.csv').set_axis(
    ['drug', 'target', 'affinity', 'predicted'], axis=1
)
unpivoted

In [None]:
# Activity-cliff extraction: for every target, pair up drugs that
# get_similarity_matrix flags as similar but whose measured affinities differ
# by more than `threshold_affinity`.
threshold_affinity = 1
threshold_similarity = .9

# One DataFrame of cliff pairs per target; stitched together after the loop.
cliff_pairs_per_target = []

# sort=False keeps the targets in their order of first appearance in `unpivoted`.
for g_name, group in unpivoted.groupby('target', sort=False):
    # Similarity matrix for the drugs measured against this target
    sim = get_similarity_matrix([drugs[x] for x in group.drug], similarity=threshold_similarity)

    # Row/column positions (within `group`) of every non-zero entry, i.e. every similar pair.
    # NOTE(review): if `sim` is symmetric, every pair is listed twice ((i, j) and (j, i)),
    # which would make "group 1" and "group 2" contain the same rows -- confirm against
    # cliffs.get_similarity_matrix.
    i, j = sim.nonzero()
    d1 = group.iloc[i]
    d2 = group.iloc[j]

    # Keep only the pairs whose measured affinities differ by more than the threshold
    is_cliff = np.abs(d1.affinity.values - d2.affinity.values) > threshold_affinity
    cliff1 = d1.iloc[is_cliff]
    cliff2 = d2.iloc[is_cliff]

    # Lay the two compounds of each pair side by side; both rows share the same target,
    # so it is taken from the first compound only.
    paired = pd.DataFrame({
        'target': cliff1.target.values,
        'drug1': cliff1.drug.values,
        'drug2': cliff2.drug.values,
        'affinity1': cliff1.affinity.values,
        'affinity2': cliff2.affinity.values,
        'predicted1': cliff1.predicted.values,
        'predicted2': cliff2.predicted.values,
    })
    cliff_pairs_per_target.append(paired)

# Single table of all cliff pairs. ignore_index=True gives every row a unique label
# instead of repeating 0..k for each target.
groups = pd.concat(cliff_pairs_per_target, ignore_index=True)

In [None]:
# Inspect the combined table of cliff pairs (one row per drug1/drug2 pair)
groups

In [None]:
# Compare group 1 and group 2 (the first and second compound of every pair).
# This cell: R-squared and RMSE between measured and predicted affinity of group 1.
# Naming scheme: <metric>_<group number>_ta<affinity threshold>.
# The `mse_*` variables hold the ROOT mean squared error; the name is kept because
# later cells refer to it.

r2_score_1_ta1 = metrics.r2_score(groups.affinity1, groups.predicted1)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_1_ta1 = np.sqrt(metrics.mean_squared_error(groups.affinity1, groups.predicted1))
r2_score_1_ta1, mse_1_ta1

In [None]:
# R-squared and RMSE between measured and predicted affinity of group 2
# (`mse_2_ta1` holds the RMSE; name kept for the cells that use it later).
r2_score_2_ta1 = metrics.r2_score(groups.affinity2, groups.predicted2)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_2_ta1 = np.sqrt(metrics.mean_squared_error(groups.affinity2, groups.predicted2))
r2_score_2_ta1, mse_2_ta1

In [None]:
# Add the threshold-1 metrics to the stored performance table.
# Each (R-squared, RMSE) tuple is spread over the rows of its new column, so the
# table read from disk must have exactly two rows.
df = pd.read_csv('../analysis/morgan_cnn_kiba_performance.csv').assign(
    Value_1_ta1=(r2_score_1_ta1, mse_1_ta1),
    Value_2_ta1=(r2_score_2_ta1, mse_2_ta1),
)


In [None]:
# Write the table back to the performance CSV; index=False keeps the row index out of the file.
df.to_csv('../analysis/morgan_cnn_kiba_performance.csv', index=False)

In [None]:
# Check whether the model distinguishes the paired compounds: score the measured
# affinity of compound 1 against the prediction made for compound 2 (R-squared, RMSE).
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
metrics.r2_score(groups.affinity1, groups.predicted2), np.sqrt(metrics.mean_squared_error(groups.affinity1, groups.predicted2))

In [None]:
# Reverse cross-check: measured affinity of compound 2 against the prediction made
# for compound 1 (R-squared, RMSE).
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
metrics.r2_score(groups.affinity2, groups.predicted1), np.sqrt(metrics.mean_squared_error(groups.affinity2, groups.predicted1))

In [None]:
# threshold_affinity = 2: keep the pairs whose measured affinities differ by at least 2.
# NOTE(review): this filter is inclusive (>=) while the threshold-1 selection uses a
# strict `>` -- confirm the difference is intended.
groups_2 = groups.loc[groups.affinity1.sub(groups.affinity2).abs().ge(2)]
groups_2

In [None]:
# Threshold 2, group 1: R-squared and RMSE of predicted vs. measured affinity
# (`mse_1_ta2` holds the RMSE; name kept for the cells that use it later).
r2_score_1_ta2 = metrics.r2_score(groups_2.affinity1, groups_2.predicted1)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_1_ta2 = np.sqrt(metrics.mean_squared_error(groups_2.affinity1, groups_2.predicted1))
r2_score_1_ta2, mse_1_ta2

In [None]:
# Threshold 2, group 2: R-squared and RMSE of predicted vs. measured affinity
# (`mse_2_ta2` holds the RMSE; name kept for the cells that use it later).
r2_score_2_ta2 = metrics.r2_score(groups_2.affinity2, groups_2.predicted2)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_2_ta2 = np.sqrt(metrics.mean_squared_error(groups_2.affinity2, groups_2.predicted2))
r2_score_2_ta2, mse_2_ta2

In [None]:
# threshold_affinity = 3: keep the pairs whose measured affinities differ by at least 3
groups_3 = groups.loc[groups.affinity1.sub(groups.affinity2).abs().ge(3)]
groups_3


In [None]:
# Threshold 3, group 1: R-squared and RMSE of predicted vs. measured affinity
# (`mse_1_ta3` holds the RMSE; name kept for the cells that use it later).
r2_score_1_ta3 = metrics.r2_score(groups_3.affinity1, groups_3.predicted1)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_1_ta3 = np.sqrt(metrics.mean_squared_error(groups_3.affinity1, groups_3.predicted1))
r2_score_1_ta3, mse_1_ta3

In [None]:
# Threshold 3, group 2: R-squared and RMSE of predicted vs. measured affinity
# (`mse_2_ta3` holds the RMSE; name kept for the cells that use it later).
r2_score_2_ta3 = metrics.r2_score(groups_3.affinity2, groups_3.predicted2)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_2_ta3 = np.sqrt(metrics.mean_squared_error(groups_3.affinity2, groups_3.predicted2))
r2_score_2_ta3, mse_2_ta3

In [None]:
# threshold_affinity = 4: keep the pairs whose measured affinities differ by at least 4
groups_4 = groups.loc[groups.affinity1.sub(groups.affinity2).abs().ge(4)]
groups_4
# only 18 compounds

In [None]:
# Threshold 4, group 1: R-squared and RMSE of predicted vs. measured affinity
# (`mse_1_ta4` holds the RMSE; name kept for the cells that use it later).
r2_score_1_ta4 = metrics.r2_score(groups_4.affinity1, groups_4.predicted1)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_1_ta4 = np.sqrt(metrics.mean_squared_error(groups_4.affinity1, groups_4.predicted1))
r2_score_1_ta4, mse_1_ta4

In [None]:
# Threshold 4, group 2: R-squared and RMSE of predicted vs. measured affinity
# (`mse_2_ta4` holds the RMSE; name kept for the cells that use it later).
r2_score_2_ta4 = metrics.r2_score(groups_4.affinity2, groups_4.predicted2)
# np.sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
mse_2_ta4 = np.sqrt(metrics.mean_squared_error(groups_4.affinity2, groups_4.predicted2))
r2_score_2_ta4, mse_2_ta4

In [None]:
# threshold_affinity = 5: pairs whose measured affinities differ by at least 5
groups.loc[groups.affinity1.sub(groups.affinity2).abs().ge(5)]
# no compounds


In [None]:
# Re-read the performance table, add the (R-squared, RMSE) tuples for thresholds 2-4
# as new columns -- one column per group and threshold -- and write it back to the
# same file. Each tuple is spread over the rows, so the table must have exactly two rows.
df = pd.read_csv('../analysis/morgan_cnn_kiba_performance.csv').assign(
    Value_1_ta2=(r2_score_1_ta2, mse_1_ta2),
    Value_2_ta2=(r2_score_2_ta2, mse_2_ta2),
    Value_1_ta3=(r2_score_1_ta3, mse_1_ta3),
    Value_2_ta3=(r2_score_2_ta3, mse_2_ta3),
    Value_1_ta4=(r2_score_1_ta4, mse_1_ta4),
    Value_2_ta4=(r2_score_2_ta4, mse_2_ta4),
)
df.to_csv('../analysis/morgan_cnn_kiba_performance.csv', index=False)

In [None]:
# Show the performance table, including the added Value_* columns
df

In [None]:
# Same table flipped: one row per original column, one column per original row
df.T

In [None]:
# Store the transposed table: the former column names are written as the first field
# of each line (index=True) and no header row is emitted (header=None).
# NOTE(review): this overwrites the very file the earlier cells load with pd.read_csv,
# now in transposed layout, so a second "Run All" would read the transposed table --
# confirm this is intended or write to a separate file.
df.T.to_csv('../analysis/morgan_cnn_kiba_performance.csv', index=True, header=None)