In [65]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score


In [66]:
targets = ["target_CHEMBL240",
          "target_CHEMBL260",
          "target_CHEMBL279"]

methods = ["RF", "xgboost", "MLP"]


sorting = [True, False]

sorting_vars = ["logp"] #, "pchembl"]

druglike = [True, False]

In [67]:



# Define a function to calculate metrics
def calculate_metrics(group):
    total_compounds = len(group)
    low_error_count = sum(abs(group['True Values'] - group['Predictions']) <= 0.5)
    discovery_yield = low_error_count / total_compounds if total_compounds > 0 else 0
    return pd.Series({
        'Total Ideal Compounds': total_compounds,
        'Low Error Count': low_error_count,
        'Discovery Yield': discovery_yield
    })



In [68]:
all_results = pd.DataFrame()

# Loop through each configuration
for target in targets:
    for var in sorting_vars:
        for method in methods:
            print("sortedFCV")
            print(target)
            print(method)
            print(var)

            # Load the DataFrame from a CSV file
            df = pd.read_csv(f"../{target}-1_results/{var}_sorted_fcv_results_{method}_testing batches.csv")
    
            if (target == "target_CHEMBL240"):
                disc_yld = df[df["True Values"] < 5.2]
                
            if (target != "target_CHEMBL240"):
                disc_yld = df[df["True Values"] > 7.0]
            
            # Group by Testing Batch and apply the function
            batch_metrics = disc_yld.groupby('testing batch').apply(calculate_metrics).reset_index()
            # Calculate the maximum number of compounds in any batch
            max_compounds = batch_metrics['Total Ideal Compounds'].max()

            # Standardize the discovery yield
            batch_metrics['Standardized Discovery Yield'] = batch_metrics['Discovery Yield'] * (max_compounds / batch_metrics['Total Ideal Compounds'])
            batch_metrics['Target'] = target
            batch_metrics['Variable'] = var
            batch_metrics['Method'] = method
            batch_metrics['evalt'] = "sorted_fcv"

            # Append results
            all_results = pd.concat([all_results, batch_metrics], ignore_index=True)


# Optionally, save all results to a single CSV file
all_results.to_csv('dy_sorted_fcv_discovery_yield_results.csv', index=False)
all_results

sortedFCV
target_CHEMBL240
RF
logp
sortedFCV
target_CHEMBL240
xgboost
logp
sortedFCV
target_CHEMBL240
MLP
logp
sortedFCV
target_CHEMBL260
RF
logp
sortedFCV
target_CHEMBL260
xgboost
logp
sortedFCV
target_CHEMBL260
MLP
logp
sortedFCV
target_CHEMBL279
RF
logp
sortedFCV
target_CHEMBL279
xgboost
logp
sortedFCV
target_CHEMBL279
MLP
logp


Unnamed: 0,testing batch,Total Ideal Compounds,Low Error Count,Discovery Yield,Standardized Discovery Yield,Target,Variable,Method,evalt
0,1,21.0,14.0,0.666667,2.539683,target_CHEMBL240,logp,RF,sorted_fcv
1,2,37.0,23.0,0.621622,1.344047,target_CHEMBL240,logp,RF,sorted_fcv
2,3,24.0,8.0,0.333333,1.111111,target_CHEMBL240,logp,RF,sorted_fcv
3,4,43.0,18.0,0.418605,0.778799,target_CHEMBL240,logp,RF,sorted_fcv
4,5,44.0,23.0,0.522727,0.950413,target_CHEMBL240,logp,RF,sorted_fcv
...,...,...,...,...,...,...,...,...,...
76,5,73.0,32.0,0.438356,0.630512,target_CHEMBL279,logp,MLP,sorted_fcv
77,6,69.0,32.0,0.463768,0.705734,target_CHEMBL279,logp,MLP,sorted_fcv
78,7,71.0,30.0,0.422535,0.624876,target_CHEMBL279,logp,MLP,sorted_fcv
79,8,78.0,25.0,0.320513,0.431460,target_CHEMBL279,logp,MLP,sorted_fcv


In [69]:
all_results = pd.DataFrame()

# Loop through each configuration
for target in targets:
    for method in methods:
        print("sortedFCV")
        print(target)
        print(method)
        print(var)

        # Load the DataFrame from a CSV file
        df = pd.read_csv(f"../{target}-1_results/unsorted_fcv_results_{method}_testing batches.csv")

        if (target == "target_CHEMBL240"):
            disc_yld = df[df["True Values"] < 5.2]

        if (target != "target_CHEMBL240"):
            disc_yld = df[df["True Values"] > 7.0]

        # Group by Testing Batch and apply the function
        batch_metrics = disc_yld.groupby('testing batch').apply(calculate_metrics).reset_index()
        # Calculate the maximum number of compounds in any batch
        max_compounds = batch_metrics['Total Ideal Compounds'].max()

        # Standardize the discovery yield
        batch_metrics['Standardized Discovery Yield'] = batch_metrics['Discovery Yield'] * (max_compounds / batch_metrics['Total Ideal Compounds'])
        batch_metrics['Target'] = target
        batch_metrics['Method'] = method
        batch_metrics['evalt'] = "unsorted_fcv"

        # Append results
        all_results = pd.concat([all_results, batch_metrics], ignore_index=True)


# Optionally, save all results to a single CSV file
all_results.to_csv('dy_unsorted_fcv_discovery_yield_results.csv', index=False)
all_results

sortedFCV
target_CHEMBL240
RF
logp
sortedFCV
target_CHEMBL240
xgboost
logp
sortedFCV
target_CHEMBL240
MLP
logp
sortedFCV
target_CHEMBL260
RF
logp
sortedFCV
target_CHEMBL260
xgboost
logp
sortedFCV
target_CHEMBL260
MLP
logp
sortedFCV
target_CHEMBL279
RF
logp
sortedFCV
target_CHEMBL279
xgboost
logp
sortedFCV
target_CHEMBL279
MLP
logp


Unnamed: 0,testing batch,Total Ideal Compounds,Low Error Count,Discovery Yield,Standardized Discovery Yield,Target,Method,evalt
0,1,39.0,26.0,0.666667,0.974359,target_CHEMBL240,RF,unsorted_fcv
1,2,57.0,39.0,0.684211,0.684211,target_CHEMBL240,RF,unsorted_fcv
2,3,49.0,31.0,0.632653,0.735943,target_CHEMBL240,RF,unsorted_fcv
3,4,48.0,21.0,0.437500,0.519531,target_CHEMBL240,RF,unsorted_fcv
4,5,33.0,20.0,0.606061,1.046832,target_CHEMBL240,RF,unsorted_fcv
...,...,...,...,...,...,...,...,...
76,5,88.0,48.0,0.545455,0.582645,target_CHEMBL279,MLP,unsorted_fcv
77,6,87.0,56.0,0.643678,0.695468,target_CHEMBL279,MLP,unsorted_fcv
78,7,86.0,40.0,0.465116,0.508383,target_CHEMBL279,MLP,unsorted_fcv
79,8,82.0,50.0,0.609756,0.698989,target_CHEMBL279,MLP,unsorted_fcv


In [77]:
all_results = pd.DataFrame()

# Loop through each configuration
for target in targets:
    for method in methods:
        print("sortedFCV")
        print(target)
        print(method)
        print(var)

        # Load the DataFrame from a CSV file
        df = pd.read_csv(f"../{target}-1_results/cv_results_{method}_testing batches.csv")

        if (target == "target_CHEMBL240"):
            disc_yld = df[df["True Values"] < 5.2]

        if (target != "target_CHEMBL240"):
            disc_yld = df[df["True Values"] > 7.0]

        # Group by Testing Batch and apply the function
        batch_metrics = disc_yld.groupby('testing batch').apply(calculate_metrics).reset_index()
        # Calculate the maximum number of compounds in any batch
        max_compounds = batch_metrics['Total Ideal Compounds'].max()

        # Standardize the discovery yield
        batch_metrics['Standardized Discovery Yield'] = batch_metrics['Discovery Yield'] * (max_compounds / batch_metrics['Total Ideal Compounds'])
        batch_metrics['Target'] = target
        batch_metrics['Method'] = method
        batch_metrics['evalt'] = "cv"

        # Append results
        all_results = pd.concat([all_results, batch_metrics], ignore_index=True)


# Optionally, save all results to a single CSV file
all_results.to_csv('dy_cv_discovery_yield_results.csv', index=False)
all_results

sortedFCV
target_CHEMBL240
RF
logp
sortedFCV
target_CHEMBL240
xgboost
logp
sortedFCV
target_CHEMBL240
MLP
logp
sortedFCV
target_CHEMBL260
RF
logp
sortedFCV
target_CHEMBL260
xgboost
logp
sortedFCV
target_CHEMBL260
MLP
logp
sortedFCV
target_CHEMBL279
RF
logp
sortedFCV
target_CHEMBL279
xgboost
logp
sortedFCV
target_CHEMBL279
MLP
logp


Unnamed: 0,testing batch,Total Ideal Compounds,Low Error Count,Discovery Yield,Standardized Discovery Yield,Target,Method,evalt
0,1,51.0,40.0,0.784314,0.845829,target_CHEMBL240,RF,cv
1,2,41.0,31.0,0.756098,1.014277,target_CHEMBL240,RF,cv
2,3,55.0,40.0,0.727273,0.727273,target_CHEMBL240,RF,cv
3,4,52.0,36.0,0.692308,0.732249,target_CHEMBL240,RF,cv
4,5,51.0,42.0,0.823529,0.888120,target_CHEMBL240,RF,cv
...,...,...,...,...,...,...,...,...
85,6,82.0,48.0,0.585366,0.678168,target_CHEMBL279,MLP,cv
86,7,86.0,53.0,0.616279,0.680773,target_CHEMBL279,MLP,cv
87,8,89.0,53.0,0.595506,0.635652,target_CHEMBL279,MLP,cv
88,9,87.0,51.0,0.586207,0.640111,target_CHEMBL279,MLP,cv


In [78]:
all_results = pd.DataFrame()

# Loop through each configuration
for target in targets:
    for method in methods:
        print("sortedFCV")
        print(target)
        print(method)
        print(var)

        # Load the DataFrame from a CSV file
        df = pd.read_csv(f"../{target}-1_results/scaffold_cv_results_{method}_testing batches.csv")

        if (target == "target_CHEMBL240"):
            disc_yld = df[df["True Values"] < 5.2]

        if (target != "target_CHEMBL240"):
            disc_yld = df[df["True Values"] > 7.0]

        # Group by Testing Batch and apply the function
        batch_metrics = disc_yld.groupby('testing batch').apply(calculate_metrics).reset_index()
        # Calculate the maximum number of compounds in any batch
        max_compounds = batch_metrics['Total Ideal Compounds'].max()

        # Standardize the discovery yield
        batch_metrics['Standardized Discovery Yield'] = batch_metrics['Discovery Yield'] * (max_compounds / batch_metrics['Total Ideal Compounds'])
        batch_metrics['Target'] = target
        batch_metrics['Method'] = method
        batch_metrics['evalt'] = "scaffold_cv"

        # Append results
        all_results = pd.concat([all_results, batch_metrics], ignore_index=True)


# Optionally, save all results to a single CSV file
all_results.to_csv('dy_scaffold_cv_discovery_yield_results.csv', index=False)
all_results

sortedFCV
target_CHEMBL240
RF
logp
sortedFCV
target_CHEMBL240
xgboost
logp
sortedFCV
target_CHEMBL240
MLP
logp
sortedFCV
target_CHEMBL260
RF
logp
sortedFCV
target_CHEMBL260
xgboost
logp
sortedFCV
target_CHEMBL260
MLP
logp
sortedFCV
target_CHEMBL279
RF
logp
sortedFCV
target_CHEMBL279
xgboost
logp
sortedFCV
target_CHEMBL279
MLP
logp


Unnamed: 0,testing batch,Total Ideal Compounds,Low Error Count,Discovery Yield,Standardized Discovery Yield,Target,Method,evalt
0,1,20.0,5.0,0.250000,0.787500,target_CHEMBL240,RF,scaffold_cv
1,2,27.0,20.0,0.740741,1.728395,target_CHEMBL240,RF,scaffold_cv
2,3,58.0,39.0,0.672414,0.730380,target_CHEMBL240,RF,scaffold_cv
3,4,63.0,46.0,0.730159,0.730159,target_CHEMBL240,RF,scaffold_cv
4,5,36.0,20.0,0.555556,0.972222,target_CHEMBL240,RF,scaffold_cv
...,...,...,...,...,...,...,...,...
85,6,80.0,35.0,0.437500,0.612500,target_CHEMBL279,MLP,scaffold_cv
86,7,94.0,44.0,0.468085,0.557718,target_CHEMBL279,MLP,scaffold_cv
87,8,80.0,40.0,0.500000,0.700000,target_CHEMBL279,MLP,scaffold_cv
88,9,79.0,40.0,0.506329,0.717834,target_CHEMBL279,MLP,scaffold_cv


In [79]:

dy_sortedforwardCV = pd.read_csv("dy_sorted_fcv_discovery_yield_results.csv")
dy_unsortedforwardCV = pd.read_csv("dy_unsorted_fcv_discovery_yield_results.csv")
dy_CV = pd.read_csv("dy_cv_discovery_yield_results.csv")
dy_scaffold_CV = pd.read_csv("dy_scaffold_cv_discovery_yield_results.csv")

# Concatenating the DataFrames
combined_metrics = pd.concat([dy_sortedforwardCV, dy_unsortedforwardCV, dy_CV, dy_scaffold_CV], axis=0)

# Optional: Reset index if you want a clean, sequential index in the new DataFrame
combined_metrics.reset_index(drop=True, inplace=True)

# Save the concatenated DataFrame to a CSV file
combined_metrics.to_csv('combined_discovery_yield.csv', index=False)

# Display the DataFrame to verify the contents
combined_metrics


Unnamed: 0,testing batch,Total Ideal Compounds,Low Error Count,Discovery Yield,Standardized Discovery Yield,Target,Variable,Method,evalt
0,1,21.0,14.0,0.666667,2.539683,target_CHEMBL240,logp,RF,sorted_fcv
1,2,37.0,23.0,0.621622,1.344047,target_CHEMBL240,logp,RF,sorted_fcv
2,3,24.0,8.0,0.333333,1.111111,target_CHEMBL240,logp,RF,sorted_fcv
3,4,43.0,18.0,0.418605,0.778799,target_CHEMBL240,logp,RF,sorted_fcv
4,5,44.0,23.0,0.522727,0.950413,target_CHEMBL240,logp,RF,sorted_fcv
...,...,...,...,...,...,...,...,...,...
337,6,80.0,35.0,0.437500,0.612500,target_CHEMBL279,,MLP,scaffold_cv
338,7,94.0,44.0,0.468085,0.557718,target_CHEMBL279,,MLP,scaffold_cv
339,8,80.0,40.0,0.500000,0.700000,target_CHEMBL279,,MLP,scaffold_cv
340,9,79.0,40.0,0.506329,0.717834,target_CHEMBL279,,MLP,scaffold_cv


In [81]:
combined_metrics.evalt.value_counts()

cv              90
scaffold_cv     90
sorted_fcv      81
unsorted_fcv    81
Name: evalt, dtype: int64