In [1]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
targets = ["target_CHEMBL240",
          "target_CHEMBL260",
          "target_CHEMBL279"]

methods = ["RF", "xgboost", "MLP"]


sorting_vars = ["logp"]


In [3]:
# DataFrame to store results
results = []

for target in targets:
    for var in sorting_vars:
        for method in methods:
            
            print(target)
            print(method)
            print(var)
            
            df = pd.read_csv(f"../{target}-1_results/{var}_sorted_fcv_results_{method}_testing batches.csv")
            
            
            # Group by 'testing batch' and calculate RMSE and R^2 for each group
            for batch, group in df.groupby('testing batch'):
                rmse = np.sqrt(mean_squared_error(group['True Values'], group['Predictions']))
                r_squared = r2_score(group['True Values'], group['Predictions'])

                # Collect results in the DataFrame
                results.append({
                    'Target': target,
                    'Sorting_Var': var,
                    'Method': method,
                    'Testing Batch': batch,
                    'RMSE': rmse,
                    'R2': r_squared,
                    'evalt': 'sorted_fcv'
                })


# Convert the list of dictionaries to a DataFrame
final_results = pd.DataFrame(results)
# Save the DataFrame to a CSV file
final_results.to_csv('metrics_sortedforwardCV.csv', index=False)


target_CHEMBL240
RF
logp
target_CHEMBL240
xgboost
logp
target_CHEMBL240
MLP
logp
target_CHEMBL260
RF
logp
target_CHEMBL260
xgboost
logp
target_CHEMBL260
MLP
logp
target_CHEMBL279
RF
logp
target_CHEMBL279
xgboost
logp
target_CHEMBL279
MLP
logp


In [4]:
final_results

Unnamed: 0,Target,Sorting_Var,Method,Testing Batch,RMSE,R2,evalt
0,target_CHEMBL240,logp,RF,1,0.542261,0.330080,sorted_fcv
1,target_CHEMBL240,logp,RF,2,0.619225,-0.173668,sorted_fcv
2,target_CHEMBL240,logp,RF,3,1.110581,-0.120046,sorted_fcv
3,target_CHEMBL240,logp,RF,4,0.751111,0.457861,sorted_fcv
4,target_CHEMBL240,logp,RF,5,0.776810,0.595403,sorted_fcv
...,...,...,...,...,...,...,...
76,target_CHEMBL279,logp,MLP,5,0.863716,0.415135,sorted_fcv
77,target_CHEMBL279,logp,MLP,6,0.959923,0.232422,sorted_fcv
78,target_CHEMBL279,logp,MLP,7,0.967107,0.212932,sorted_fcv
79,target_CHEMBL279,logp,MLP,8,1.071409,-0.100560,sorted_fcv


In [5]:
# DataFrame to store results
results = []

for target in targets:
    for method in methods:

        print(target)
        print(method)
        print(var)

        df = pd.read_csv(f"../{target}-1_results/unsorted_fcv_results_{method}_testing batches.csv")
        
            
        # Group by 'testing batch' and calculate RMSE and R^2 for each group
        for batch, group in df.groupby('testing batch'):
            rmse = np.sqrt(mean_squared_error(group['True Values'], group['Predictions']))
            r_squared = r2_score(group['True Values'], group['Predictions'])

            # Collect results in the DataFrame
            results.append({
                'Target': target,
                'Sorting_Var': "None",
                'Method': method,
                'Testing Batch': batch,
                'RMSE': rmse,
                'R2': r_squared,
                'evalt': 'unsorted_fcv'
            })


# Convert the list of dictionaries to a DataFrame
final_results = pd.DataFrame(results)
# Save the DataFrame to a CSV file

final_results.to_csv('metrics_unsortedforwardCV.csv', index=False)


target_CHEMBL240
RF
logp
target_CHEMBL240
xgboost
logp
target_CHEMBL240
MLP
logp
target_CHEMBL260
RF
logp
target_CHEMBL260
xgboost
logp
target_CHEMBL260
MLP
logp
target_CHEMBL279
RF
logp
target_CHEMBL279
xgboost
logp
target_CHEMBL279
MLP
logp


In [6]:
final_results

Unnamed: 0,Target,Sorting_Var,Method,Testing Batch,RMSE,R2,evalt
0,target_CHEMBL240,,RF,1,0.739077,0.192708,unsorted_fcv
1,target_CHEMBL240,,RF,2,0.580073,0.335591,unsorted_fcv
2,target_CHEMBL240,,RF,3,0.597960,0.430669,unsorted_fcv
3,target_CHEMBL240,,RF,4,0.780805,0.609939,unsorted_fcv
4,target_CHEMBL240,,RF,5,0.730253,0.544975,unsorted_fcv
...,...,...,...,...,...,...,...
76,target_CHEMBL279,,MLP,5,0.829270,0.448045,unsorted_fcv
77,target_CHEMBL279,,MLP,6,0.727529,0.480440,unsorted_fcv
78,target_CHEMBL279,,MLP,7,0.698650,0.607201,unsorted_fcv
79,target_CHEMBL279,,MLP,8,0.746392,0.576676,unsorted_fcv


In [7]:
# DataFrame to store results
results = []

for target in targets:
    for method in methods:

        print(target)
        print(method)
        print(var)

        df = pd.read_csv(f"../{target}-1_results/cv_results_{method}_testing batches.csv")
        # Group by 'testing batch' and calculate RMSE and R^2 for each group
        for batch, group in df.groupby('testing batch'):
            rmse = np.sqrt(mean_squared_error(group['True Values'], group['Predictions']))
            r_squared = r2_score(group['True Values'], group['Predictions'])

            # Collect results in the DataFrame
            results.append({
                'Target': target,
                'Sorting_Var': "None",
                'Method': method,
                'Testing Batch': batch,
                'RMSE': rmse,
                'R2': r_squared,
                'evalt': 'cv'
            })


# Convert the list of dictionaries to a DataFrame
final_results = pd.DataFrame(results)
# Save the DataFrame to a CSV file

final_results.to_csv('metrics_CV.csv', index=False)


target_CHEMBL240
RF
logp
target_CHEMBL240
xgboost
logp
target_CHEMBL240
MLP
logp
target_CHEMBL260
RF
logp
target_CHEMBL260
xgboost
logp
target_CHEMBL260
MLP
logp
target_CHEMBL279
RF
logp
target_CHEMBL279
xgboost
logp
target_CHEMBL279
MLP
logp


In [8]:
final_results

Unnamed: 0,Target,Sorting_Var,Method,Testing Batch,RMSE,R2,evalt
0,target_CHEMBL240,,RF,1,0.588950,0.609461,cv
1,target_CHEMBL240,,RF,2,0.492309,0.783136,cv
2,target_CHEMBL240,,RF,3,0.451102,0.753985,cv
3,target_CHEMBL240,,RF,4,0.494430,0.753029,cv
4,target_CHEMBL240,,RF,5,0.560429,0.789436,cv
...,...,...,...,...,...,...,...
85,target_CHEMBL279,,MLP,6,0.667522,0.597145,cv
86,target_CHEMBL279,,MLP,7,0.748181,0.497309,cv
87,target_CHEMBL279,,MLP,8,0.708776,0.527010,cv
88,target_CHEMBL279,,MLP,9,0.635980,0.635232,cv


In [9]:
# DataFrame to store results
results = []

for target in targets:
    for method in methods:

        print(target)
        print(method)
        print(var)

        df = pd.read_csv(f"../{target}-1_results/scaffold_cv_results_{method}_testing batches.csv")
        # Group by 'testing batch' and calculate RMSE and R^2 for each group
        for batch, group in df.groupby('testing batch'):
            rmse = np.sqrt(mean_squared_error(group['True Values'], group['Predictions']))
            r_squared = r2_score(group['True Values'], group['Predictions'])

            # Collect results in the DataFrame
            results.append({
                'Target': target,
                'Sorting_Var': "None",
                'Method': method,
                'Testing Batch': batch,
                'RMSE': rmse,
                'R2': r_squared,
                'evalt': 'scaffold_CV'
            })


# Convert the list of dictionaries to a DataFrame
final_results = pd.DataFrame(results)
# Save the DataFrame to a CSV file

final_results.to_csv('metrics_scaffold_CV.csv', index=False)


target_CHEMBL240
RF
logp
target_CHEMBL240
xgboost
logp
target_CHEMBL240
MLP
logp
target_CHEMBL260
RF
logp
target_CHEMBL260
xgboost
logp
target_CHEMBL260
MLP
logp
target_CHEMBL279
RF
logp
target_CHEMBL279
xgboost
logp
target_CHEMBL279
MLP
logp


In [10]:

metrics_sortedforwardCV = pd.read_csv("metrics_sortedforwardCV.csv")
metrics_unsortedforwardCV = pd.read_csv("metrics_unsortedforwardCV.csv")
metrics_CV = pd.read_csv("metrics_CV.csv")
metrics_scaffold_CV = pd.read_csv("metrics_scaffold_CV.csv")

# Concatenating the DataFrames
combined_metrics = pd.concat([metrics_sortedforwardCV, metrics_unsortedforwardCV, metrics_CV, metrics_scaffold_CV], axis=0)

# Optional: Reset index if you want a clean, sequential index in the new DataFrame
combined_metrics.reset_index(drop=True, inplace=True)

# Save the concatenated DataFrame to a CSV file
combined_metrics.to_csv('combined_metrics.csv', index=False)

# Display the DataFrame to verify the contents
combined_metrics


Unnamed: 0,Target,Sorting_Var,Method,Testing Batch,RMSE,R2,evalt
0,target_CHEMBL240,logp,RF,1,0.542261,0.330080,sorted_fcv
1,target_CHEMBL240,logp,RF,2,0.619225,-0.173668,sorted_fcv
2,target_CHEMBL240,logp,RF,3,1.110581,-0.120046,sorted_fcv
3,target_CHEMBL240,logp,RF,4,0.751111,0.457861,sorted_fcv
4,target_CHEMBL240,logp,RF,5,0.776810,0.595403,sorted_fcv
...,...,...,...,...,...,...,...
337,target_CHEMBL279,,MLP,6,0.909121,0.461547,scaffold_CV
338,target_CHEMBL279,,MLP,7,0.890862,0.354422,scaffold_CV
339,target_CHEMBL279,,MLP,8,0.879394,0.318483,scaffold_CV
340,target_CHEMBL279,,MLP,9,0.818693,0.550499,scaffold_CV
