In [30]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

In [31]:
# Target identifiers; each one selects the results directory "../{target}-1_results".
targets = ["target_CHEMBL240"]

# Model/method tags that appear in the result CSV file names
# (presumably "RF" = random forest -- confirm against the training notebook).
methods = ["RF"]



# Sorting variables that prefix the sorted-FCV result file names.
sorting_vars = ["logp"]  # "pchembl" is currently disabled


In [32]:
# Set up pandarallel so that Series expose `.parallel_apply`, which the
# fingerprinting steps in the following cells rely on.
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [33]:

# Sorted forward cross-validation (FCV): nearest-training-neighbour similarity.
#
# For every (target, sorting variable, method) configuration the testing and
# training batch tables are read from CSV, and each test compound receives a
# 'Similarity' value: its maximum Tanimoto similarity (Morgan fingerprint,
# radius 2) to the training compounds carrying the same batch number.


def smiles_to_fp(smiles):
    """Return the radius-2 Morgan bit-vector fingerprint for a SMILES string.

    Raises:
        ValueError: if RDKit cannot parse `smiles` (MolFromSmiles returns None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending SMILES instead of an opaque RDKit argument error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)


def max_tanimoto(test_smile, train_fps):
    """Return the highest Tanimoto similarity between one test SMILES and `train_fps`.

    Args:
        test_smile: SMILES string of the test compound.
        train_fps: list of precomputed training fingerprints.

    Returns:
        The maximum similarity as a float, or NaN when `train_fps` is empty.
    """
    test_fp = smiles_to_fp(test_smile)
    # One bulk call replaces a Python-level loop over TanimotoSimilarity.
    similarities = DataStructs.BulkTanimotoSimilarity(test_fp, train_fps)
    return max(similarities, default=float("nan"))


# One scored copy of the test table per configuration.
config_frames = []

# Loop through each configuration
for target in targets:
    for var in sorting_vars:
        for method in methods:
            print("sortedFCV")
            print(target)
            print(method)
            print(var)

            # Load the testing and training batch tables
            test_df = pd.read_csv(f"../{target}-1_results/{var}_sorted_fcv_results_{method}_testing batches.csv")
            train_df = pd.read_csv(f"../{target}-1_results/{var}_sorted_fcv_results_{method}_training batches.csv")

            # Precompute fingerprints for the training set (in parallel)
            train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(smiles_to_fp)

            # Similarities are written by row position of test_df, so the result
            # stays aligned with the test table. Merging on Standardized_SMILES
            # would duplicate rows whenever a SMILES occurs more than once.
            similarity = pd.Series(np.nan, index=test_df.index, dtype=float)

            # Analyze each batch
            for batch in test_df['testing batch'].unique():
                in_batch = test_df['testing batch'] == batch
                train_fps = train_df.loc[train_df['training batch'] == batch, 'Fingerprint'].tolist()
                similarity.loc[in_batch] = [
                    max_tanimoto(smiles, train_fps)
                    for smiles in test_df.loc[in_batch, 'Standardized_SMILES']
                ]

            config_frames.append(test_df.assign(Similarity=similarity))

# Display the results DataFrame (all configurations stacked, built in one concat
# instead of growing a DataFrame row by row)
results_df = pd.concat(config_frames, ignore_index=True)
results_df

sortedFCV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,COc1cc(C(O)(CCN(C)C)C(c2cc3cc(Br)ccc3nc2OC)c2c...,5.66,5.266364,0.893939
1,1,CN1CCC(COCc2cc(C(F)(F)F)cc(C3(F)CC3)n2)(c2ccc(...,5.37,4.853636,0.733333
2,1,CNC(C)c1cc(C)ccc1Oc1ccc(Cl)c(Cl)c1,5.57,5.618182,0.688889
3,1,O=C(NC(C1CC1)C1C2CC(n3cnc4cc(F)c(F)cc43)CC21)c...,6.25,6.266364,0.611940
4,1,CC(C)(NC(=O)c1ccc(Cl)cc1)C1C2CC(n3cnc4cc(F)c(F...,5.80,5.980000,0.542857
...,...,...,...,...,...
1131,9,N#Cc1ccc(Cn2cncc2CNC2CCN(C(=O)c3ccc[nH]c3=O)C2...,4.30,5.049600,0.869565
1132,9,N#Cc1ccc(=NC(=O)C(COCCO)Oc2ncnc3c2cnn3-c2ncccc...,4.30,4.752400,0.674699
1133,9,CNS(=O)(=O)CCN(CCc1ccc(Cl)cc1)C1CCN(c2nc(=N)[n...,4.80,5.300000,0.700000
1134,9,CS(=O)(=O)NCCN(CCc1ccc(Cl)cc1)C1CCN(c2nc(=N)[n...,5.14,5.300000,0.700000


In [35]:

# Unsorted forward cross-validation (FCV): nearest-training-neighbour similarity.
#
# For every (target, method) configuration the testing and training batch
# tables are read from CSV, and each test compound receives a 'Similarity'
# value: its maximum Tanimoto similarity (Morgan fingerprint, radius 2) to the
# training compounds carrying the same batch number.


def smiles_to_fp(smiles):
    """Return the radius-2 Morgan bit-vector fingerprint for a SMILES string.

    Raises:
        ValueError: if RDKit cannot parse `smiles` (MolFromSmiles returns None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending SMILES instead of an opaque RDKit argument error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)


def max_tanimoto(test_smile, train_fps):
    """Return the highest Tanimoto similarity between one test SMILES and `train_fps`.

    Args:
        test_smile: SMILES string of the test compound.
        train_fps: list of precomputed training fingerprints.

    Returns:
        The maximum similarity as a float, or NaN when `train_fps` is empty.
    """
    test_fp = smiles_to_fp(test_smile)
    # One bulk call replaces a Python-level loop over TanimotoSimilarity.
    similarities = DataStructs.BulkTanimotoSimilarity(test_fp, train_fps)
    return max(similarities, default=float("nan"))


# One scored copy of the test table per configuration.
config_frames = []

# Loop through each configuration
for target in targets:
    for method in methods:
        # No sorting variable is printed: this split has none, and this cell
        # does not define a `var` of its own.
        print("unsortedFCV")
        print(target)
        print(method)

        # Load the testing and training batch tables
        test_df = pd.read_csv(f"../{target}-1_results/unsorted_fcv_results_{method}_testing batches.csv")
        train_df = pd.read_csv(f"../{target}-1_results/unsorted_fcv_results_{method}_training batches.csv")

        # Precompute fingerprints for the training set (in parallel)
        train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(smiles_to_fp)

        # Similarities are written by row position of test_df, so the result
        # stays aligned with the test table. Merging on Standardized_SMILES
        # would duplicate rows whenever a SMILES occurs more than once.
        similarity = pd.Series(np.nan, index=test_df.index, dtype=float)

        # Analyze each batch
        for batch in test_df['testing batch'].unique():
            in_batch = test_df['testing batch'] == batch
            train_fps = train_df.loc[train_df['training batch'] == batch, 'Fingerprint'].tolist()
            similarity.loc[in_batch] = [
                max_tanimoto(smiles, train_fps)
                for smiles in test_df.loc[in_batch, 'Standardized_SMILES']
            ]

        config_frames.append(test_df.assign(Similarity=similarity))

# Display the results DataFrame (all configurations stacked, built in one concat
# instead of growing a DataFrame row by row)
results_df = pd.concat(config_frames, ignore_index=True)
results_df

unsortedFCV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,CC1CN(CCc2ccc([N+](=O)[O-])cc2)CCN1CCc1ccc([N+...,9.12,7.581818,0.551020
1,1,CC1c2ccc(C#N)c(c2)Oc2cccc(c2)CN2CCC(NCc3cncn31...,4.89,6.087273,0.194915
2,1,CC=Cc1cc(C(F)(F)F)cc(COCC2(c3ccc(F)cc3)CCN(C)C...,4.89,4.814545,0.786885
3,1,CCC(C)C(C)CN1CCC(CNC(=O)c2cc(Cl)cc(Cl)c2)CC1,6.05,5.487273,0.705882
4,1,CCC(CC)CN=c1c(C(N)=O)c(C)[nH]c2ccc(-c3cccnc3)cc12,6.17,5.305000,0.435897
...,...,...,...,...,...
1131,9,c1ccc2c3c([nH]c2c1)C(C1CCOCC1)NC(c1nc(-c2cscn2...,5.36,6.345600,0.705882
1132,9,c1ccc2c3c([nH]c2c1)C(C1CCOCC1)NC(c1nc(C24CC5CC...,6.30,6.016800,0.676923
1133,9,c1ccc2c3c([nH]c2c1)C(C1CCOCC1)NC(c1nc(C2CCCCC2...,6.87,6.198800,0.698413
1134,9,c1cncc(-c2c[nH]c(C3Cc4c([nH]c5ccccc45)C(C4CCCC...,6.72,6.631200,0.628571


In [36]:

# Cross-validation (CV): nearest-training-neighbour similarity.
#
# For every (target, method) configuration the testing and training batch
# tables are read from CSV, and each test compound receives a 'Similarity'
# value: its maximum Tanimoto similarity (Morgan fingerprint, radius 2) to the
# training compounds carrying the same batch number.


def smiles_to_fp(smiles):
    """Return the radius-2 Morgan bit-vector fingerprint for a SMILES string.

    Raises:
        ValueError: if RDKit cannot parse `smiles` (MolFromSmiles returns None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending SMILES instead of an opaque RDKit argument error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)


def max_tanimoto(test_smile, train_fps):
    """Return the highest Tanimoto similarity between one test SMILES and `train_fps`.

    Args:
        test_smile: SMILES string of the test compound.
        train_fps: list of precomputed training fingerprints.

    Returns:
        The maximum similarity as a float, or NaN when `train_fps` is empty.
    """
    test_fp = smiles_to_fp(test_smile)
    # One bulk call replaces a Python-level loop over TanimotoSimilarity.
    similarities = DataStructs.BulkTanimotoSimilarity(test_fp, train_fps)
    return max(similarities, default=float("nan"))


# One scored copy of the test table per configuration.
config_frames = []

# Loop through each configuration
for target in targets:
    for method in methods:
        # No sorting variable is printed: this split has none, and this cell
        # does not define a `var` of its own.
        print("CV")
        print(target)
        print(method)

        # Load the testing and training batch tables
        test_df = pd.read_csv(f"../{target}-1_results/cv_results_{method}_testing batches.csv")
        train_df = pd.read_csv(f"../{target}-1_results/cv_results_{method}_training batches.csv")

        # Precompute fingerprints for the training set (in parallel)
        train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(smiles_to_fp)

        # Similarities are written by row position of test_df, so the result
        # stays aligned with the test table. Merging on Standardized_SMILES
        # would duplicate rows whenever a SMILES occurs more than once.
        similarity = pd.Series(np.nan, index=test_df.index, dtype=float)

        # Analyze each batch
        for batch in test_df['testing batch'].unique():
            in_batch = test_df['testing batch'] == batch
            train_fps = train_df.loc[train_df['training batch'] == batch, 'Fingerprint'].tolist()
            similarity.loc[in_batch] = [
                max_tanimoto(smiles, train_fps)
                for smiles in test_df.loc[in_batch, 'Standardized_SMILES']
            ]

        config_frames.append(test_df.assign(Similarity=similarity))

# Display the results DataFrame (all configurations stacked, built in one concat
# instead of growing a DataFrame row by row)
results_df = pd.concat(config_frames, ignore_index=True)
results_df

CV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,O=C1N(CCN2Cc3ccccc3C2)CCN1Cc1cccc(C(F)(F)F)c1,5.20,5.7272,0.693878
1,1,Cc1oc(-c2ccccc2)nc1C(=O)N1CCC(Oc2ccc(CN3CCCC3)...,5.52,5.0024,0.703125
2,1,COc1ccc(-c2nnc(C(=O)N3CC(Oc4ccc(CN5CCC(C)(O)C5...,4.58,4.9432,0.731343
3,1,CNC(C)c1cc(C(F)(F)F)ccc1Oc1ccc(Cl)c(Cl)c1,5.52,5.5756,0.688889
4,1,COCCN(CCc1ccc(Cl)cc1)C1CCN(c2nc(=N)[nH][nH]2)CC1,5.55,5.1520,0.767857
...,...,...,...,...,...
1257,10,COc1cc(N2CCN(CCO)CC2)ccc1N=c1nc(-c2cnc3ccccn23...,5.43,5.1562,0.820513
1258,10,Cc1nnc(C2(c3cnn(C)c3)NC(c3nc(-c4ccc(F)cn4)c[nH...,5.50,4.9346,0.860759
1259,10,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(C(C)(C)C)cc3...,6.20,6.1452,0.815385
1260,10,NC(=O)c1cccc(NC2CC3CCC(C2)N3Cc2ccccc2)c1,5.96,5.9392,0.666667


In [37]:

# Scaffold cross-validation: nearest-training-neighbour similarity.
#
# For every (target, method) configuration the testing and training batch
# tables are read from CSV, and each test compound receives a 'Similarity'
# value: its maximum Tanimoto similarity (Morgan fingerprint, radius 2) to the
# training compounds carrying the same batch number.


def smiles_to_fp(smiles):
    """Return the radius-2 Morgan bit-vector fingerprint for a SMILES string.

    Raises:
        ValueError: if RDKit cannot parse `smiles` (MolFromSmiles returns None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending SMILES instead of an opaque RDKit argument error.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)


def max_tanimoto(test_smile, train_fps):
    """Return the highest Tanimoto similarity between one test SMILES and `train_fps`.

    Args:
        test_smile: SMILES string of the test compound.
        train_fps: list of precomputed training fingerprints.

    Returns:
        The maximum similarity as a float, or NaN when `train_fps` is empty.
    """
    test_fp = smiles_to_fp(test_smile)
    # One bulk call replaces a Python-level loop over TanimotoSimilarity.
    similarities = DataStructs.BulkTanimotoSimilarity(test_fp, train_fps)
    return max(similarities, default=float("nan"))


# One scored copy of the test table per configuration.
config_frames = []

# Loop through each configuration
for target in targets:
    for method in methods:
        # Labelled "scaffoldCV" so this run's output can be told apart from the
        # plain CV run. No sorting variable is printed: this split has none,
        # and this cell does not define a `var` of its own.
        print("scaffoldCV")
        print(target)
        print(method)

        # Load the testing and training batch tables
        test_df = pd.read_csv(f"../{target}-1_results/scaffold_cv_results_{method}_testing batches.csv")
        train_df = pd.read_csv(f"../{target}-1_results/scaffold_cv_results_{method}_training batches.csv")

        # Precompute fingerprints for the training set (in parallel)
        train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(smiles_to_fp)

        # Similarities are written by row position of test_df, so the result
        # stays aligned with the test table. Merging on Standardized_SMILES
        # would duplicate rows whenever a SMILES occurs more than once.
        similarity = pd.Series(np.nan, index=test_df.index, dtype=float)

        # Analyze each batch
        for batch in test_df['testing batch'].unique():
            in_batch = test_df['testing batch'] == batch
            train_fps = train_df.loc[train_df['training batch'] == batch, 'Fingerprint'].tolist()
            similarity.loc[in_batch] = [
                max_tanimoto(smiles, train_fps)
                for smiles in test_df.loc[in_batch, 'Standardized_SMILES']
            ]

        config_frames.append(test_df.assign(Similarity=similarity))

# Display the results DataFrame (all configurations stacked, built in one concat
# instead of growing a DataFrame row by row)
results_df = pd.concat(config_frames, ignore_index=True)
results_df

CV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,CC(C)(O)CCOc1ccc2ncc(F)c(CCC34CCC(NCc5ccc6c(n5...,5.16,5.4262,0.790698
1,1,CC(C)CCOc1ccc2ncc(F)c(CCC34CCC(NCc5ccc6c(n5)NC...,5.46,5.2590,0.788235
2,1,CCOC(=O)C(CC12CCC(NCc3ccc4c(n3)NC(=O)CO4)(CC1)...,5.08,5.6268,0.719101
3,1,CCOC(=O)C(F)(F)CCOc1ccc2ncc(F)c(CCC34CCC(NCc5c...,4.59,5.4488,0.739130
4,1,CCOC(=O)CCCOc1ccc2ncc(F)c(CCC34CCC(NCc5ccc6c(n...,6.00,6.5384,0.755556
...,...,...,...,...,...
1257,10,Cn1c(SCCCN2CCC3CC3(c3ccc(C(F)(F)F)cc3)CC2)nnc1...,5.30,5.9700,0.706667
1258,10,Cn1c(SCCCN2CCc3ccc4oc(C(F)(F)F)nc4c3CC2)nnc1-c...,6.60,5.7000,0.800000
1259,10,Cn1cc(-c2ccc(OC3CCN(C(=O)Cc4ccc(OC(F)(F)F)cc4)...,5.17,5.2496,0.573171
1260,10,Cn1cc(-c2ccc(OC3CCN(C(=O)Cc4ccc(OC(F)(F)F)cc4)...,5.35,4.9296,0.763889
