In [143]:
import pandas as pd
import numpy as np
from rdkit import Chem
from mordred import Calculator, descriptors
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [144]:
targets = ["target_CHEMBL240"]
methods = ["RF"]
sorting_vars = ["logp"] #, "pchembl"]

# Initialize a Mordred descriptor calculator
calculator = Calculator(descriptors, ignore_3D=True)

# Function to compute Mordred descriptors for a given SMILES string
def compute_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return calculator(mol)
    else:
        return pd.Series([np.nan] * len(calculator.descriptors))


In [155]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [161]:
# Loop through each configuration
for target in targets:
    for var in sorting_vars:
        for method in methods:
            print("sortedFCV")
            print(target)
            print(method)
            print(var)

            # Load the DataFrame from a CSV file
            test_df = pd.read_csv(f"../{target}-1_results/{var}_sorted_fcv_results_{method}_testing batches.csv")
            train_df = pd.read_csv(f"../{target}-1_results/{var}_sorted_fcv_results_{method}_training batches.csv")
            
            # Combine training and testing data
            train_df['Set'] = 'Train'
            train_df = train_df.rename({'training batch': 'batch'}, axis=1)
            test_df['Set'] = 'Test'
            test_df = test_df.rename({'testing batch': 'batch'}, axis=1)
            combined_df = pd.concat([train_df, test_df], ignore_index=True)

            # Compute Mordred descriptors for unique molecules
            unique_smiles = combined_df['Standardized_SMILES'].drop_duplicates()
            unique_descriptors = unique_smiles.parallel_apply(lambda x: pd.Series(compute_descriptors(x)))
            unique_descriptors_df = pd.DataFrame(unique_descriptors)
            unique_descriptors_df['Standardized_SMILES'] = unique_smiles.values

            #Drop columns with any NaN values
            unique_descriptors_df = unique_descriptors_df.dropna(axis=1)
            
            # Perform PCA
            pca = PCA(n_components=2, random_state=42)
            pca_result = pca.fit_transform(unique_descriptors_df.drop(columns=["Standardized_SMILES"]))

            # Create a DataFrame with PCA results
            pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
            pca_df['Standardized_SMILES'] = unique_smiles.values

            # Merge PCA results back to the combined dataframe to retain batch and set information
            merged_df = pd.merge(combined_df, pca_df, on='Standardized_SMILES', how='inner')

            # Get unique batches
            unique_batches = combined_df['batch'].dropna().unique()

            # Create a figure for subplots
            fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15), sharex=True, sharey=True)
            axes = axes.flatten()

            for i, batch in enumerate(unique_batches):
                if i >= len(axes):
                    break
                # Filter data for the current batch
                batch_df = merged_df[merged_df['batch'] == batch]

                # Plot PCA for the current batch
                sns.scatterplot(ax=axes[i], x='PC1', y='PC2', hue='Set', data=batch_df, palette=['blue', 'red'])
                axes[i].set_title(f'Batch {batch} - PCA')
                axes[i].set_xlabel('PC1')
                axes[i].set_ylabel('PC2')
                axes[i].legend()

            # Adjust layout
            plt.tight_layout()
            plt.suptitle(f'PCA of RDKit Descriptors for {target} - {var} - {method}', y=1.02)
            plt.show()

            # Print explained variance for the PCA
            explained_variance = pca.explained_variance_ratio_
            print(f'Explained variance by PC1: {explained_variance[0]:.2f}')
            print(f'Explained variance by PC2: {explained_variance[1]:.2f}')


sortedFCV
target_CHEMBL240
RF
logp


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=32), Label(value='0 / 32'))), HBox…

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [200]:
# Replace inf with NaN
unique_descriptors_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop columns with any NaN values
clean_descriptors_df = unique_descriptors_df.dropna(axis=1, how="any")

# Drop any rows with NaN values
clean_descriptors_df.dropna(inplace=True)

# Ensure all values are numeric and within float range
clean_descriptors_df = clean_descriptors_df.apply(pd.to_numeric, errors='coerce')
clean_descriptors_df = clean_descriptors_df[np.isfinite(clean_descriptors_df).all(1)]



In [195]:
clean_descriptors_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1604,1605,1606,1607,1608,1609,1610,1611,1612,Standardized_SMILES
0,32.906267,22.420246,0,1,52.893426,2.514690,4.939572,52.893426,1.290084,4.663397,...,94.868612,644.118077,9.335045,7192,73,228.0,274.0,12.618056,8.666667,
1,32.975216,23.102943,0,1,52.589656,2.516169,4.941106,52.589656,1.282675,4.665500,...,94.958602,644.118077,9.335045,6812,75,230.0,278.0,13.229167,8.611111,
2,29.112606,22.742260,0,1,48.493025,2.558257,5.116514,48.493025,1.310622,4.549418,...,74.234147,554.156890,8.149366,3672,67,200.0,241.0,11.284722,8.125000,
3,32.199160,22.163368,0,1,51.604236,2.514717,4.939607,51.604236,1.290106,4.641656,...,93.805191,618.177643,8.242369,6523,72,224.0,270.0,12.368056,8.416667,
4,27.093486,20.183323,0,1,44.474062,2.529114,4.995750,44.474062,1.308061,4.472323,...,84.685195,491.179811,7.674685,3363,58,186.0,223.0,10.444444,7.305556,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6801,24.127159,19.186567,0,1,40.545038,2.429126,4.771514,40.545038,1.307904,4.364755,...,82.505054,416.159689,8.159994,3118,47,162.0,191.0,9.361111,6.916667,
6802,26.139090,20.075766,0,0,44.363721,2.488759,4.875517,44.363721,1.304815,4.447297,...,84.107663,480.106129,9.413846,3700,53,174.0,204.0,10.111111,7.750000,
6803,22.376047,17.587479,0,1,37.006237,2.370919,4.708878,37.006237,1.276077,4.282599,...,77.694904,441.171372,7.739849,2495,40,148.0,168.0,9.840278,6.388889,
6804,22.534965,17.399988,0,1,36.405385,2.367790,4.698422,36.405385,1.255358,4.282328,...,77.627521,441.171372,7.739849,2541,38,148.0,166.0,9.840278,6.263889,


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [189]:
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(clean_descriptors_df.iloc[:, :50])
pca_result

array([[7126.87352273,  268.53974526],
       [1962.7996234 ,  435.77375378],
       [-779.99920862,  866.36844744],
       ...,
       [-880.79364603, -507.97762521],
       [-832.41975257, -546.2079911 ],
       [-822.40874226,  222.87986419]])

In [53]:

# DataFrame to store all results
results_df = pd.DataFrame(columns=['Standardized_SMILES', 'Similarity'])
# Loop through each configuration
for target in targets:
    for method in methods:
        print("unsortedFCV")
        print(target)
        print(method)
        print(var)

        # Load the DataFrame from a CSV file
        test_df = pd.read_csv(f"../{target}-1_results/unsorted_fcv_results_{method}_testing batches.csv")
        train_df = pd.read_csv(f"../{target}-1_results/unsorted_fcv_results_{method}_training batches.csv")


        # Precompute fingerprints for training set
        train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(
            lambda x: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2)
        )

        # Function to find max similarity with batch optimization
        def max_tanimoto(test_smile, train_fps):
            test_mol = Chem.MolFromSmiles(test_smile)
            test_fp = AllChem.GetMorganFingerprintAsBitVect(test_mol, radius=2)
            return max(DataStructs.TanimotoSimilarity(test_fp, train_fp) for train_fp in train_fps)

        # Analyze each batch
        for batch in test_df['testing batch'].unique():
            test_batch = test_df[test_df['testing batch'] == batch]
            train_batch = train_df[train_df['training batch'] == batch]
            train_fps = train_batch['Fingerprint'].tolist()

            low_similarity_smiles = []
            errors = []

            for idx, test_row in test_batch.iterrows():
                test_SMILES = test_row['Standardized_SMILES']
                max_similarity = max_tanimoto(test_SMILES, train_fps)    
                results_df = results_df.append({
                'Standardized_SMILES': test_SMILES,
                'Similarity': max_similarity}, ignore_index=True)



# Display the results DataFrame
results_df = pd.merge(test_df, results_df)
results_df

unsortedFCV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,CC1CN(CCc2ccc([N+](=O)[O-])cc2)CCN1CCc1ccc([N+...,9.12,7.581818,0.551020
1,1,CC1c2ccc(C#N)c(c2)Oc2cccc(c2)CN2CCC(NCc3cncn31...,4.89,6.087273,0.194915
2,1,CC=Cc1cc(C(F)(F)F)cc(COCC2(c3ccc(F)cc3)CCN(C)C...,4.89,4.814545,0.786885
3,1,CCC(C)C(C)CN1CCC(CNC(=O)c2cc(Cl)cc(Cl)c2)CC1,6.05,5.487273,0.705882
4,1,CCC(CC)CN=c1c(C(N)=O)c(C)[nH]c2ccc(-c3cccnc3)cc12,6.17,5.305000,0.435897
...,...,...,...,...,...
1131,9,c1ccc2c3c([nH]c2c1)C(C1CCOCC1)NC(c1nc(-c2cscn2...,5.36,6.345600,0.705882
1132,9,c1ccc2c3c([nH]c2c1)C(C1CCOCC1)NC(c1nc(C24CC5CC...,6.30,6.016800,0.676923
1133,9,c1ccc2c3c([nH]c2c1)C(C1CCOCC1)NC(c1nc(C2CCCCC2...,6.87,6.198800,0.698413
1134,9,c1cncc(-c2c[nH]c(C3Cc4c([nH]c5ccccc45)C(C4CCCC...,6.72,6.631200,0.628571


In [36]:

# DataFrame to store all results
results_df = pd.DataFrame(columns=['Standardized_SMILES', 'Similarity'])
# Loop through each configuration
for target in targets:
    for method in methods:
        print("CV")
        print(target)
        print(method)
        print(var)

        # Load the DataFrame from a CSV file
        test_df = pd.read_csv(f"../{target}-1_results/cv_results_{method}_testing batches.csv")
        train_df = pd.read_csv(f"../{target}-1_results/cv_results_{method}_training batches.csv")

                    
        # Precompute fingerprints for training set
        train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(
            lambda x: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2)
        )

        # Function to find max similarity with batch optimization
        def max_tanimoto(test_smile, train_fps):
            test_mol = Chem.MolFromSmiles(test_smile)
            test_fp = AllChem.GetMorganFingerprintAsBitVect(test_mol, radius=2)
            return max(DataStructs.TanimotoSimilarity(test_fp, train_fp) for train_fp in train_fps)

        # Analyze each batch
        for batch in test_df['testing batch'].unique():
            test_batch = test_df[test_df['testing batch'] == batch]
            train_batch = train_df[train_df['training batch'] == batch]
            train_fps = train_batch['Fingerprint'].tolist()

            low_similarity_smiles = []
            errors = []

            for idx, test_row in test_batch.iterrows():
                test_SMILES = test_row['Standardized_SMILES']
                max_similarity = max_tanimoto(test_SMILES, train_fps)    
                results_df = results_df.append({
                'Standardized_SMILES': test_SMILES,
                'Similarity': max_similarity}, ignore_index=True)



# Display the results DataFrame
results_df = pd.merge(test_df, results_df)
results_df

CV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,O=C1N(CCN2Cc3ccccc3C2)CCN1Cc1cccc(C(F)(F)F)c1,5.20,5.7272,0.693878
1,1,Cc1oc(-c2ccccc2)nc1C(=O)N1CCC(Oc2ccc(CN3CCCC3)...,5.52,5.0024,0.703125
2,1,COc1ccc(-c2nnc(C(=O)N3CC(Oc4ccc(CN5CCC(C)(O)C5...,4.58,4.9432,0.731343
3,1,CNC(C)c1cc(C(F)(F)F)ccc1Oc1ccc(Cl)c(Cl)c1,5.52,5.5756,0.688889
4,1,COCCN(CCc1ccc(Cl)cc1)C1CCN(c2nc(=N)[nH][nH]2)CC1,5.55,5.1520,0.767857
...,...,...,...,...,...
1257,10,COc1cc(N2CCN(CCO)CC2)ccc1N=c1nc(-c2cnc3ccccn23...,5.43,5.1562,0.820513
1258,10,Cc1nnc(C2(c3cnn(C)c3)NC(c3nc(-c4ccc(F)cn4)c[nH...,5.50,4.9346,0.860759
1259,10,Cc1ncoc1-c1nnc(SCCCN2CC3CC3(c3ccc(C(C)(C)C)cc3...,6.20,6.1452,0.815385
1260,10,NC(=O)c1cccc(NC2CC3CCC(C2)N3Cc2ccccc2)c1,5.96,5.9392,0.666667


In [37]:

# DataFrame to store all results
results_df = pd.DataFrame(columns=['Standardized_SMILES', 'Similarity'])
# Loop through each configuration
for target in targets:
    for method in methods:
        print("CV")
        print(target)
        print(method)
        print(var)

        # Load the DataFrame from a CSV file
        test_df = pd.read_csv(f"../{target}-1_results/scaffold_cv_results_{method}_testing batches.csv")
        train_df = pd.read_csv(f"../{target}-1_results/scaffold_cv_results_{method}_training batches.csv")

                    
        # Precompute fingerprints for training set
        train_df['Fingerprint'] = train_df['Standardized_SMILES'].parallel_apply(
            lambda x: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(x), radius=2)
        )

        # Function to find max similarity with batch optimization
        def max_tanimoto(test_smile, train_fps):
            test_mol = Chem.MolFromSmiles(test_smile)
            test_fp = AllChem.GetMorganFingerprintAsBitVect(test_mol, radius=2)
            return max(DataStructs.TanimotoSimilarity(test_fp, train_fp) for train_fp in train_fps)

        # Analyze each batch
        for batch in test_df['testing batch'].unique():
            test_batch = test_df[test_df['testing batch'] == batch]
            train_batch = train_df[train_df['training batch'] == batch]
            train_fps = train_batch['Fingerprint'].tolist()

            low_similarity_smiles = []
            errors = []

            for idx, test_row in test_batch.iterrows():
                test_SMILES = test_row['Standardized_SMILES']
                max_similarity = max_tanimoto(test_SMILES, train_fps)    
                results_df = results_df.append({
                'Standardized_SMILES': test_SMILES,
                'Similarity': max_similarity}, ignore_index=True)



# Display the results DataFrame
results_df = pd.merge(test_df, results_df)
results_df

CV
target_CHEMBL240
RF
logp


Unnamed: 0,testing batch,Standardized_SMILES,True Values,Predictions,Similarity
0,1,CC(C)(O)CCOc1ccc2ncc(F)c(CCC34CCC(NCc5ccc6c(n5...,5.16,5.4262,0.790698
1,1,CC(C)CCOc1ccc2ncc(F)c(CCC34CCC(NCc5ccc6c(n5)NC...,5.46,5.2590,0.788235
2,1,CCOC(=O)C(CC12CCC(NCc3ccc4c(n3)NC(=O)CO4)(CC1)...,5.08,5.6268,0.719101
3,1,CCOC(=O)C(F)(F)CCOc1ccc2ncc(F)c(CCC34CCC(NCc5c...,4.59,5.4488,0.739130
4,1,CCOC(=O)CCCOc1ccc2ncc(F)c(CCC34CCC(NCc5ccc6c(n...,6.00,6.5384,0.755556
...,...,...,...,...,...
1257,10,Cn1c(SCCCN2CCC3CC3(c3ccc(C(F)(F)F)cc3)CC2)nnc1...,5.30,5.9700,0.706667
1258,10,Cn1c(SCCCN2CCc3ccc4oc(C(F)(F)F)nc4c3CC2)nnc1-c...,6.60,5.7000,0.800000
1259,10,Cn1cc(-c2ccc(OC3CCN(C(=O)Cc4ccc(OC(F)(F)F)cc4)...,5.17,5.2496,0.573171
1260,10,Cn1cc(-c2ccc(OC3CCN(C(=O)Cc4ccc(OC(F)(F)F)cc4)...,5.35,4.9296,0.763889
