In [1]:
import pandas as pd
from rdkit import Chem

import numpy as np
from scipy.stats import spearmanr, pearsonr, kendalltau
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error, explained_variance_score, max_error

from rogi import RoughnessIndex

In [34]:
def _compute_regression_metrics(actual, predicted):
    """Compute error and correlation metrics for two equal-shape float arrays.

    Parameters
    ----------
    actual : numpy.ndarray
        Reference values; used as the denominator of the percentage errors.
    predicted : numpy.ndarray
        Values compared against ``actual``.

    Returns
    -------
    dict
        Keys, in order: ``mape``, ``wmape``, ``mae``, ``rmse``, ``r_squared``,
        ``median_ae``, ``explained_variance``, ``max_err``, ``spearman_corr``,
        ``pearson_corr``, ``kendall_corr``, ``covariance``, ``corrcoef``.
    """
    eps = np.finfo(float).eps
    residuals = actual - predicted

    # MAPE: zero reference values are replaced by machine epsilon so the
    # division is defined; note that such entries then dominate the mean.
    safe_actual = np.where(actual == 0, eps, actual)
    mape = np.mean(np.abs(residuals / safe_actual)) * 100

    # wMAPE: total absolute error relative to the total *absolute* reference
    # magnitude. Using the signed sum would let positive and negative
    # reference values cancel and could even flip the sign of the result.
    total_abs_actual = np.sum(np.abs(actual))
    wmape = np.sum(np.abs(residuals)) / max(total_abs_actual, eps) * 100

    spearman_corr, _ = spearmanr(actual, predicted)
    pearson_corr, _ = pearsonr(actual, predicted)
    kendall_corr, _ = kendalltau(actual, predicted)

    return {
        "mape": mape,
        "wmape": wmape,
        "mae": mean_absolute_error(actual, predicted),
        "rmse": np.sqrt(mean_squared_error(actual, predicted)),
        "r_squared": r2_score(actual, predicted),
        "median_ae": median_absolute_error(actual, predicted),
        "explained_variance": explained_variance_score(actual, predicted),
        "max_err": max_error(actual, predicted),
        "spearman_corr": spearman_corr,
        "pearson_corr": pearson_corr,
        "kendall_corr": kendall_corr,
        # Off-diagonal entries of the 2x2 matrices relate the two inputs.
        "covariance": np.cov(actual, predicted)[0, 1],
        "corrcoef": np.corrcoef(actual, predicted)[0, 1],
    }


def calculate_percentage_metrics(list1, list2):
    """Print percentage-error, absolute-error and correlation metrics.

    Parameters
    ----------
    list1 : array-like of numbers
        Reference ("actual") values. They are the denominator of MAPE/wMAPE
        and are passed as the first argument to the scikit-learn metrics.
    list2 : array-like of numbers
        Values to compare against ``list1``; must have the same shape.

    Returns
    -------
    None
        The metrics are printed, one per line. The values themselves are
        available from ``_compute_regression_metrics``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    actual = np.asarray(list1, dtype=float)
    predicted = np.asarray(list2, dtype=float)

    # Fail early: without this check a length-1 input would be silently
    # broadcast in the percentage-error computations.
    if actual.size == 0 or predicted.size == 0:
        raise ValueError("list1 and list2 must not be empty")
    if actual.shape != predicted.shape:
        raise ValueError(
            f"list1 and list2 must have the same shape, got {actual.shape} and {predicted.shape}"
        )

    metrics = _compute_regression_metrics(actual, predicted)

    # Print results
    print(f"MAPE (Mean Absolute Percentage Error): {metrics['mape']}%")
    print(f"wMAPE (Weighted Mean Absolute Percentage Error): {metrics['wmape']}%")
    print(f"MAE (Mean Absolute Error): {metrics['mae']}")
    print(f"RMSE (Root Mean Squared Error): {metrics['rmse']}")
    print(f"R-squared: {metrics['r_squared']}")
    print(f"Median Absolute Error: {metrics['median_ae']}")
    print(f"Explained Variance Score: {metrics['explained_variance']}")
    print(f"Max Error: {metrics['max_err']}")
    print(f"Spearman's rank correlation: {metrics['spearman_corr']}")
    print(f"Pearson correlation coefficient: {metrics['pearson_corr']}")
    print(f"Kendall Tau correlation: {metrics['kendall_corr']}")
    print(f"Covariance: {metrics['covariance']}")
    print(f"Correlation coefficient: {metrics['corrcoef']}")

# Example usage: a small toy pair of sequences, with list1 passed as the
# first (reference) argument and list2 as the second.
list1 = [1.0, 2.0, 3.0, 4.0, 5.0]
list2 = [1.1, 1.9, 3.1, 4.1, 4.9]

calculate_percentage_metrics(list1=list1, list2=list2)

MAPE (Mean Absolute Percentage Error): 4.566666666666666%
wMAPE (Weighted Mean Absolute Percentage Error): 3.3333333333333304%
MAE (Mean Absolute Error): 0.09999999999999991
RMSE (Root Mean Squared Error): 0.09999999999999991
R-squared: 0.995
Median Absolute Error: 0.10000000000000009
Explained Variance Score: 0.9952
Max Error: 0.10000000000000009
Spearman's rank correlation: 0.9999999999999999
Pearson correlation coefficient: 0.9977171289098261
Kendall Tau correlation: 0.9999999999999999
Covariance: 2.45
Correlation coefficient: 0.9977171289098259


In [5]:
# Load the table that later cells read `smiles`, `peakwavs_max` and
# `energy_max_osc_nm` from; the path is relative to the working directory.
opt_df = pd.read_csv(filepath_or_buffer='tests/data/lambda_dft_expt.csv')
opt_df

Unnamed: 0,smiles,peakwavs_max,energy_max_osc_nm
0,C1=C2B(Nc3ccccc31)OB1Nc3ccccc3C=C1C(c1ccccc1)=...,413.0,349.295775
1,Cc1ccc(C2=C(c3ccc(C)cc3)C3=Cc4ccccc4NB3OB3Nc4c...,412.0,350.480497
2,Cc1cccc(C2=C(c3cccc(C)c3)C3=Cc4ccccc4NB3OB3Nc4...,413.0,351.573575
3,COc1ccc(C2=C(c3ccc(OC)cc3)C3=Cc4ccccc4NB3OB3Nc...,413.0,351.673284
4,Fc1ccc(C2=C(c3ccc(F)cc3)C3=Cc4ccccc4NB3OB3Nc4c...,413.0,349.985888
...,...,...,...
4958,c1ccc(-[n+]2c3ccc4cccc5c6cccc7c8cccc9ccc2c(c98...,512.0,404.172099
4959,c1cc2ccc3[o+]c4ccc5cccc6c7cccc8c(c1)c2c3c(c87)...,573.0,436.312456
4960,c1cc2ccc3[s+]c4ccc5cccc6c7cccc8c(c1)c2c3c(c87)...,600.0,459.940653
4961,Cc1cc(C)c(C)c(-c2cc(-c3c(C)c(C)cc(C)c3C)c3ccc4...,364.0,319.422978


In [7]:
# Roughness index of the `peakwavs_max` column, with the molecules supplied
# as SMILES strings.
ri = RoughnessIndex(
    smiles=opt_df["smiles"],
    Y=opt_df["peakwavs_max"],
)
ri.compute_index()

Computing fingerprints...
Computing distance matrix...
Clustering...


0.021435714797693572

In [8]:
# Roughness index of the `energy_max_osc_nm` column, with the molecules
# supplied as SMILES strings.
ri = RoughnessIndex(
    smiles=opt_df["smiles"],
    Y=opt_df["energy_max_osc_nm"],
)
ri.compute_index()

Computing fingerprints...
Computing distance matrix...
Clustering...


0.01232026104303921

In [9]:
# Roughness index of the signed difference
# `peakwavs_max - energy_max_osc_nm`, with the molecules supplied as SMILES.
ri = RoughnessIndex(
    smiles=opt_df["smiles"],
    Y=opt_df["peakwavs_max"] - opt_df["energy_max_osc_nm"],
)
ri.compute_index()

Computing fingerprints...
Computing distance matrix...
Clustering...


0.01063191321568116

In [10]:
# Roughness index of the absolute difference
# `|peakwavs_max - energy_max_osc_nm|`, with the molecules supplied as SMILES.
ri = RoughnessIndex(
    smiles=opt_df["smiles"],
    Y=(opt_df["peakwavs_max"] - opt_df["energy_max_osc_nm"]).abs(),
)
ri.compute_index()

Computing fingerprints...
Computing distance matrix...
Clustering...


0.016085917612476935

In [13]:
from rdkit import Chem
import rdkit
from rdkit.Chem import Descriptors

def getMolDescriptors(mol, missingVal=None):
    ''' Calculate the full list of descriptors for a molecule.

        Every ``(name, function)`` pair in ``Descriptors._descList`` is
        evaluated on ``mol``.

        Parameters:
            mol: the molecule object handed to each descriptor function.
            missingVal: value stored for a descriptor whose function raises.

        Returns:
            dict mapping descriptor name to its computed value (or to
            ``missingVal`` when the computation failed).
    '''
    import traceback

    res = {}
    for nm, fn in Descriptors._descList:
        # Some of the descriptor functions can throw errors if they fail; catch
        # those here. Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit still stop a long-running loop.
        try:
            val = fn(mol)
        except Exception:
            # Print the error message ...
            traceback.print_exc()
            # ... and set the descriptor value to whatever missingVal is.
            val = missingVal
        res[nm] = val
    return res

# Parse every SMILES string into a molecule, compute one descriptor dict per
# molecule, and collect the dicts into a table (one row per molecule).
smiles = opt_df["smiles"]
mols = list(map(Chem.MolFromSmiles, smiles))

allDescrs = list(map(getMolDescriptors, mols))
desc_df = pd.DataFrame(allDescrs)
desc_df

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,6.836011,-0.298629,6.836011,0.298629,0.335128,448.143,425.967,448.191824,164,0,...,0,0,0,0,0,0,0,0,0,0
1,6.883465,-0.305265,6.883465,0.305265,0.297770,476.197,449.989,476.223124,176,0,...,0,0,0,0,0,0,0,0,0,0
2,6.896492,-0.306589,6.896492,0.306589,0.297770,476.197,449.989,476.223124,176,0,...,0,0,0,0,0,0,0,0,0,0
3,6.876428,-0.362118,6.876428,0.362118,0.300184,508.195,481.987,508.212953,188,0,...,0,0,0,0,0,0,0,0,0,0
4,14.032049,-0.473089,14.032049,0.305045,0.301586,484.123,463.963,484.172981,176,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958,2.471759,1.201266,2.471759,1.201266,0.140478,428.514,410.370,428.143376,154,0,...,0,0,0,0,0,0,0,0,0,0
4959,6.469729,0.973735,6.469729,0.973735,0.152885,353.400,340.296,353.096091,126,0,...,0,0,0,0,0,0,0,0,0,0
4960,2.319809,1.342488,2.319809,1.342488,0.142901,369.468,356.364,369.073248,126,0,...,0,0,0,0,0,0,0,0,0,0
4961,2.557983,1.336634,2.557983,1.336634,0.158205,731.080,672.616,730.453852,282,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Report which descriptor columns contain at least one NaN, and how many
# such columns there are.
nan_columns = desc_df.columns[desc_df.isna().any()].tolist()
print(nan_columns)
print(len(nan_columns))

# Keep only the columns that are free of NaN values.
desc_df = desc_df.dropna(axis=1)

['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW']
12


In [20]:
# Roughness index of the signed difference `peakwavs_max - energy_max_osc_nm`,
# using the descriptor table as features and the 'euclidean' metric.
ri = RoughnessIndex(
    X=desc_df,
    Y=opt_df["peakwavs_max"] - opt_df["energy_max_osc_nm"],
    metric='euclidean',
)
ri.compute_index()

Clustering...


0.038009938476772885

In [21]:
# Roughness index of the signed difference `peakwavs_max - energy_max_osc_nm`,
# using the descriptor table as features and the 'cosine' metric.
ri = RoughnessIndex(
    X=desc_df,
    Y=opt_df["peakwavs_max"] - opt_df["energy_max_osc_nm"],
    metric='cosine',
)
ri.compute_index()

Clustering...


0.07311415536797049

In [23]:
# Load the table that later cells read `smiles`, `SD` and `DR` from; the
# path is relative to the working directory.
pc_df = pd.read_csv(filepath_or_buffer='tests/data/pubchem_AID1445_sd_dr_0.1.csv')
pc_df

Unnamed: 0,smiles,SD,DR
0,O=C1N=c2ccc(Br)cc2=C1c1sc(=S)n(NS(=O)(=O)c2ccc...,99.86,6.000000
1,CCN1C(=O)C(=Cc2ccc(-c3ccc([N+](=O)[O-])cc3OC)o...,98.34,5.657577
2,Cc1cccc(C)c1OCc1nc2ccccc2c(=O)n1N=Cc1ccc(OCC(=...,97.76,6.000000
3,CC(=O)N1C(=O)C(=C2SC(=S)N(NS(=O)(=O)c3ccc(C)cc...,96.64,5.492144
4,CC(CCC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C,94.09,5.585027
...,...,...,...
21294,Cn1cc(C(=O)c2ccc(Cl)cc2Cl)cc1C(=O)NN=C(N)COc1c...,-21.65,
21295,Cc1c(C(=O)NNC(=S)Nc2cccc(Cl)c2)nnn1-c1ccccc1,-22.94,
21296,COc1ccccc1NC(=S)NNC(=O)c1ccc(Cl)cc1Cl,-23.82,
21297,Oc1c(C(Nc2ccccn2)c2cccnc2)cc(Cl)c2cccnc12,-24.05,


In [25]:
# Keep only rows with no missing value in any column. The original row
# labels are retained (the index is not reset). Note that this rebinds
# `pc_df`, so the unfiltered table is no longer available afterwards.
pc_df = pc_df.dropna(axis=0, how="any")
pc_df

Unnamed: 0,smiles,SD,DR
0,O=C1N=c2ccc(Br)cc2=C1c1sc(=S)n(NS(=O)(=O)c2ccc...,99.86,6.000000
1,CCN1C(=O)C(=Cc2ccc(-c3ccc([N+](=O)[O-])cc3OC)o...,98.34,5.657577
2,Cc1cccc(C)c1OCc1nc2ccccc2c(=O)n1N=Cc1ccc(OCC(=...,97.76,6.000000
3,CC(=O)N1C(=O)C(=C2SC(=S)N(NS(=O)(=O)c3ccc(C)cc...,96.64,5.492144
4,CC(CCC(=O)O)C1CCC2C3C(O)CC4CC(O)CCC4(C)C3CCC12C,94.09,5.585027
...,...,...,...
697,O=C(O)c1cc(-c2ccc(C=NN3C(=O)C4C5C=CC(C6CC56)C4...,28.38,4.042345
698,O=C(CCN1CCOCC1)Nc1c(-c2ccccc2)c2cc(Br)ccc2[nH]...,28.25,4.027288
707,CCc1oc2ccccc2c1C(=O)c1cc(Br)c(O)c(Br)c1,27.60,4.202179
711,COCCCn1c(=N)c(C(=O)NCC2CCCO2)cc2c(=O)n3ccccc3nc21,27.26,4.220548


In [26]:
# Roughness index of the `SD` column, with the molecules supplied as SMILES
# strings.
ri = RoughnessIndex(
    smiles=pc_df["smiles"],
    Y=pc_df["SD"],
)
ri.compute_index()

Computing fingerprints...
Computing distance matrix...
Clustering...


0.09124213042819868

In [27]:
# Roughness index of the `DR` column, with the molecules supplied as SMILES
# strings.
ri = RoughnessIndex(
    smiles=pc_df["smiles"],
    Y=pc_df["DR"],
)
ri.compute_index()

Computing fingerprints...
Computing distance matrix...
Clustering...


0.054235890491178856

In [35]:
# Metric report with `SD` as the reference values and `DR` as the compared values.
# NOTE(review): the two columns appear to be on different numeric scales, so the
# absolute/percentage errors may not be meaningful here — confirm intent.
calculate_percentage_metrics(pc_df["SD"], pc_df["DR"])

MAPE (Mean Absolute Percentage Error): 89.37250980796692%
wMAPE (Weighted Mean Absolute Percentage Error): 90.1453602661744%
MAE (Mean Absolute Error): 40.675210544847715
RMSE (Root Mean Squared Error): 43.35649272310289
R-squared: -7.04291462484615
Median Absolute Error: 35.908046230395286
Explained Variance Score: 0.03596930886148708
Max Error: 93.86
Spearman's rank correlation: 0.7503540401616475
Pearson correlation coefficient: 0.7839961917865665
Kendall Tau correlation: 0.5631232547315521
Covariance: 4.273250483204777
Correlation coefficient: 0.7839961917865667


In [36]:
# Metric report with `peakwavs_max` as the reference values and
# `energy_max_osc_nm` as the compared values.
calculate_percentage_metrics(opt_df["peakwavs_max"], opt_df["energy_max_osc_nm"])

MAPE (Mean Absolute Percentage Error): 15.83795005552309%
wMAPE (Weighted Mean Absolute Percentage Error): 16.809480525459318%
MAE (Mean Absolute Error): 75.15328665192277
RMSE (Root Mean Squared Error): 94.21452198109924
R-squared: 0.03668773205848175
Median Absolute Error: 63.37383602089483
Explained Variance Score: 0.5834573211690091
Max Error: 995.1428571428571
Spearman's rank correlation: 0.8183146547209219
Pearson correlation coefficient: 0.7666716574313159
Kendall Tau correlation: 0.6522124289874525
Covariance: 4952.348938570619
Correlation coefficient: 0.7666716574313157
