In [1]:
import os
import re
from glob import glob

import numpy as np
import pandas as pd
import useful_rdkit_utils as uru
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Read the training and test data

In [2]:
import warnings
from rdkit import RDLogger

# Quiet the notebook output: Python warnings first, then RDKit's own logger.
# NOTE(review): the second call ignores every warning category, so the
# DeprecationWarning-specific filter above it is redundant, and genuine
# problems (e.g. pandas chained-assignment warnings) are hidden as well.
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.filterwarnings(action="ignore")
RDLogger.DisableLog("rdApp.warning")

In [3]:
def read_input_data(dirname, prefix):
    """Load one cross-validation fold and attach a fingerprint column.

    The fold number is taken from the trailing digits of the directory name
    (``data/Sol_repeated007`` -> 7) and, zero-padded to three digits, is used
    to locate ``{prefix}_train_NNN.csv``, ``{prefix}_val_NNN.csv`` and
    ``{prefix}_test_NNN.csv`` inside ``dirname``.

    Parameters
    ----------
    dirname : str
        Fold directory; its name must end in the fold number.
    prefix : str
        File-name prefix of the split to load (the driver loop in this
        notebook passes ``"random"`` and ``"scaffold"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``. ``train`` is the training rows followed by the
        validation rows. Both frames have rows with a missing ``Sol`` value
        removed and carry an ``fp`` column holding the result of
        ``uru.smi2numpy_fp`` applied to each ``SMILES`` entry.

    Raises
    ------
    ValueError
        If the directory name does not end in digits.
    """
    # Parse the basename rather than stripping a hard-coded "data/Sol_repeated"
    # prefix: that broke on OS-specific separators, trailing slashes, absolute
    # paths and any other directory naming scheme.
    fold_name = os.path.basename(os.path.normpath(dirname))
    fold_match = re.search(r"(\d+)$", fold_name)
    if fold_match is None:
        raise ValueError(f"cannot find a fold number at the end of {dirname!r}")
    idx = int(fold_match.group(1))

    def _read_part(part):
        # One CSV per partition, all sharing the same prefix and fold number.
        return pd.read_csv(os.path.join(dirname, f"{prefix}_{part}_{idx:03d}.csv"))

    train, val, test = _read_part("train"), _read_part("val"), _read_part("test")

    # The validation rows are appended to the training rows.
    train = pd.concat([train, val]).dropna(subset=["Sol"])
    test = test.dropna(subset=["Sol"])

    # Reported sizes: merged train+val after dropping missing labels, the raw
    # validation file, and the test set after dropping missing labels.
    print(len(train), len(val), len(test))

    # assign() returns fresh frames, so no column is written onto the result
    # of dropna() (which pandas may flag as chained assignment).
    train = train.assign(fp=train.SMILES.apply(uru.smi2numpy_fp))
    test = test.assign(fp=test.SMILES.apply(uru.smi2numpy_fp))
    return train, test

Build the LightGBM model

In [4]:
def build_model(train, test):
    """Fit a LightGBM regressor on ``train`` and score it on ``test``.

    Both frames must provide an ``fp`` column (one array per row, stacked
    into the feature matrix) and a ``Sol`` column (the regression target).

    Returns
    -------
    tuple
        ``(mae, mse, r2, pred)``: mean absolute error, mean squared error
        and R^2 of the predictions against ``test.Sol``, followed by the
        predictions themselves.
    """
    # Stack the per-row fingerprint arrays into 2-D feature matrices.
    features_train, features_test = np.stack(train.fp), np.stack(test.fp)

    regressor = LGBMRegressor(verbose=-1)
    regressor.fit(features_train, train.Sol)
    pred = regressor.predict(features_test)

    observed = test.Sol
    mae, mse, r2 = (
        metric(observed, pred)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    return mae, mse, r2, pred

Build models and store the data

In [5]:
# Per-molecule predictions (one frame per fold/split) and per-run summary rows.
df_list = []
result_list = []
for dirname in sorted(glob("data/Sol_repeated0*")):
    # Fold number = trailing digits of the directory name. Parsing the basename
    # instead of stripping a literal "data/Sol_repeated" prefix keeps this
    # working when glob returns OS-specific path separators.
    fold_match = re.search(r"(\d+)$", os.path.basename(os.path.normpath(dirname)))
    if fold_match is None:
        raise ValueError(f"cannot find a fold number at the end of {dirname!r}")
    idx = int(fold_match.group(1))
    for prefix in ["random", "scaffold"]:
        train, test = read_input_data(dirname, prefix)
        mae, mse, r2, pred = build_model(train, test)
        # Tag every test row with its provenance so the concatenated frame can
        # later be grouped by fold and split type.
        test = test.assign(
            method='lightGBM',
            Sol_pred=pred,
            cv_cycle=idx,
            split=prefix,
        )
        df_list.append(test)
        print(f"{prefix} {dirname} MAE: {mae:.2f} MSE: {mse:.2f} R2: {r2:.2f}")
        result_list.append([prefix, dirname, "ST", mae, mse, r2])

1738 174 435
random data/Sol_repeated_2_000 MAE: 0.42 MSE: 0.35 R2: 0.37
1734 174 439
scaffold data/Sol_repeated_2_000 MAE: 0.45 MSE: 0.39 R2: 0.31
1738 174 435
random data/Sol_repeated_2_001 MAE: 0.42 MSE: 0.35 R2: 0.24
1750 175 423
scaffold data/Sol_repeated_2_001 MAE: 0.42 MSE: 0.33 R2: 0.24
1738 174 435
random data/Sol_repeated_2_002 MAE: 0.42 MSE: 0.35 R2: 0.28
1747 175 426
scaffold data/Sol_repeated_2_002 MAE: 0.42 MSE: 0.34 R2: 0.30
1739 174 434
random data/Sol_repeated_2_003 MAE: 0.42 MSE: 0.34 R2: 0.32
1742 175 431
scaffold data/Sol_repeated_2_003 MAE: 0.46 MSE: 0.40 R2: 0.31
1739 174 434
random data/Sol_repeated_2_004 MAE: 0.43 MSE: 0.35 R2: 0.32
1719 172 454
scaffold data/Sol_repeated_2_004 MAE: 0.40 MSE: 0.31 R2: 0.28
1738 174 435
random data/Sol_repeated_2_005 MAE: 0.42 MSE: 0.34 R2: 0.39
1719 172 454
scaffold data/Sol_repeated_2_005 MAE: 0.42 MSE: 0.35 R2: 0.23
1738 174 435
random data/Sol_repeated_2_006 MAE: 0.40 MSE: 0.33 R2: 0.28
1745 175 428
scaffold data/Sol_repeated

Preview the predictions for the last fold and split processed (the predictions for all folds are written to disk below)

In [13]:
# Columns kept in the per-molecule prediction table.
cols = [
    'cv_cycle',
    'split',
    'method',
    'SMILES',
    'Name',
    'Sol',
    'Sol_pred',
]
# `test` is whatever the final iteration of the model-building loop left
# behind, so this shows a single fold/split only.
test.loc[:, cols]

Unnamed: 0,cv_cycle,split,method,SMILES,Name,Sol,Sol_pred
0,4,scaffold,lightGBM,O=C(c1cccc(CNc2ccc3nccnc3n2)c1)N1CCCCC1,Mol461,2.209342,1.787624
1,4,scaffold,lightGBM,O=C(NCCc1ccccc1)c1cnccn1,Mol1029,2.414757,2.305143
2,4,scaffold,lightGBM,Cc1nnc(-c2ccc3occ(-c4ccc(S(C)=O)cc4)c3c2)o1,Mol76,2.121863,1.537778
3,4,scaffold,lightGBM,CC(C)C[C@@H](C(N)=O)n1ccnc1-c1cccc2ccccc12,Mol501,2.026184,2.259389
4,4,scaffold,lightGBM,c1ccc2c(c1)nnn2Cc1ccc2c(c1)OCO2,Mol3165,0.578703,1.740755
...,...,...,...,...,...,...,...
213,4,scaffold,lightGBM,Cc1c[nH]c(=O)n1-c1ccc(C(=O)Nc2ccc3ccccc3n2)cc1,Mol166,1.233822,1.095190
214,4,scaffold,lightGBM,O=c1n(CCc2ccncc2)nnn1-c1ccccc1,Mol309,1.936062,1.910196
215,4,scaffold,lightGBM,Cc1cc(C)nc(NC(=O)N(C)C2CCC2)c1,Mol958,2.247271,2.018385
216,4,scaffold,lightGBM,O=C(NCCc1csc(N2CCCC2)n1)N1CCSCC1,Mol1361,2.207115,2.122385


Write the individual predictions to disk

In [14]:
# Stack the per-fold prediction frames, keep the reporting columns and save
# them without the index column.
(
    pd.concat(df_list)
    .loc[:, cols]
    .to_csv("lightgbm_repeated_regression_results.csv", index=False)
)

Collect the summary statistics into a dataframe and write them to disk

In [15]:
# One row per (split, fold directory) pair, in the order the loop produced them.
result_df = pd.DataFrame(
    data=result_list,
    columns=["split", "dataset", "task", "mae", "mse", "r2"],
)
result_df

Unnamed: 0,split,dataset,task,mae,mse,r2
0,random,data/Sol000,ST,0.40881,0.2955,0.403869
1,scaffold,data/Sol000,ST,0.461052,0.404571,0.253668
2,random,data/Sol001,ST,0.393893,0.290971,0.416523
3,scaffold,data/Sol001,ST,0.422455,0.33544,0.340121
4,random,data/Sol002,ST,0.420377,0.327475,0.325091
5,scaffold,data/Sol002,ST,0.486643,0.473878,0.237418
6,random,data/Sol003,ST,0.379179,0.292711,0.421693
7,scaffold,data/Sol003,ST,0.465607,0.409062,0.130006
8,random,data/Sol004,ST,0.405175,0.320023,0.341044
9,scaffold,data/Sol004,ST,0.403583,0.341611,0.336111


In [16]:
# Save the per-fold summary metrics without the index column.
result_df.to_csv(path_or_buf="lgbm_result.csv", index=False)