In [1]:
import pandas 
import numpy as np    
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def valid_pka_or_None(val):
    """Parse a raw ``pka_value`` entry into a number, or return ``None``.

    Accepted forms:

    * anything ``float()`` understands (numbers, numeric strings, ``nan``);
    * a range written ``"a to b"`` or ``"a-b"``, which is reduced to the
      mean of its parts.

    Entries containing ``~``, ``<``, ``>``, ``temp`` or ``not_stated`` are
    rejected, as is anything else that cannot be parsed.

    Parameters
    ----------
    val : object
        Raw cell value.

    Returns
    -------
    float or None
        ``float(val)`` when that succeeds, the ``np.mean`` of the parts for
        a range, otherwise ``None``. This function never raises on
        unparsable input.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError covers objects float() cannot accept at all, e.g. None.
        pass

    s_val = str(val)
    rejected_markers = ("~", "<", ">", "temp", "not_stated")
    if any(marker in s_val for marker in rejected_markers):
        return None

    # " to " is tried before "-" so that "-1 to 2" keeps its minus sign
    # instead of being split on it.
    for separator in (" to ", "-"):
        if separator in s_val:
            try:
                return np.mean([float(part) for part in s_val.split(separator)])
            except ValueError:
                # Looks like a range but a part is not numeric (e.g.
                # "abc-def", or "-1.5-2.5" whose leading minus yields an
                # empty first part). Report it as invalid rather than
                # aborting the whole column conversion.
                return None
    return None

# Restore the ``np.bool8`` alias, which NumPy 2.0 removed. It is pointed at
# ``np.bool_`` because that name exists in every NumPy release, whereas
# ``np.bool`` is missing from NumPy 1.24-1.26 and would raise AttributeError.
# NOTE(review): presumably required by the plotly import right below - if so
# this assignment must stay ahead of that import; confirm.
np.bool8 = np.bool_
from plotly import express as px
renderer="iframe"

pKa = pandas.read_csv("../lib/Dissociation-Constants/iupac_high-confidence_v2_2.csv")

# Free-text entries of column "T" that carry no usable temperature.
invalid_temps = ["Neutral molecule unstable", "not_stated", "c", "not",
                 "Not stated", "Not given",
                 "Few details", "not stated", "not_stated "]
# Listed placeholders become NaN; everything else is parsed as a float after
# dropping any "<" characters. A non-numeric entry that is not listed above
# raises ValueError here.
pKa["Temperature ('C)"] = [
    float("nan") if t in invalid_temps else float(str(t).replace("<", ""))
    for t in pKa["T"]
]
# Unparsable pKa entries come back as None from valid_pka_or_None.
pKa["pka_value"] = [valid_pka_or_None(val) for val in pKa["pka_value"]]
pKa["Degrees from 20 ('C)"] = np.abs(pKa["Temperature ('C)"] - 20)

# Keep only pKa1 measurements taken between 15 and 25 (inclusive bounds).
pKa1_temp = pKa[pKa["pka_type"].isin(["pKa1"]) & pKa["Temperature ('C)"].between(15, 25)]
# One row per InChI: the median of its retained pKa values.
pKa1_median = pKa1_temp[["InChI","pka_value"]].groupby("InChI").median().reset_index()
pKa1_median["mol"] = pKa1_median["InChI"].transform(Chem.MolFromInchi)
# Drop rows lacking either a molecule or a pKa value, without mutating in
# place, so that re-running the cell always starts from the grouped frame.
pKa1_median = pKa1_median.dropna(subset=["mol", "pka_value"], ignore_index=True)

In [2]:
def fit(mols,pka,radius = 2,fingerprint_size=512,n_jobs=None,params=None,
        generator=rdFingerprintGenerator.GetMorganGenerator):
    """Grid-search an ``XGBRegressor`` on molecular fingerprint bits.

    Parameters
    ----------
    mols : pandas.Series
        Molecules to fingerprint; ``fp_generator.GetFingerprint`` is applied
        to it through ``Series.transform``.
    pka : array-like
        Regression targets, aligned with ``mols``.
    radius : int
        Size setting for the fingerprint. Passed as ``radius`` by default,
        as ``maxPath=2*radius`` for the RDKit generator, as
        ``maxDistance=2*radius`` for atom pairs and as
        ``torsionAtomCount=radius`` for topological torsions.
    fingerprint_size : int
        Value passed to the generator as ``fpSize``.
    n_jobs : int or None
        Forwarded to ``GridSearchCV``.
    params : dict or None
        Parameter grid; when ``None`` a default grid over ``max_depth`` and
        ``n_estimators`` is used.
    generator : callable
        One of the ``rdFingerprintGenerator.Get*Generator`` factories.

    Returns
    -------
    tuple
        ``(grid, X_train, X_test, y_train, y_test)`` where ``grid`` is the
        fitted ``GridSearchCV`` object.
    """
    if params is None:
        params = { 'max_depth': [1,2,3, 4],'n_estimators': [2,10,50,100,200]}

    # Each generator factory names its size argument differently.
    if generator == rdFingerprintGenerator.GetRDKitFPGenerator:
        size_kwargs = {"maxPath": 2*radius}
    elif generator == rdFingerprintGenerator.GetAtomPairGenerator:
        size_kwargs = {"maxDistance": 2*radius}
    elif generator == rdFingerprintGenerator.GetTopologicalTorsionGenerator:
        size_kwargs = {"torsionAtomCount": radius}
    else:
        size_kwargs = {"radius": radius}
    fp_generator = generator(fpSize=fingerprint_size, **size_kwargs)

    # One boolean row per molecule.
    bit_rows = [list(fp) for fp in mols.transform(fp_generator.GetFingerprint)]
    X = np.array(bit_rows, dtype=bool)
    X_train, X_test, y_train, y_test = train_test_split(X, pka, random_state=42)

    # The search is fitted on the training split only; train scores are kept
    # alongside the cross-validated test scores.
    grid = GridSearchCV(
        estimator=xgb.XGBRegressor(),
        param_grid=params,
        scoring='r2',
        n_jobs=n_jobs,
        verbose=False,
        return_train_score=True,
    )
    grid.fit(X=X_train, y=y_train)
    return grid, X_train, X_test, y_train, y_test

def flatten_errors(grid):
    """Turn ``grid.cv_results_`` into one long table of scores.

    For each of the "test" and "train" score sets, the parameter
    combinations are placed next to the mean score (column ``Score``) and
    its standard deviation (column ``Score error``), tagged with a ``Set``
    column. The two tables are then stacked; the original row index of each
    table is kept, so index values repeat.

    Parameters
    ----------
    grid : object
        Anything exposing a ``cv_results_`` mapping with the keys
        ``params``, ``mean_test_score``, ``std_test_score``,
        ``mean_train_score`` and ``std_train_score``.

    Returns
    -------
    pandas.DataFrame
    """
    cv = grid.cv_results_
    score_keys = (
        ("test", "mean_test_score", "std_test_score"),
        ("train", "mean_train_score", "std_train_score"),
    )
    per_set = []
    for set_name, mean_key, std_key in score_keys:
        pieces = [
            pandas.DataFrame(cv["params"]),
            pandas.DataFrame(cv[mean_key], columns=["Score"]),
            pandas.DataFrame(cv[std_key], columns=["Score error"]),
        ]
        per_set.append(pandas.concat(pieces, axis=1).assign(Set=set_name))
    return pandas.concat(per_set)

In [3]:
# Molecules and their regression targets, taken from the per-InChI table.
mols = pKa1_median["mol"]
pka = pKa1_median["pka_value"].to_numpy(dtype=float)

# Morgan fingerprints (radius 2, 256 bits) as a boolean matrix with one row
# per molecule.
fp_generator = rdFingerprintGenerator.GetMorganGenerator(fpSize=256, radius=2)
X = np.array(
    [list(bits) for bits in mols.transform(fp_generator.GetFingerprint)],
    dtype=bool,
)

In [None]:
# Sweep every fingerprint family against several bit lengths. Each entry
# appended to df_fingerprints is a [scores DataFrame, fitted grid] pair.
df_fingerprints = []
generators = [
    ['ttgen', rdFingerprintGenerator.GetTopologicalTorsionGenerator],
    ['apgen', rdFingerprintGenerator.GetAtomPairGenerator],
    ["mgngen", rdFingerprintGenerator.GetMorganGenerator],
    ['rdkgen', rdFingerprintGenerator.GetRDKitFPGenerator],
]
for label, generator in tqdm(generators):
    for fingerprint_size in tqdm([128, 512, 1024, 2048]):
        # Only the fitted search is needed; the train/test splits returned
        # by fit() are not used here.
        grid = fit(
            mols,
            pka,
            radius=2,
            fingerprint_size=fingerprint_size,
            params=None,
            n_jobs=-2,
            generator=generator,
        )[0]
        # Tag the score table with the configuration that produced it.
        df_errors = flatten_errors(grid).assign(bits=fingerprint_size, fp_type=label)
        df_fingerprints.append([df_errors, grid])

  0%|                                                                                                                                     | 0/4 [00:00<?, ?it/s]
  _data = np.array(data, dtype=dtype, copy=copy,

[A%|███████████████████████████████▎                                                                                             | 1/4 [00:09<00:28,  9.41s/it]
[A%|██████████████████████████████████████████████████████████████▌                                                              | 2/4 [00:30<00:32, 16.37s/it]
[A%|█████████████████████████████████████████████████████████████████████████████████████████████▊                               | 3/4 [01:25<00:33, 33.84s/it]

In [None]:
# Stack the score tables of every (fingerprint type, bit length) run.
df_cat = pandas.concat([scores for scores, _ in df_fingerprints])

# Score vs. n_estimators: one facet column per max_depth, one facet row per
# fingerprint type, colour for train/test, dash pattern for bit length.
px.line(
    df_cat,
    x="n_estimators",
    y="Score",
    error_y="Score error",
    color="Set",
    line_dash="bits",
    facet_row="fp_type",
    facet_col="max_depth",
).show(renderer=renderer)