## Extract molecular descriptors using the mordred library

[https://github.com/mordred-descriptor/mordred](https://github.com/mordred-descriptor/mordred)

In [1]:
import rdkit
from rdkit import Chem
from mordred import Calculator, descriptors
import pandas as pd
import numpy as np

# Count the descriptors mordred registers with and without the 3D ones;
# the 3D count is the difference between the two calculators.
n_2d = len(Calculator(descriptors, ignore_3D=True))
n = len(Calculator(descriptors, ignore_3D=False))
n_3d = n - n_2d
print(
    "Total number of descriptors: {}".format(n),
    "Total number of 2D descriptors: {}".format(n_2d),
    "Total number of 3D descriptors: {}".format(n_3d),
    sep="\n",
)

Total number of descriptors: 1826
Total number of 2D descriptors: 1613
Total number of 3D descriptors: 213


In [127]:
# Load the SMILES table, parse each SMILES with RDKit, compute mordred
# descriptors for the molecules that parsed, and assemble the modelling frame.
df = pd.read_csv("../data/smile.csv")
# NOTE(review): molecules built from SMILES presumably carry no 3D conformer,
# so the 3D descriptors likely end up as non-numeric (object) columns that the
# dtype filter below discards -- confirm; ignore_3D=True would skip them.
calc = Calculator(descriptors, ignore_3D=False)
mols = [Chem.MolFromSmiles(smi) for smi in df["SMILE"]]
# MolFromSmiles returns None for strings it cannot parse; those rows are dropped.
mask = [mol is not None for mol in mols]
print("Dropped {} of {} rows with unparsable SMILES".format(len(mask) - sum(mask), len(mask)))

# drop=True keeps both frames on a clean 0..n-1 index for the column-wise
# concat without materialising the old index as a duplicated "index" column.
df = df[mask].reset_index(drop=True)
desc = calc.pandas([mol for mol in mols if mol is not None]).reset_index(drop=True)
df = pd.concat([df, desc], axis=1)

# Binary target: 1 for "Y" (activator), 0 for anything else.
df["label"] = (df["Human PXR Activator or not?"] == "Y").astype(int)

# Take feature names from the descriptor table itself rather than from a
# positional slice of the merged frame, so the selection does not depend on
# how many columns the CSV happens to have. Object-dtype columns are skipped
# (presumably descriptors that failed for at least one molecule).
features = [feat for feat in desc.columns if desc[feat].dtype != "O"]
label = "label"

RDKit ERROR: [22:28:10] SMILES Parse Error: extra open parentheses for input: 'O=C1C(OC(C(C(O[H])([H])C([H])4°[H])([H])O[H])([H])OC4([H])C([H])([H])O[H])=C(C3=C([H])C(O[H])=C(O[H])C([H])=C3[H])OC2=C1C(O[H])=C([H])C(O[H])=C2[H]'
RDKit ERROR: [22:28:10] SMILES Parse Error: extra open parentheses for input: 'O=C1C(OC3([H])OC(C(O[H])([H])C([H])([H])O[H])([H])C(O[H])([H])C([H])3°[H])=C(C4=C([H])C(O[H])=C(O[H])C([H])=C4[H])OC2=C1C(O[H])=C([H])C(O[H])=C2[H]'
RDKit ERROR: [22:28:10] SMILES Parse Error: extra open parentheses for input: 'O=C1C(OC(OC(C([H])([H])[H])([H])C(O[H])([H])C([H])3°[H])([H])C3([H])O[H])=C(C4=C([H])C(O[H])=C(O[H])C([H])=C4[H])OC2=C1C(O[H])=C([H])C(O[H])=C2[H]'
RDKit ERROR: [22:28:10] SMILES Parse Error: syntax error while parsing: C1CC(CL)CCC1C(N4CCNC4)(C2CCCCC2)C3CCCCC3
RDKit ERROR: [22:28:10] SMILES Parse Error: Failed parsing SMILES 'C1CC(CL)CCC1C(N4CCNC4)(C2CCCCC2)C3CCCCC3' for input: 'C1CC(CL)CCC1C(N4CCNC4)(C2CCCCC2)C3CCCCC3'
RDKit ERROR: [22:28:10] SMILES Parse Erro

 10%|▉         | 32/322 [00:02<00:46,  6.26it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 47%|████▋     | 152/322 [00:07<00:09, 18.20it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 322/322 [00:13<00:00, 23.12it/s]


In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split with a fixed seed; the training part feeds
# the grid searches, the test part is only used for the final reports.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=2020,
)
X = train[features]
y = train[label]
X_test = test[features]
y_test = test[label]

In [170]:
# RBF-kernel SVM on min-max scaled descriptors. The scaler lives inside the
# pipeline so it is re-fitted on the training part of every CV fold.
pipeline = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("classifier", SVC(kernel="rbf")),
])
params = {
    "classifier__C": [2, 4, 8],
    "classifier__gamma": [0.01, 0.02, 0.04],
}
# 5-fold grid search scored by F1; refit=True retrains the best setting on
# the full training set so gs1 can predict directly.
gs1 = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=3,
    verbose=5,
)
gs1.fit(X, y)
print("SVM 5 fold cross validation F1 score: {:.3f}".format(gs1.best_score_))
# Held-out evaluation of the refitted best model.
y_pred = gs1.predict(X_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    1.0s
[Parallel(n_jobs=3)]: Done  45 out of  45 | elapsed:    3.3s finished


SVM 5 fold cross validation F1 score: 0.806
              precision    recall  f1-score   support

           0       0.72      0.78      0.75        27
           1       0.83      0.79      0.81        38

    accuracy                           0.78        65
   macro avg       0.78      0.78      0.78        65
weighted avg       0.79      0.78      0.79        65



In [177]:
# Random forest on the raw (unscaled) descriptors, tuned with a 5-fold grid
# search scored by F1.
pipeline = Pipeline([
    # random_state is fixed (same seed value as the train/test split) so that
    # the forest's bootstrap and feature sampling -- and therefore the CV and
    # test scores printed below -- are reproducible across runs.
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=2020))
])
params = {
    "classifier__max_depth": [4, 8, 16, None],
    "classifier__min_samples_split": [3, 5, 7, 11],
    "classifier__criterion": ["gini", "entropy"]
}
# refit=True retrains the best parameter combination on the full training set,
# so gs2.predict below uses that refitted model.
gs2 = GridSearchCV(pipeline, param_grid=params, cv=5, refit=True,
                   verbose=5, n_jobs=3, scoring="f1")
gs2.fit(X, y)
print("Random Forest 5 fold cross validation F1 score: {:.3f}".format(gs2.best_score_))
# Held-out evaluation on the 20% test split.
print(classification_report(y_test, gs2.predict(X_test)))

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    1.9s
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed:    7.1s
[Parallel(n_jobs=3)]: Done 160 out of 160 | elapsed:   17.8s finished


Random Forest 5 fold cross validation F1 score: 0.789
              precision    recall  f1-score   support

           0       0.74      0.63      0.68        27
           1       0.76      0.84      0.80        38

    accuracy                           0.75        65
   macro avg       0.75      0.74      0.74        65
weighted avg       0.75      0.75      0.75        65

