# Introduction

This tutorial shows how to classify terpenes using classic as well as deep learning classification methods.

The terpenes data is a subset of the [COCONUT](https://coconut.naturalproducts.net) dataset (version March 2021), which is obtained by filtering COCONUT's ``chemicalSuperClass`` column to include only ``Lipids and lipid-like molecules``.

# Setup

In [1]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import napr
from napr.data import load_terpene
from napr.apps import Terpene
from napr.evaluation import eval_classification
from napr.utils import split_train_test, label_encode
from napr.hyperopt import find_best_models

# Single seed reused below by the train/test split, the XGBoost hypermodel and
# the hyperparameter search.
RANDOM_STATE = 777

# Show the installed napr version.
napr.__version__


'0.1.5'

# Data

In [2]:
# Load the terpene dataset without downloading it (download=False).
# Note: the dataset file, 'terpene-21.3.bz2', is saved by default to the
# current directory.
# NOTE(review): presumably load_terpene(download=True) is required the first
# time, when no local copy exists yet -- confirm against the napr docs.
data = load_terpene(download=False)

# Preview the first rows of the raw table.
data.head()


Unnamed: 0,_id,coconut_id,contains_sugar,heavy_atom_number,name,molecular_formula,molecular_weight,textTaxa,npl_noh_score,npl_score,...,weinerPathNumber,weinerPolarityNumber,zagrebIndex,topoPSA,tpsaEfficiency,iupac_name,chemicalClass,chemicalSubClass,chemicalSuperClass,directParentClassification
3,5f961a9bae0c19564532b966,CNP0330764,0,30,"10-hydroxy-5,9-dimethyl-15-[(3-methylbut-2-eno...",C25H36O5,416.551289,"[""plants"",""Oreoherzogia fallax"",""Ichthyothere ...",2.837158,2.158055,...,2090,69,176,83.83,0.20139,"10-hydroxy-5,9-dimethyl-15-[(3-methylbut-2-eno...",Prenol lipids,Diterpenoids,Lipids and lipid-like molecules,Kaurane diterpenoids
7,5f961a9bae0c19564532b96a,CNP0115481,0,32,"1,6,6,9a,11a-pentamethyl-1-(6-methylhepta-3,5-...",C30H48O2,440.702043,"[""notax""]",3.937131,2.325869,...,2710,74,188,40.46,0.091878,"1,6,6,9a,11a-pentamethyl-1-(6-methylhepta-3,5-...",Steroids and steroid derivatives,Cholestane steroids,Lipids and lipid-like molecules,Cholesterols and derivatives
10,5f961a9bae0c19564532b96d,CNP0151033,1,47,"7-[(acetyloxy)methyl]-4-({[4,5-dihydroxy-6-(hy...",C32H42O15,666.668134,"[""notax""]",3.253205,2.376088,...,8795,79,244,227.97,0.342168,"7-[(acetyloxy)methyl]-4-({[4,5-dihydroxy-6-(hy...",Prenol lipids,Terpene glycosides,Lipids and lipid-like molecules,Terpene glycosides
25,5f961a9cae0c19564532b97c,CNP0298418,0,54,"19-hydroxy-8,17-bis(hydroxymethyl)-1,2,8,15,17...",C41H64N2O11,760.955241,"[""plants""]",3.102837,2.178031,...,9906,134,328,218.27,0.287027,"19-hydroxy-8,17-bis(hydroxymethyl)-1,2,8,15,17...",Prenol lipids,Triterpenoids,Lipids and lipid-like molecules,Triterpenoids
29,5f961a9cae0c19564532b980,CNP0224557,0,47,"1-(acetyloxy)-1-[5-(acetyloxy)-4-{4,5,10-trihy...",C36H52O11,660.792866,"[""notax""]",3.74245,2.495217,...,7366,102,272,165.89,0.251215,"1-(acetyloxy)-1-[5-(acetyloxy)-4-{4,5,10-trihy...",Prenol lipids,Triterpenoids,Lipids and lipid-like molecules,Limonoids


## Preprocessing

In [3]:
# Wrap the loaded table in napr's Terpene app.
terpene = Terpene(data=data)

# Data cleansing and feature engineering
terpene.preprocess()

# Preview terpene.data after preprocessing.
terpene.data.head()


Data preprocessing finished in 0h:00m:1s.


Unnamed: 0,contains_sugar,heavy_atom_number,molecular_weight,npl_noh_score,npl_score,npl_sugar_score,number_of_carbons,number_of_nitrogens,number_of_oxygens,max_number_of_rings,...,bcutDescriptor_1,bcutDescriptor_2,bcutDescriptor_3,bcutDescriptor_4,bcutDescriptor_5,textTaxa_plants,textTaxa_marine,textTaxa_bacteria,textTaxa_fungi,chemicalSubClass
42937,0.0,0.425403,0.419676,-0.869772,-0.498512,-0.066886,0.786493,-0.390276,-0.361147,-0.54314,...,-0.245389,-0.449942,0.240168,-0.357102,-1.971594,0.0,0.0,0.0,0.0,Diradylglycerols
344415,0.0,-1.137152,-1.107608,0.928754,0.540304,0.874951,-1.29601,-0.390276,-0.509048,-0.492037,...,-0.245538,-0.298977,0.193125,-0.658767,-1.108608,0.0,0.0,0.0,0.0,Monoterpenoids
288712,0.0,-0.111725,-0.119538,0.306277,0.451287,0.797333,-0.153992,-0.390276,0.082555,-0.032113,...,-0.244959,-0.518734,0.227001,-0.618832,0.874734,1.0,0.0,0.0,0.0,Oxosteroids
317805,0.0,-0.209385,-0.234849,1.595586,1.186972,1.378648,0.04754,-0.390276,-0.656948,0.172298,...,-0.245679,-0.368269,0.190523,-0.493579,0.877609,1.0,0.0,0.0,0.0,Triterpenoids
162280,0.0,1.35317,1.372408,0.918086,0.844786,-0.968531,1.055203,-0.390276,1.857362,0.325606,...,-0.243217,-0.471545,0.56753,-0.384356,0.967057,0.0,0.0,0.0,0.0,Terpene glycosides


From here on, we show how to classify the terpene data with classic and deep learning methods, both manually and automatically.

## Filtering and encoding the data

In [4]:
# Chemical subclasses of interest
selected_classes = [
    "Triterpenoids",
    "Diterpenoids",
    "Sesquiterpenoids",
    "Terpene lactones",
    "Terpene glycosides",
    "Monoterpenoids",
]

# Split terpene.data into train and test sets with "chemicalSubClass" as the
# target; test_size=0.2 holds out 20% of the rows for testing.
# NOTE(review): presumably rows whose subclass is not in selected_classes are
# dropped by split_train_test -- confirm against the napr docs.
X_train, X_test, y_train, y_test = split_train_test(
    data=terpene.data,
    target="chemicalSubClass",
    selected_classes=selected_classes,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Encode the train and test labels; the encoded training labels are what the
# XGBoost-based cells below are fed.
# NOTE(review): y_labels presumably holds the original class names for the
# encoded values -- confirm.
y_train_encoded, y_test_encoded, y_labels = label_encode(y_train, y_test)


# Classification using classic methods

Here, we benchmark several classic models with mostly default parameters (class weighting is enabled for the random forest and the two SVMs).

In [5]:
# Candidate classifiers keyed by display name. The random forest and both SVMs
# enable class weighting; no other constructor argument is overridden.
classifiers = {
    "kNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(class_weight="balanced_subsample"),
    "Naive Bayes": GaussianNB(),
    "RBF SVM": SVC(class_weight="balanced"),
    "Linear SVM": SVC(kernel="linear", class_weight="balanced"),
    "XGBoost": XGBClassifier(),
}

# Note: XGBClassifier only accepts encoded labels (y parameter)
# NOTE(review): the benchmark call below is commented out, so `results_classic`
# is never created by this cell -- uncomment both statements to run it.
# results_classic = eval_classification(
#     classifiers,
#     X=X_train,
#     y=y_train_encoded,
#     test_size=0.2,
#     random_state=RANDOM_STATE,
#     n_jobs=-1,
# )

# pd.DataFrame(results_classic)

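In the benchmark above (the evaluation call is currently commented out; uncomment it to reproduce the table), XGBoost provides the best results.

Although its metrics are very close to 1, meaning the model already performs very well, we still run hyperparameter optimization to show how it works.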

## Hyperparameter optimization

In [None]:
# Defining the search space
# Note: you can change the space by changing the parameters of the classifier.
def build_hypermodel(hp):
    """Build an unfitted XGBoost classifier from tuner-supplied hyperparameters.

    Only ``n_estimators`` is searched; the seed and ``n_jobs`` are fixed.

    Args:
        hp: Hyperparameter container handed in by the search routine. Only its
            ``Choice`` method is used here.
            NOTE(review): presumably a KerasTuner ``HyperParameters`` object --
            confirm against ``napr.hyperopt``.

    Returns:
        An ``XGBClassifier`` configured with the sampled ``n_estimators``,
        ``random_state=RANDOM_STATE`` and ``n_jobs=-1``.
    """
    # The only active search dimension: number of boosting rounds.
    n_estimators = hp.Choice(
        "n_estimators", [50, 100, 200, 500, 1000], default=100
    )

    # Disabled candidate dimensions (untested); pass any of them to
    # XGBClassifier below to widen the search space:
    #   max_depth=hp.Choice("max_depth", range(5, 30))
    #   learning_rate=hp.Float("learning_rate", 1e-4, 1e-1, sampling="log")
    #   subsample=hp.Float("subsample", 0.25, 1, step=0.25)
    #   colsample_bytree=hp.Float("colsample_bytree", 0.25, 1, step=0.25)
    # gamma and min_child_weight were also considered in earlier drafts.

    return XGBClassifier(
        n_estimators=n_estimators,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

# space={
    # 'max_depth': hp.quniform("max_depth", 3, 18, 1),
    #     'gamma': hp.uniform ('gamma', 1,9),
    #     'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
    #     'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    #     'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    #     'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),

        # 'seed': 0
    # }

# This saves the results of the hyperparameter search to the "tuner" folder in
# the current directory.
# NOTE(review): max_trials=1 and cv=2 look like quick-demo settings (the
# hypermodel offers five n_estimators choices) -- raise them for a real search.
best_model = find_best_models(
    X=X_train,
    y=y_train_encoded,
    max_trials=1,
    cv=2,
    hypermodel=build_hypermodel,
    project_name="tuner",
    overwrite=True,
    random_state=RANDOM_STATE,
    num_models=1
)

# Display the search result.
best_model
