# Introduction

This tutorial shows how to classify terpenes using both classic machine learning methods and deep learning methods.

The terpene dataset is a subset of the [COCONUT](https://coconut.naturalproducts.net) dataset (March 2021 version). It is obtained by filtering COCONUT's ``chemicalSuperClass`` column to keep only the entries labelled ``Lipids and lipid-like molecules``.

# Setup

In [1]:
from pathlib import Path

import napr
from napr.apps import Terpene
from napr.data import load_terpene

# Display the installed napr version so readers can compare it with the one
# shown in this cell's saved output.
napr.__version__

'0.1.5'

# Data

In [2]:
# Load the terpene dataset, downloading it only if it is not already present.
# Note: the dataset, 'terpene-21.3.bz2', is saved by default to the current
# directory, so re-running this cell (e.g. Restart & Run All) reuses the local
# copy instead of fetching the archive again.
# Delete the file to force a fresh download (e.g. after an interrupted one).
dataset_file = Path("terpene-21.3.bz2")
data = load_terpene(download=not dataset_file.exists())

data

Downloading terpene-21.3.bz2: 100%|██████████| 20.3M/20.3M [00:01<00:00, 10.7MiB/s]


Unnamed: 0,_id,coconut_id,contains_sugar,heavy_atom_number,name,molecular_formula,molecular_weight,textTaxa,npl_noh_score,npl_score,...,weinerPathNumber,weinerPolarityNumber,zagrebIndex,topoPSA,tpsaEfficiency,iupac_name,chemicalClass,chemicalSubClass,chemicalSuperClass,directParentClassification
3,5f961a9bae0c19564532b966,CNP0330764,0,30,"10-hydroxy-5,9-dimethyl-15-[(3-methylbut-2-eno...",C25H36O5,416.551289,"[""plants"",""Oreoherzogia fallax"",""Ichthyothere ...",2.837158,2.158055,...,2090,69,176,83.83,0.201390,"10-hydroxy-5,9-dimethyl-15-[(3-methylbut-2-eno...",Prenol lipids,Diterpenoids,Lipids and lipid-like molecules,Kaurane diterpenoids
7,5f961a9bae0c19564532b96a,CNP0115481,0,32,"1,6,6,9a,11a-pentamethyl-1-(6-methylhepta-3,5-...",C30H48O2,440.702043,"[""notax""]",3.937131,2.325869,...,2710,74,188,40.46,0.091878,"1,6,6,9a,11a-pentamethyl-1-(6-methylhepta-3,5-...",Steroids and steroid derivatives,Cholestane steroids,Lipids and lipid-like molecules,Cholesterols and derivatives
10,5f961a9bae0c19564532b96d,CNP0151033,1,47,"7-[(acetyloxy)methyl]-4-({[4,5-dihydroxy-6-(hy...",C32H42O15,666.668134,"[""notax""]",3.253205,2.376088,...,8795,79,244,227.97,0.342168,"7-[(acetyloxy)methyl]-4-({[4,5-dihydroxy-6-(hy...",Prenol lipids,Terpene glycosides,Lipids and lipid-like molecules,Terpene glycosides
25,5f961a9cae0c19564532b97c,CNP0298418,0,54,"19-hydroxy-8,17-bis(hydroxymethyl)-1,2,8,15,17...",C41H64N2O11,760.955241,"[""plants""]",3.102837,2.178031,...,9906,134,328,218.27,0.287027,"19-hydroxy-8,17-bis(hydroxymethyl)-1,2,8,15,17...",Prenol lipids,Triterpenoids,Lipids and lipid-like molecules,Triterpenoids
29,5f961a9cae0c19564532b980,CNP0224557,0,47,"1-(acetyloxy)-1-[5-(acetyloxy)-4-{4,5,10-trihy...",C36H52O11,660.792866,"[""notax""]",3.742450,2.495217,...,7366,102,272,165.89,0.251215,"1-(acetyloxy)-1-[5-(acetyloxy)-4-{4,5,10-trihy...",Prenol lipids,Triterpenoids,Lipids and lipid-like molecules,Limonoids
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401615,5f9651e6ae0c19564538da32,CNP0097739,0,69,"8-cyclopentyl-17-[2-(3,3-dimethyloxiran-2-yl)-...",C61H86N2O6,943.347631,"[""plants""]",2.820072,2.049152,...,19362,150,414,111.63,0.118422,"8-cyclopentyl-17-[2-(3,3-dimethyloxiran-2-yl)-...",Prenol lipids,Triterpenoids,Lipids and lipid-like molecules,Triterpenoids
401616,5f9651e6ae0c19564538da33,CNP0251477,0,19,Armatin E,C16H24O3,264.360567,"[""Nephthea armata"",""marine""]",3.134144,2.178648,...,598,38,108,35.53,0.134495,"2-methoxy-1,9,9a-trimethyl-1H,2H,3aH,4H,5H,7H,...",Prenol lipids,Sesquiterpenoids,Lipids and lipid-like molecules,Sesquiterpenoids
401619,5f9651e6ae0c19564538da36,CNP0298184,0,25,"8-hydroxy-3,6,10-trimethyl-2-oxo-2H,3H,3aH,4H,...",C20H28O5,348.434084,"[""notax""]",3.409606,2.281494,...,1372,41,126,72.83,0.209165,"8-hydroxy-3,6,10-trimethyl-2-oxo-2H,3H,3aH,4H,...",Prenol lipids,Terpene lactones,Lipids and lipid-like molecules,Germacranolides and derivatives
401621,5f9651e7ae0c19564538da38,CNP0298814,0,67,Eryloside J,C50H81NO16,952.177178,"[""Erylus nobilis""]",3.108389,2.137312,...,21158,142,378,263.39,0.276799,"1-(6,6-dimethyl-5-methylideneheptan-2-yl)-7-[(...",Prenol lipids,Triterpenoids,Lipids and lipid-like molecules,Triterpenoids


# Preprocessing

In [3]:
# Wrap the loaded table in the Terpene app and run its preprocessing step.
# NOTE(review): judging by the output below, preprocessing appears to scale the
# numeric descriptors, expand `textTaxa` into indicator columns and keep
# `chemicalSubClass` as the last column -- confirm against the napr docs.
terpene = Terpene(data=data)
terpene.preprocess()

# Show the table held by the app after preprocessing.
terpene.data

Data preprocessing finished in 0h:00m:1s.


Unnamed: 0,contains_sugar,heavy_atom_number,molecular_weight,npl_noh_score,npl_score,npl_sugar_score,number_of_carbons,number_of_nitrogens,number_of_oxygens,max_number_of_rings,...,bcutDescriptor_1,bcutDescriptor_2,bcutDescriptor_3,bcutDescriptor_4,bcutDescriptor_5,textTaxa_plants,textTaxa_marine,textTaxa_bacteria,textTaxa_fungi,chemicalSubClass
42937,0.0,0.425403,0.419676,-0.869772,-0.498512,-0.066886,0.786493,-0.390276,-0.361147,-0.543140,...,-0.245389,-0.449942,0.240168,-0.357102,-1.971594,0.0,0.0,0.0,0.0,Diradylglycerols
344415,0.0,-1.137152,-1.107608,0.928754,0.540304,0.874951,-1.296010,-0.390276,-0.509048,-0.492037,...,-0.245538,-0.298977,0.193125,-0.658767,-1.108608,0.0,0.0,0.0,0.0,Monoterpenoids
288712,0.0,-0.111725,-0.119538,0.306277,0.451287,0.797333,-0.153992,-0.390276,0.082555,-0.032113,...,-0.244959,-0.518734,0.227001,-0.618832,0.874734,1.0,0.0,0.0,0.0,Oxosteroids
317805,0.0,-0.209385,-0.234849,1.595586,1.186972,1.378648,0.047540,-0.390276,-0.656948,0.172298,...,-0.245679,-0.368269,0.190523,-0.493579,0.877609,1.0,0.0,0.0,0.0,Triterpenoids
162280,0.0,1.353170,1.372408,0.918086,0.844786,-0.968531,1.055203,-0.390276,1.857362,0.325606,...,-0.243217,-0.471545,0.567530,-0.384356,0.967057,0.0,0.0,0.0,0.0,Terpene glycosides
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227606,0.0,-1.381301,-1.345976,-1.536997,-1.535827,-0.971972,-1.497542,-0.390276,-0.804849,-0.543140,...,-0.245825,1.511916,0.254906,-0.129473,-2.004184,1.0,0.0,0.0,0.0,Fatty acid esters
56220,0.0,-0.307044,-0.323454,1.499374,1.012819,1.268867,-0.086815,-0.390276,-0.656948,0.172298,...,-0.245680,-0.510236,-1.819423,1.053818,0.904475,0.0,0.0,0.0,0.0,Triterpenoids
269486,0.0,-0.160555,-0.160117,1.155807,1.002888,1.347790,-0.086815,-0.390276,-0.213247,0.172298,...,-0.245247,-0.523881,-1.740421,1.099685,1.075874,0.0,0.0,0.0,0.0,Triterpenoids
349333,0.0,1.109021,1.158810,-0.722527,-0.741235,-0.317936,1.391091,0.543299,0.082555,-0.543140,...,1.897255,0.659092,3.478055,-2.275868,-1.737645,0.0,0.0,0.0,0.0,Glycerophosphocholines


# Classification using classic methods

# Classification using deep learning