In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from xgboost import XGBClassifier

### Read data file 


In [2]:
def read_data(file, pad_value=-999.0):
    """Parse NMR rows from an iterable of text lines into feature/label arrays.

    Every non-blank line holds comma-separated float values followed by an
    integer class label in the last field. Rows with fewer values than the
    longest row are right-padded with ``pad_value`` so ``X`` is rectangular.

    Parameters
    ----------
    file : iterable of str
        An open text file, or any iterable yielding lines.
    pad_value : float, default -999.0
        Filler appended to rows shorter than the longest row.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` has shape ``(n_rows, max_row_length)`` and ``dtype=object``;
        ``y`` is the 1-D integer array of labels (one per row).
    """
    rows = []
    labels = []
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        # Split once: everything before the last field is a value, the last
        # field is the label. float()/int() ignore surrounding whitespace.
        *values, label = line.split(",")
        rows.append([float(value) for value in values])
        labels.append(int(label))

    width = max((len(row) for row in rows), default=0)
    # Start from an array full of the padding value and overwrite the leading
    # part of each row with the real values.
    X = np.full((len(rows), width), pad_value, dtype=object)
    for i, row in enumerate(rows):
        X[i, : len(row)] = row

    y = np.array(labels, dtype=int)

    return (X, y)

In [3]:
file = open("../data/alkaloids.csv")

In [4]:
# read_data iterates over the whole file, so the handle is not needed
# afterwards: close it as soon as parsing is done instead of leaking it.
# Re-running this cell without re-opening `file` now raises, rather than
# silently returning empty arrays from an exhausted handle.
with file:
    X, y = read_data(file)

In [5]:
X

array([[11.2, 17.0, 25.0, ..., -999.0, -999.0, -999.0],
       [17.3, 30.9, 48.5, ..., -999.0, -999.0, -999.0],
       [14.1, 14.8, 16.0, ..., -999.0, -999.0, -999.0],
       ...,
       [12.9, 17.9, 18.3, ..., -999.0, -999.0, -999.0],
       [7.0, 15.5, 18.9, ..., -999.0, -999.0, -999.0],
       [15.4, 17.1, 17.3, ..., -999.0, -999.0, -999.0]], dtype=object)

In [6]:
X.shape

(17376, 37)

Thus, the maximum number of NMR values per sample (the padded row length) is 37.

#### Now we select the alkaloid samples that contain all 37 values (i.e. no `-999.0` padding):

In [7]:
# Boolean mask that is True wherever the label equals 1 (positive samples)
pos_idcs = np.equal(y, 1)

In [8]:
X_pos = X[pos_idcs]  # Alkaloids NMR

In [9]:
len(X_pos)

137

In [10]:
# Keep only the positive rows that contain no -999.0 padding value
[row for row in X_pos if -999.0 not in row]

[array([15.6, 18.6, 19.5, 20.5, 25.8, 26.5, 27.3, 27.4, 27.7, 29.4, 30.2,
        36.5, 37.0, 37.6, 42.4, 49.9, 52.3, 52.6, 61.8, 64.7, 65.8, 71.4,
        71.5, 73.6, 78.0, 108.3, 109.3, 112.3, 116.3, 121.1, 123.4, 129.7,
        131.3, 138.9, 141.1, 149.2, 152.0], dtype=object),
 array([18.6, 19.0, 19.7, 20.3, 21.4, 24.7, 26.9, 28.9, 30.6, 31.1, 35.1,
        43.6, 47.0, 50.1, 52.7, 58.8, 61.9, 66.1, 66.3, 72.0, 72.4, 74.7,
        76.1, 78.2, 81.0, 107.1, 111.6, 111.9, 120.6, 122.0, 124.6, 125.8,
        133.3, 139.7, 143.3, 149.5, 154.4], dtype=object)]

At inference time, any input with more than 37 values will be predicted as non-alkaloid.

### Model evaluation strategy:
* Hold-out test set = 30% (stratified split)
* Remaining 70% for resampling and hyperparameter search (5-fold stratified CV)

In [11]:
kf = 5

In [12]:
test_size = 0.3

In [13]:
# Stratified hold-out split: a `test_size` fraction of the samples is set
# aside and the class proportions of `y` are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=test_size,
    random_state=0,
)

### Resampling strategy:
1. Random under-sampling of the majority class

2. Combination of over- and under-sampling using SMOTEENN

We choose a minority-to-majority ratio of 0.1 (`sampling_strategy = 0.1`, i.e. about one positive sample for every ten negative ones) as the final imbalance, because we do not want to distort the original class distribution too much.

In [14]:
under_sampler = RandomUnderSampler(sampling_strategy=0.1, random_state=0)

In [15]:
X_under, y_under = under_sampler.fit_resample(X_train, y_train)

In [16]:
X_under.shape

(1056, 37)

In [17]:
resampler = SMOTEENN(sampling_strategy=0.1, random_state=0)

In [18]:
X_res, y_res = resampler.fit_resample(X_train, y_train)

In [19]:
# new number of positive samples
len(y_res[y_res == 1])

1109

In [20]:
X_res.shape

(12936, 37)

In [21]:
# Hyperopt search space for the XGBoost classifier.
# - hp.quniform draws are floats (e.g. 9.0); the objective casts them to int.
# - hp.choice parameters are stored in the trial records by the *index* of
#   the chosen option (e.g. 'seed': 2 means the third option, 66), not by
#   value; hyperopt.space_eval maps a result back to actual values.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    reg_alpha=hp.uniform("reg_alpha", 0, 10),
    reg_lambda=hp.uniform("reg_lambda", 0, 10),
    min_child_weight=hp.quniform("min_child_weight", 0, 5, 1),
    n_estimators=hp.quniform("n_estimators", 80, 320, 20),
    seed=hp.choice("seed", [22, 44, 66]),
    eta=hp.uniform("eta", 0.1, 0.9),
)

In [22]:
def objective(space, kf=kf, X=X_under, y=y_under):
    """Hyperopt objective: mean cross-validated ROC AUC of an XGBoost model.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. The quantised entries
        (``max_depth``, ``min_child_weight``, ``n_estimators``) arrive as
        floats and are cast to ``int`` here.
    kf : int
        Number of stratified, shuffled CV folds.
    X, y : array-like
        Data used for cross-validation; defaults to the randomly
        under-sampled training set.

    Returns
    -------
    dict
        ``{"loss": -score, "status": STATUS_OK}``. The mean AUC is negated
        so that a higher AUC corresponds to a lower loss.
    """
    params = {
        "max_depth": int(space["max_depth"]),
        "reg_alpha": space["reg_alpha"],
        "reg_lambda": space["reg_lambda"],
        "min_child_weight": int(space["min_child_weight"]),
        "n_estimators": int(space["n_estimators"]),
        "seed": space["seed"],
        "eta": space["eta"],
    }
    clf = XGBClassifier(**params)

    # Fixed random_state so every trial is scored on the same fold assignment.
    cv = StratifiedKFold(n_splits=kf, shuffle=True, random_state=22)
    fold_scores = cross_val_score(clf, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)
    score = fold_scores.mean()
    print(f"SCORE: {score}")

    return {"loss": -score, "status": STATUS_OK}

In [23]:
trials = Trials()

# Seed the TPE sampler so the sequence of sampled hyperparameters is
# reproducible across kernel restarts (it was previously unseeded).
# NOTE: hyperopt >= 0.2.7 expects a numpy Generator for `rstate`; older
# releases expect np.random.RandomState(0) instead.
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(0),
)

SCORE: 0.8612787828947368                             
SCORE: 0.8600685307017544                                                        
SCORE: 0.9013925438596491                                                        
SCORE: 0.874575109649123                                                         
SCORE: 0.8682894736842105                                                        
SCORE: 0.8678549890350876                                                        
SCORE: 0.8820668859649123                                                        
SCORE: 0.8612308114035088                                                        
SCORE: 0.8795038377192983                                                        
SCORE: 0.864797149122807                                                         
SCORE: 0.8888089364035088                                                         
SCORE: 0.872501370614035                                                          
SCORE: 0.8878440241228072                

In [24]:
trials.best_trial

{'state': 2,
 'tid': 14,
 'spec': None,
 'result': {'loss': -0.9088473135964911, 'status': 'ok'},
 'misc': {'tid': 14,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'eta': [14],
   'max_depth': [14],
   'min_child_weight': [14],
   'n_estimators': [14],
   'reg_alpha': [14],
   'reg_lambda': [14],
   'seed': [14]},
  'vals': {'eta': [0.34993945793171993],
   'max_depth': [9.0],
   'min_child_weight': [4.0],
   'n_estimators': [140.0],
   'reg_alpha': [0.7144615122380271],
   'reg_lambda': [2.26985485325398],
   'seed': [2]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2022, 9, 29, 3, 23, 51, 684000),
 'refresh_time': datetime.datetime(2022, 9, 29, 3, 23, 53, 857000)}

#### Now, using the set resampled with combined over- and under-sampling (SMOTEENN)

In [25]:
def objective(space, kf=kf, X=X_res, y=y_res):
    """Hyperopt objective: mean cross-validated ROC AUC of an XGBoost model.

    Same scoring procedure as for the under-sampled set, but the defaults
    point at the SMOTEENN-resampled training data.

    NOTE(review): ``X_res``/``y_res`` were resampled *before* the CV folds
    are drawn here, so validation folds contain resampled (possibly
    synthetic) samples. The resulting AUC is likely optimistic; consider
    resampling inside each training fold instead. TODO confirm.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. The quantised entries
        (``max_depth``, ``min_child_weight``, ``n_estimators``) arrive as
        floats and are cast to ``int`` here.
    kf : int
        Number of stratified, shuffled CV folds.
    X, y : array-like
        Data used for cross-validation.

    Returns
    -------
    dict
        ``{"loss": -score, "status": STATUS_OK}``. The mean AUC is negated
        so that a higher AUC corresponds to a lower loss.
    """
    params = {
        "max_depth": int(space["max_depth"]),
        "reg_alpha": space["reg_alpha"],
        "reg_lambda": space["reg_lambda"],
        "min_child_weight": int(space["min_child_weight"]),
        "n_estimators": int(space["n_estimators"]),
        "seed": space["seed"],
        "eta": space["eta"],
    }
    clf = XGBClassifier(**params)

    # Fixed random_state so every trial is scored on the same fold assignment.
    cv = StratifiedKFold(n_splits=kf, shuffle=True, random_state=22)
    fold_scores = cross_val_score(clf, X, y, scoring="roc_auc", cv=cv, n_jobs=-1)
    score = fold_scores.mean()
    print(f"SCORE: {score}")

    return {"loss": -score, "status": STATUS_OK}

In [26]:
# NOTE: this rebinds `trials` and `best_hyperparams`, so the results of the
# previous search are no longer reachable under those names after this cell.
trials = Trials()

# Seed the TPE sampler so the sequence of sampled hyperparameters is
# reproducible across kernel restarts (it was previously unseeded).
# NOTE: hyperopt >= 0.2.7 expects a numpy Generator for `rstate`; older
# releases expect np.random.RandomState(0) instead.
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(0),
)

SCORE: 0.9955438213934722                             
SCORE: 0.9970728291393021                                                        
SCORE: 0.9987200775148711                                                        
SCORE: 0.9976912787339455                                                        
SCORE: 0.9980461852913063                                                        
SCORE: 0.9954185657528731                                                        
SCORE: 0.9988911662063462                                                        
SCORE: 0.9971198607666443                                                        
SCORE: 0.9990222825794589                                                        
SCORE: 0.9989338056050592                                                        
SCORE: 0.9977864806927694                                                         
SCORE: 0.9969122830434642                                                         
SCORE: 0.9959227210212729                

In [27]:
trials.best_trial

{'state': 2,
 'tid': 8,
 'spec': None,
 'result': {'loss': -0.9990222825794589, 'status': 'ok'},
 'misc': {'tid': 8,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'eta': [8],
   'max_depth': [8],
   'min_child_weight': [8],
   'n_estimators': [8],
   'reg_alpha': [8],
   'reg_lambda': [8],
   'seed': [8]},
  'vals': {'eta': [0.5076577956543429],
   'max_depth': [16.0],
   'min_child_weight': [5.0],
   'n_estimators': [100.0],
   'reg_alpha': [0.2603384226801031],
   'reg_lambda': [4.327340572731117],
   'seed': [2]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2022, 9, 29, 3, 25, 49, 190000),
 'refresh_time': datetime.datetime(2022, 9, 29, 3, 25, 59, 747000)}