# Training a Machine Learning Classifier for HP-PPI Prediction Task

Classifier: XGBoost
- performs well with sparse variables
- needs no preprocessing

In [1]:
import os
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from scikitplot.metrics import plot_confusion_matrix, plot_roc
from hyperopt import hp, tpe, STATUS_OK, Trials, space_eval
from hyperopt.fmin import fmin

from IPython.display import display, Markdown

# Suppress warnings
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.simplefilter('ignore', category=(UndefinedMetricWarning, RuntimeWarning))

In [2]:
# Set up directories
parent_dir = os.path.dirname(os.getcwd())

dir_in = os.path.join(parent_dir, 'data', 'features')
dir_out = os.path.join(parent_dir, 'data', 'results')

In [3]:
# Function for combining datasets
def get_dataset(pathogens):
    '''Combine features and labels of different pathogen datasets'''
    
    X = []
    y = []
    
    # Iterate through datasets
    for pathogen in pathogens:
        f_in = os.path.join(dir_in, '%s_features.pkl' % pathogen)
        X_, y_ = joblib.load(f_in)
        X.append(X_)
        y.append(y_)
    
    # Combine features (X) and labels (y)
    X = sparse.vstack(X)
    y = np.concatenate(y)
    
    return X, y

## Hyperparameter Tuning

Bayesian optimization with `hyperopt`

In [4]:
# Load all datasets
pathogens = ['Bacan', 'Yerpe', 'Fratu']
X, y = get_dataset(pathogens)
print('Loaded %i samples with %i features' % X.shape)

# Split into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=7)

pfam_acc = joblib.load('pfam.pkl')[1]
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=pfam_acc)
dvalid = xgb.DMatrix(X_valid, label=y_valid, feature_names=pfam_acc)

Loaded 34955 samples with 4456 features


In [5]:
# Define eval function: F1-score
def f1_eval(y, dtrain):
    y_true = dtrain.get_label()
    y_pred = [1. if y_i > 0.5 else 0. for y_i in y] # binarize output
    
    f1 = f1_score(y_true, y_pred)
    return 'f1', f1

# Define objective function
def objective(params):
    clf = xgb.train(params, dtrain, num_boost_round=1000, evals=[(dvalid, 'eval')],
                    feval=f1_eval, maximize=True, early_stopping_rounds=100,
                    verbose_eval=False)
    
    y_pred = clf.predict(dvalid, ntree_limit=clf.best_ntree_limit)
    score = f1_eval(y_pred, dvalid)[1]
    
    return {'loss': -score, 'status': STATUS_OK}

In [6]:
# Define hyperparameter search space
param_space = {
    'max_depth': hp.choice('max_depth', range(1, 51)),
    'learning_rate': hp.uniform('learning_rate', 0.001, 1.0),
    'min_child_weight': hp.choice('min_child_weight', range(31)),
    'max_delta_step': hp.choice('max_delta_step', range(21)),
    'gamma': hp.uniform('gamma', 0, 10),
    
    'subsample': hp.uniform('subsample', 0.2, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1),
    
    'reg_alpha': hp.uniform('reg_alpha', 0, 10),
    'reg_lambda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.1, 10)
}

# Begin optimization
trials = Trials()
best = fmin(objective, param_space, algo=tpe.suggest, max_evals=100, trials=trials)

# Obtain parameters of best model
best_params = space_eval(param_space, best)
best_params

 51%|█████     | 51/100 [22:20<33:41, 41.25s/it, best loss: -0.7079432195711266]


KeyboardInterrupt: 

In [None]:
# Set up classifier
clf = xgb.XGBClassifier(**best_params, n_estimators=1000, n_jobs=-1)

# Train with validation
_ = clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='auc',
            early_stopping_rounds=100, verbose=1)

# Dump model as pickle
_ = joblib.dump(clf, 'best_model.pkl')

## Evaluate model on partitioned datasets

In [None]:
# Train and test classifier on different datasets
## Partition datasets by pathogen species
i = 0 # track number of tests

for p_test in pathogens:
    p_train = [p for p in pathogens if p != p_test]
    X_train, y_train = get_dataset(p_train)
    
    # Train classifier with validation set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8,
                                                      random_state=7)
    
    _ = clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='auc',
                early_stopping_rounds=100, verbose=0)
    
    # Evaluate performance on test set
    X_test, y_test = get_dataset([p_test])
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)
    
    i += 1
    
    print('Test %i'.center(70) % i)
    print('Test Pathogen: %s\n'.center(70) % p_test)
    print(classification_report(y_test, y_pred, digits=4))
    
    # Plot metrics
    _ = plot_confusion_matrix(y_test, y_pred)
    _ = plot_roc(y_test, y_proba)
    _ = plt.show()
    
    display(Markdown('<hr></hr>'))