# Trained model

We train the optimized random forest (RF) model, using all four metadata features and the top mutation matrix (AZAE970101), on all available data from 2003NH to 2020SH, and then save the trained model.

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import utilities   # self defined functions
import model_utilities   # self defined models
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder
import joblib

# (for reproducibility) fix the seed of NumPy's global random number generator
# NOTE(review): this only seeds NumPy's global RNG; whether the model built in
# model_utilities draws from it (or needs its own random_state) is not visible here -- confirm
SEED = 100
np.random.seed(SEED)

## Variables

In [2]:
# mutation matrix used to encode the genetic difference
mut_mat = "AZAE970101"

# names of the genetic-difference features: HA1_1, HA1_2, ..., HA1_329
HA1_features = ["HA1_" + str(pos) for pos in range(1, 330)]

# metadata features:
#   virus        -> virus avidity
#   serum        -> antiserum potency
#   virusPassCat, serumPassCat
meta_features = ['virus', 'serum', 'virusPassCat', 'serumPassCat']

# label to record which metadata is being used
metadata = 'a+p+vPC+sPC'

# the type of model to be used
model_name = 'optimized_RF'

## Paths and filenames

In [3]:
# directories
path_data   = "../data/"   # where the input data lives
path_result = "../results/Fig2_performance_RF/"   # where results are written

# create the results directory (and any missing parents); no error if it already exists
Path(path_result).mkdir(parents=True, exist_ok=True)

# files
data_fn          = f"{path_data}nhts_ha1_{mut_mat}.csv"   # input data for the chosen mutation matrix
trained_model_fn = f"{path_result}optimized_rf_model_trainStart2003NH_trainEnd2020SH.joblib"   # trained model is saved here

## Read data
- Genetic difference (seq_diff) encoded as per the mutation matrix
- Converter is used to load the genetic difference saved as a list of floats

In [4]:
# seq_diff is stored in the CSV as the text of a list of floats;
# literal_eval turns each cell back into a Python list while loading
data = pd.read_csv(
    data_fn,
    converters={"seq_diff": literal_eval},
)

## Train the model
- Consider the test season as 2021NH
    - Split the data into training and test datasets
    - Prepare encoded inputs (genetic difference and metadata features)
    - Train and save the model

In [6]:
# ----------------------------------------------------------------------
# Train Test Split
#     - based on seasonal framework
#     - Train: past virus isolates paired with past sera
#     - Test: circulating virus isolates paired with past sera
# Only the training indices are kept; the test indices are discarded.
# ----------------------------------------------------------------------
ind_train, _ = utilities.seasonal_trainTestSplit(data.copy(), "2021NH")

# training dataset, re-indexed from 0
data_train = data.iloc[ind_train].copy().reset_index(drop=True)


# ----------------------------------------------------------------------
# Input features (genetic difference)
# ----------------------------------------------------------------------
# expand each seq_diff list into one column per name in HA1_features,
# replacing nan with 0
X_train = pd.DataFrame(
    data_train["seq_diff"].to_list(),
    index=data_train.index,
    columns=HA1_features,
).fillna(0)


# ----------------------------------------------------------------------
# Input features (metadata features)
# ----------------------------------------------------------------------
# missing values become the string 'None' and every value is cast to str
# before one hot encoding
# NOTE(review): the fitted encoder `ohe` is not saved in this cell; confirm that
# whoever loads the saved model can rebuild the same encoding
ohe = OneHotEncoder(handle_unknown='ignore')
X_train_meta = ohe.fit_transform(
    data_train[meta_features].fillna('None').astype('str')
).toarray()

# final design matrix: genetic-difference columns followed by encoded metadata columns
X_train = np.concatenate((X_train.to_numpy(), X_train_meta), axis=1)

del X_train_meta


# ----------------------------------------------------------------------
# Training
# ----------------------------------------------------------------------
# look up the model function by name in model_utilities
model = getattr(model_utilities, f"model_{model_name}")
results = model(
    X_train,
    data_train["nht"].values,
    X_test=X_train,   # we will not use these predictions, just passed to avoid argument error
)

# ----------------------------------------------------------------------
# save RF model trained on data from 2003NH to 2020SH
# ----------------------------------------------------------------------
joblib.dump(results['model'], trained_model_fn)

Time for training: 303.0454161167145


['../results/Fig2_performance_RF/optimized_rf_model_trainStart2003NH_trainEnd2020SH.joblib']