# Performance of the model over validation seasons
We will collect the predictions of the proposed RF model on the validation dataset for four seasons from 2012NH to 2013SH. These predictions will be saved and later used to optimize the classification threshold.

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import utilities   # self defined functions
import model_utilities   # self defined models
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder

# For reproducibility: fix the seed of NumPy's global random number generator
# so that repeated runs draw the same random numbers
SEED = 100
np.random.seed(SEED)

## Variables

In [2]:
# Mutation matrix used to encode the genetic difference
mut_mat = "AZAE970101"

# Validation seasons, 2012NH through 2013SH
Valid_Seasons = ['2012NH', '2012SH', '2013NH', '2013SH']

# One feature name per HA1 position, numbered 1 to 329 inclusive
HA1_features = ["HA1_" + str(position) for position in range(1, 330)]

# Metadata features
meta_features = [
    'virus',          # virus avidity
    'serum',          # antiserum potency
    'virusPassCat',
    'serumPassCat',
]

# Label recording which metadata is being used
metadata = 'a+p+vPC+sPC'

# Type of model to be used
model_name = 'optimized_RF'

## Paths and filenames

In [3]:
# Directories: input data and the location where results are written
path_data   = "../data/"
path_result = "../results/SuppFig5b_optimize_threshold/"

# Create the results directory (including missing parents) if it is absent
Path(path_result).mkdir(parents=True, exist_ok=True)

# Input data, encoded with the selected mutation matrix
data_fn = f"{path_data}nhts_ha1_{mut_mat}.csv"

# Performance scores for the validation dataset
valid_fn = f"{path_result}validScores.csv"

# Virus-antiserum info. together with actual and predicted NHTs
output_fn = f"{path_result}output_validSeasons_{mut_mat}.csv"

## Read data
- Genetic difference (seq_diff) encoded as per the mutation matrix
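- The `seq_diff` column is stored in the CSV as the text representation of a list of floats, so a converter (`ast.literal_eval`) is used to parse each cell back into a Python list while loading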

In [4]:
# Parse every seq_diff cell with literal_eval while reading, so the column
# holds Python objects instead of their string representation
data = pd.read_csv(data_fn, converters={"seq_diff": literal_eval})

## Performance of the model
- For each validation season from 2012NH to 2013SH
    - Split the data into training and validation datasets
    - Prepare encoded inputs (genetic difference and metadata features)
    - Train and evaluate the model
    - Compute predictive performance
    - Save the predictions
- Compute average predictive performance

In [5]:
# Train and validate the model season by season, then micro-average the scores.
#
# Reads:  data, Valid_Seasons, HA1_features, meta_features, mut_mat, model_name,
#         metadata, valid_fn, output_fn
# Writes: scores via utilities.compute_scores(..., valid_fn, ...) and the
#         per-pair predictions to output_fn
# NOTE(review): compute_scores is called once per season and once for the
# average with the same valid_fn; presumably it appends rows to that file --
# confirm whether re-running this cell duplicates rows.

# to collect actuals and predictions for micro-averaged scores over all validation seasons
actual_all  = {}
predict_all = {}

# per-season output tables; concatenated once after the loop instead of growing
# a DataFrame with pd.concat on every iteration (repeated copying, and it relies
# on pandas' deprecated handling of empty frames in concat)
output_frames = []

# identifying columns passed to utilities.compute_scores; defined before the
# loop because the "Average" call after the loop needs them as well
cols = ['mut_mat', 'model', 'metadata', 'season']

# cut-off applied to the predictions to derive the "pred_variant" column
variant_cutoff = 2

# the model function does not change between seasons, so look it up once
model = getattr(model_utilities, f"model_{model_name}")

# loop through each validation season
for valid_season in Valid_Seasons:
    print("Validation Season: ", valid_season)

    # Train Test Split
    #   - based on seasonal framework
    #   - Train: past virus isolates paired with past sera
    #   - Test: circulating virus isolates paired with past sera
    ind_train, ind_valid = utilities.seasonal_trainTestSplit(data.copy(), valid_season)

    # training and validation datasets, re-indexed from 0
    data_train = data.iloc[ind_train].copy().reset_index(drop=True)
    data_valid = data.iloc[ind_valid].copy().reset_index(drop=True)

    # Input features (genetic difference): one column per HA1 feature name,
    # with nan replaced by 0
    X_train = pd.DataFrame(data_train.seq_diff.to_list(),
                           index=data_train.index,
                           columns=HA1_features).fillna(0)
    X_valid = pd.DataFrame(data_valid.seq_diff.to_list(),
                           index=data_valid.index,
                           columns=HA1_features).fillna(0)

    # Input features (metadata features): missing values become the category
    # 'None' and everything is treated as a string before encoding
    X_train_meta = data_train[meta_features].fillna('None').astype('str')
    X_valid_meta = data_valid[meta_features].fillna('None').astype('str')

    # one hot encoding, fitted on the training data only; categories that occur
    # only in the validation data are ignored (handle_unknown='ignore')
    ohe = OneHotEncoder(handle_unknown='ignore')
    X_train_meta = ohe.fit_transform(X_train_meta).toarray()
    X_valid_meta = ohe.transform(X_valid_meta).toarray()

    # final inputs: genetic difference columns followed by encoded metadata
    X_train = np.hstack((X_train.values, X_train_meta))
    X_valid = np.hstack((X_valid.values, X_valid_meta))

    del X_train_meta, X_valid_meta

    # Training and evaluation (optimized model)
    results = model(X_train,
                    data_train.nht.values,
                    X_test = X_valid)

    # Validation scores for this season
    col_values = [mut_mat, model_name, metadata, valid_season]
    utilities.compute_scores(data_valid.nht.values, results['pred_test'], valid_fn, col=cols, col_val=col_values)

    # Save actuals and predictions
    actual_all[f'{valid_season}']  = data_valid.nht.values
    predict_all[f'{valid_season}'] = results['pred_test']

    # Save data info. and output
    output_tmp = data_valid[["virus", "virusPassCat", "virusDate",
                             "serum", "serumPassCat", "serumDate",
                             "nht"]].assign(
        predict=results['pred_test'],
        variant=data_valid["variant"].values,
        pred_variant=(results['pred_test'] > variant_cutoff) * 1,
        season=valid_season,
    )
    output_frames.append(output_tmp)

    #################
    # End season loop
    #################


# Micro-average over seasons: pool the actuals and predictions of all seasons
actual  = np.concatenate(list(actual_all.values()))
predict = np.concatenate(list(predict_all.values()))

# Validation scores
col_values = [mut_mat, model_name, metadata, 'Average']
utilities.compute_scores(actual, predict, valid_fn, col=cols, col_val=col_values)


# Save output: stack the per-season tables in season order and write them out
output = pd.concat(output_frames, ignore_index=True)
output.to_csv(output_fn, index=False)


print("Training and validation completed")

Validation Season:  2012NH
Time for training: 0.9129602909088135
Validation Season:  2012SH
Time for training: 3.0288445949554443
Validation Season:  2013NH
Time for training: 12.691310405731201
Validation Season:  2013SH
Time for training: 30.855534076690674
Training and validation completed
