# Evaluate the NextFlu substitution model
We will evaluate the performance of the adapted NextFlu substitution model over 14 test seasons from 2014NH to 2020SH.

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
import utilities   # self defined functions
from NextFlu_substitutionModel import nextflu_train, nextflu_predict

# Seed both random number generators with the same fixed value so that
# repeated runs of this notebook draw the same random numbers.
SEED = 100
np.random.seed(SEED)   # NumPy's global generator
random.seed(SEED)      # Python's built-in `random` module

## Variables

In [None]:
# Mutation matrix label (recorded alongside the scores)
mut_mat = 'binary'

# Test seasons from 2014NH to 2020SH: for every year, the "NH" season
# followed by the "SH" season (14 labels in total)
Test_Seasons = [
    f"{year}{suffix}"
    for year in range(2014, 2021)
    for suffix in ("NH", "SH")
]

# Label to record which metadata is being used
metadata = 'a+p'

# The type of model to be used
model_name = 'NextFlu'

## Paths and filenames

In [2]:
# Directories (kept as plain strings; both end with a trailing slash so that
# filenames can be appended directly)
path_data = "../data/"                              # where the input data lives
path_result = "../results/SuppFig6_comparison/"     # where results are written

# Create the results directory, including missing parents; no error if it
# already exists.
Path(path_result).mkdir(exist_ok=True, parents=True)

# Files
data_fn = f"{path_data}nhts_ha1_binary.csv"              # input data
test_fn = f"{path_result}testScores.csv"                 # performance scores for the test dataset
output_fn = f"{path_result}output_test_NextFlu.csv"      # virus-antiserum info., actual and predicted NHTs

## Read data
- Binary-encoded genetic difference (seq_diff); these features are not used by the NextFlu substitution model

In [None]:
# Load the complete dataset from `data_fn` into a DataFrame. The rest of this
# notebook reads the columns virus, serum, virusDate, serumDate, virusPassCat,
# serumPassCat, nht and variant from it.
data = pd.read_csv(data_fn)

## Performance of the model
- For each test season from 2014NH to 2020SH
    - Split the data into training and test datasets
    - Train and evaluate the model
    - Compute predictive performance
    - Save the predictions
- Compute average predictive performance

In [None]:
# Per-season actual and predicted NHT values, keyed by season label. They are
# pooled after the loop to compute micro-averaged scores over all test seasons.
actual_all  = {}
predict_all = {}

# Per-season output tables. They are concatenated once after the loop rather
# than growing a DataFrame inside it, because every in-loop concat would
# re-copy all rows gathered so far.
output_frames = []

# Names of the identifying columns passed to compute_scores (loop-invariant)
cols = ['mut_mat', 'model', 'metadata', 'season']

# loop through test seasons
for test_season in Test_Seasons:
    print("Test Season: ", test_season)

    # ------------------------------------------------------------------
    # Train Test Split
    #     - based on seasonal framework
    #     - Train: past virus isolates paired with past sera
    #     - Test: circulating virus isolates paired with past sera
    # ------------------------------------------------------------------
    ind_train, ind_test = utilities.seasonal_trainTestSplit(
        data[['virus', 'serum', 'virusDate', 'serumDate']], test_season)

    # Positional selection of the rows of each split, re-indexed from 0
    data_train = data.iloc[ind_train].reset_index(drop=True)
    data_test  = data.iloc[ind_test].reset_index(drop=True)

    # ------------------------------------------------------------------
    # Training and evaluation
    # ------------------------------------------------------------------
    # Copies are passed so the project functions cannot modify
    # data_train / data_test, which are used again below.
    mutation_effects, serum_potency, virus_effect = nextflu_train(data_train.copy(),
                                                                  lam_HI=1,
                                                                  lam_pot=0.2,
                                                                  lam_avi=2)

    pred_nextflu = nextflu_predict(data_test.copy(), mutation_effects, serum_potency, virus_effect)

    # ------------------------------------------------------------------
    # Test Scores
    # ------------------------------------------------------------------
    # NOTE(review): compute_scores is given test_fn; presumably it writes the
    # scores for this season to that file - confirm in utilities.
    col_values = [mut_mat, model_name, metadata, test_season]
    utilities.compute_scores(data_test.nht.values, pred_nextflu, test_fn, col=cols, col_val=col_values)

    # ------------------------------------------------------------------
    # Save actuals and predictions
    # ------------------------------------------------------------------
    # test_season is already a string, so it is used directly as the key
    actual_all[test_season]  = data_test.nht.values
    predict_all[test_season] = pred_nextflu

    # ------------------------------------------------------------------
    # Save data info. and output
    # ------------------------------------------------------------------
    output_tmp = data_test[["virus", "virusPassCat", "virusDate",
                            "serum", "serumPassCat", "serumDate",
                            "nht"]].copy()
    output_tmp["predict"]      = pred_nextflu
    output_tmp["variant"]      = data_test["variant"].values
    # 1 where the predicted NHT exceeds 2, else 0
    # NOTE(review): presumably the same threshold that defines `variant` - confirm
    output_tmp["pred_variant"] = (pred_nextflu > 2) * 1
    output_tmp["season"]       = test_season

    output_frames.append(output_tmp)

    #################
    # End season loop
    #################

# Combine the outputs of all seasons with a single concatenation. With no test
# seasons, fall back to an empty DataFrame.
output = pd.concat(output_frames, ignore_index=True) if output_frames else pd.DataFrame()


# ----------------------------------------------------------------------
# Micro-average over seasons
# ----------------------------------------------------------------------
# Pool the per-season arrays (in the order the seasons were inserted) into one
# flat array each, so the scores are computed over all test seasons together.
actual  = np.concatenate([actual_all[season] for season in actual_all])
predict = np.concatenate([predict_all[season] for season in predict_all])

# Test scores for the pooled data, labelled 'Average' in the season column
cols       = ['mut_mat', 'model', 'metadata', 'season']
col_values = [mut_mat, model_name, metadata, 'Average']
utilities.compute_scores(actual, predict, test_fn, col=cols, col_val=col_values)

# ----------------------------------------------------------------------
# Save output
# ----------------------------------------------------------------------
# Write the collected per-season table to output_fn without the row index
output.to_csv(output_fn, index=False)

print("Training and testing completed")