# Convert flat HI titre table to matrix form
We compiled HI titre data from [Crick WIC](https://www.crick.ac.uk/partnerships/worldwide-influenza-centre/annual-and-interim-reports) in a flat table format (as used by [Bedford *et al.* 2014](https://doi.org/10.7554/eLife.01914)). Here, we convert it to the HI titre matrix format (as used by [Smith *et al.* 2004](https://doi.org/10.1126/science.1097211)) because the [RACMACS](https://acorg.github.io/Racmacs/) package takes its input in this form.

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

## Paths and filenames

In [2]:
# Directory that holds every input and output file of this notebook
path_data = "../data/"

# Input: flat HI titer table
titer_fn = f"{path_data}flat_HI_table.csv"
# Input: NHT table
nht_fn = f"{path_data}nhts_ha1.csv"
# Output: HI titre matrix
matrix_fn = f"{path_data}titre_matrix.csv"

## Read and preprocess HI titer table

In [3]:
# Read the flat HI titer table.
titer_table = pd.read_csv(titer_fn)

# Keep only the fields used below. The explicit copy makes titer_table an
# independent frame, so the column assignments that follow never write into
# a slice of the frame returned by read_csv.
titer_table = titer_table[["virusName", "virusPassage", "virusDate",
                           "serumName", "serumPassage", "serumDate",
                           "titer"]].copy()

# Label viruses and sera.
# Missing passage values are read as NaN; turn them into the string 'nan'
# so that the concatenation below yields a label instead of NaN.
# Assign the result back to the column: an in-place replace on
# `titer_table.virusPassage` acts on a temporary Series and is not
# guaranteed to update the frame (it does not under pandas Copy-on-Write).
titer_table["virusPassage"] = titer_table["virusPassage"].fillna("nan")
titer_table["serumPassage"] = titer_table["serumPassage"].fillna("nan")

# Combine name and passage with the separator '='.
titer_table['virus'] = titer_table.virusName + '=' + titer_table.virusPassage
titer_table['serum'] = titer_table.serumName + '=' + titer_table.serumPassage

## Read NHT table and find unique isolates

In [4]:
# Load the NHT table.
nht_table = pd.read_csv(nht_fn)

# Distinct labels appearing in the virus column and in the serum column.
viruses = nht_table["virus"].unique()
sera = nht_table["serum"].unique()

# One-column frame of every label seen as a virus or as a serum, with
# labels that occur in both roles listed once (first occurrence kept).
all_labels = np.concatenate((viruses, sera))
isolates = pd.DataFrame(all_labels, columns=["isolates"]).drop_duplicates(subset=["isolates"])

## Keep titers that match unique isolates in NHT table

In [5]:
# A titer row is kept only when both its virus label and its serum label
# occur among the isolates collected from the NHT table.
virus_is_known = titer_table.virus.isin(isolates.isolates)
serum_is_known = titer_table.serum.isin(isolates.isolates)
titer_table = titer_table[virus_is_known & serum_is_known]

## Assign seasons and keep data from 2003NH to 2020SH

In [6]:
def assign_season(date):
    """Return the influenza season label for a date string.

    The date is expected in the form "YYYY-MM-DD": the first four characters
    are taken as the year and everything from the sixth character on as the
    month-day part, which is compared lexicographically against "MM-DD"
    bounds.

    Seasons:
        01-01 .. 01-31  -> "<year>NH"
        02-01 .. 08-31  -> "<year>SH"
        09-01 .. 12-31  -> "<year + 1>NH"

    Parameters
    ----------
    date : str
        Date string. Any non-string value (e.g. the NaN pandas uses for a
        missing date) is treated as an unknown date.

    Returns
    -------
    str or None
        The season label, or None when the date is missing, incomplete, or
        its month-day part falls outside every range above (for example
        "2009-XX-XX").
    """
    # Missing dates arrive as NaN/None; slicing them would raise TypeError.
    if not isinstance(date, str):
        return None

    year, month_day = date[:4], date[5:]

    # The lower bound keeps incomplete dates (e.g. "2009", whose month-day
    # part is empty) from being counted as January.
    if "01-01" <= month_day <= "01-31":
        return year + "NH"
    if "02-01" <= month_day <= "08-31":
        return year + "SH"
    if "09-01" <= month_day <= "12-31":
        # September to December belong to the next year's NH season.
        try:
            return str(int(year) + 1) + "NH"
        except ValueError:
            # Year part is not numeric, so no season can be derived.
            return None
    return None

# Label every titer row with the season of its virus date.
# assign() returns a new frame, so the new column is never written into a
# frame that was itself produced by filtering another frame.
titer_table = titer_table.assign(season=titer_table.virusDate.apply(assign_season))

# Keep data from 2003NH to 2020SH. Rows whose season is None (no season
# could be derived from the date) are dropped here as well.
seasons = [str(year) + half for year in range(2003, 2021) for half in ("NH", "SH")]

titer_table = titer_table[titer_table.season.isin(seasons)]

## Mean titer value of duplicate virus-antiserum pairs (repeated HI assays)
- For the same virus-antiserum pairs, the repeated HI assays will result in multiple HI titre values
- We will compute the geometric mean of the HI titre values for such repeated HI assays (i.e. the mean of the log2 titres, transformed back to the titre scale)

In [7]:
# Work on the log2 scale so that the group mean, once transformed back, is
# the geometric mean of the repeated titers. assign() builds a new frame
# instead of writing into the filtered titer_table.
log_titers = titer_table.assign(titer=np.log2(titer_table.titer))

# One row per (virus, serum) pair. Only the columns needed later are
# aggregated, each with its own function: mean for the log2 titer and the
# latest (maximum) date string for the two date columns. Calling .mean() on
# the whole group would also be applied to the text columns, which pandas
# >= 2.0 rejects with a TypeError.
titers_group = log_titers.groupby(["virus", "serum"], as_index=False)
titer_table  = titers_group.agg(
    titer=("titer", "mean"),
    virusDate=("virusDate", "max"),
    serumDate=("serumDate", "max"),
)

# Back-transform the mean log2 titer to the titer scale.
titer_table["titer"] = np.power(2, titer_table.titer)

## Combine metadata fields with a separator '=' and get column fields of virus, serum, and titer

In [8]:
# Final labels are "<date>=<existing label>" for both viruses and sera.
virus_label = titer_table.virusDate + '=' + titer_table.virus
serum_label = titer_table.serumDate + '=' + titer_table.serum

# Swap in the new labels and keep just the three columns the matrix needs.
titer_table = titer_table.assign(virus=virus_label, serum=serum_label)
titer_table = titer_table[["virus", "serum", "titer"]]

## Convert to matrix form

In [9]:
# Index the long table by (virus, serum) pair.
titer_table = titer_table.set_index(['virus', 'serum'])

# Pivot to matrix form.
# stack() moves the only remaining column label ('titer') into a third,
# unnamed index level, giving levels (0, 1, 2) = (virus, serum, 'titer').
# unstack(['serum']) then spreads the sera across the columns; pairs with
# no entry are filled with 0.
stacked_titers = titer_table.stack()
titer_matrix = stacked_titers.unstack(['serum'], fill_value=0)

# Every 0 in the matrix is written as "*".
titer_matrix = titer_matrix.replace(0, "*")

# The row index is still (virus, 'titer'); drop the second, unnamed level
# so that rows are indexed by virus only, then save the matrix.
titer_matrix = titer_matrix.droplevel(1)
titer_matrix.to_csv(matrix_fn)

## Get 2D coordinates of virus isolates
Run the R script "Fig3a_part2_virusCoords_racmacs.R" to compute the 2D coordinates (antigenic cartography) of the virus isolates using [RACMACS](https://acorg.github.io/Racmacs/).

## Then run file "Fig3a_part3_seasonal_antigenic_cartography"