# Convert flat HI titer table to matrix form

## Imports

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

## Paths and filenames

In [2]:
# Data directory, given as a relative path. The trailing slash is kept so
# that file names can be appended to it directly.
path_data = "../data/"

# Input and output files inside the data directory.
titer_fn = f"{path_data}flat_HI_table.csv"    # input: flat HI titer table
nht_fn = f"{path_data}nhts_ha1.csv"           # input: NHT table
matrix_fn = f"{path_data}titer_matrix.csv"    # output: HI titer matrix

## Read and preprocess HI titer table

In [3]:
# Read the flat HI titer table and keep only the fields used in this notebook.
# .copy() makes the column selection an independent frame, so the column
# assignments below act on titer_table itself and not on a temporary.
titer_columns = ["virusName", "virusPassage", "virusDate",
                 "serumName", "serumPassage", "serumDate",
                 "titer"]
titer_table = pd.read_csv(titer_fn)[titer_columns].copy()

# Label viruses and sera.
# The passage fields can contain missing values (NaN). Replace them with the
# literal string 'nan' so that the string concatenation below always yields a
# usable label. The result is assigned back to the column explicitly: calling
# an in-place method on `titer_table.virusPassage` only modifies a temporary
# object when pandas Copy-on-Write is active, which would leave the NaN values
# in place.
for passage_col in ("virusPassage", "serumPassage"):
    titer_table[passage_col] = titer_table[passage_col].fillna("nan")

# Combine name and passage with the separator '=' to form the isolate labels.
titer_table["virus"] = titer_table["virusName"] + "=" + titer_table["virusPassage"]
titer_table["serum"] = titer_table["serumName"] + "=" + titer_table["serumPassage"]

## Read NHT table and find unique isolates

In [4]:
# Load the NHT table and collect every isolate label that appears in it,
# either as a virus or as a serum, without repetitions.
nht_table = pd.read_csv(nht_fn)

viruses = nht_table["virus"].unique()
sera = nht_table["serum"].unique()

# Stack both label arrays into a single-column frame, then drop labels that
# occur in both the virus and the serum list.
all_labels = np.concatenate((viruses, sera))
isolates = pd.DataFrame({"isolates": all_labels}).drop_duplicates(subset=["isolates"])

## Keep titers that match unique isolates in NHT table

In [5]:
# Keep a titer row only when both its virus label and its serum label are
# among the isolates found in the NHT table.
known_isolates = isolates["isolates"]
virus_is_known = titer_table["virus"].isin(known_isolates)
serum_is_known = titer_table["serum"].isin(known_isolates)
titer_table = titer_table[virus_is_known & serum_is_known]

## Assign seasons and keep data from 2003NH to 2020SH

In [6]:
# Work on an independent copy: titer_table may be a filtered selection of an
# earlier frame, and a new column is added to it below.
titer_table = titer_table.copy()

# virusDate is sliced as a "YYYY-MM-DD" string: the first four characters are
# the year and everything from position 5 on is the "MM-DD" part, so seasons
# can be assigned by plain string comparison.
virus_year = titer_table["virusDate"].str[:4]
virus_month_day = titer_table["virusDate"].str[5:]

# Season rules:
#   up to 01-31      -> NH of the same year
#   02-01 to 08-31   -> SH of the same year
#   09-01 to 12-31   -> NH of the following year
in_same_year_nh = virus_month_day <= "01-31"
in_sh = (virus_month_day >= "02-01") & (virus_month_day <= "08-31")
in_next_year_nh = (virus_month_day >= "09-01") & (virus_month_day <= "12-31")

# Start from an all-missing column so that "season" exists even when the
# table is empty or a date matches none of the rules above.
season = pd.Series(np.nan, index=titer_table.index, dtype=object)
season[in_same_year_nh] = virus_year[in_same_year_nh] + "NH"
season[in_sh] = virus_year[in_sh] + "SH"
# Only the September-December rows need their year parsed as an integer.
next_year = (virus_year[in_next_year_nh].astype(int) + 1).astype(str)
season[in_next_year_nh] = next_year + "NH"
titer_table["season"] = season

# Keep data from 2003NH to 2020SH.
seasons = [f"{year}{half}" for year in range(2003, 2021) for half in ("NH", "SH")]

titer_table = titer_table[titer_table["season"].isin(seasons)]

## Combine metadata fields with the separator '=' and keep only the virus, serum, and titer columns

In [7]:
# Rebuild the virus and serum labels as "<date>=<name>=<passage>", replacing
# the shorter "<name>=<passage>" labels used so far.
for role in ("virus", "serum"):
    date_part = titer_table[role + "Date"]
    name_part = titer_table[role + "Name"]
    passage_part = titer_table[role + "Passage"]
    titer_table[role] = date_part + '=' + name_part + '=' + passage_part

# Only the two labels and the titer value are kept from here on.
titer_table = titer_table.loc[:, ["virus", "serum", "titer"]]

## Geometric mean titer of duplicate virus-antiserum pairs (repeated HI assays)

In [8]:
# Collapse repeated virus-serum pairs into a single titer: take log2 of the
# titers, average within each (virus, serum) pair, and transform the average
# back with 2**x. This is the geometric mean of the repeated measurements.
titer_table = (
    titer_table
    .assign(titer=lambda frame: np.log2(frame["titer"]))
    .groupby(["virus", "serum"], as_index=False)
    .mean()
    .assign(titer=lambda frame: np.power(2, frame["titer"]))
)

## Convert to matrix form

In [9]:
# Index the long table by (virus, serum).
titer_table = titer_table.set_index(["virus", "serum"])

# stack() moves the remaining column label ("titer") into a third, unnamed
# index level, giving a Series indexed by (virus, serum, column label).
stacked_titers = titer_table.stack()

# Spread the serum level across the columns; virus-serum pairs that were not
# measured are filled with 0.
titer_matrix = stacked_titers.unstack(level="serum", fill_value=0)

# Mark the 0 entries with "*".
titer_matrix = titer_matrix.replace(0, "*")

# The unnamed index level left over from stack() shows up as the column
# "level_1" after reset_index(); drop it and index the matrix by virus only.
titer_matrix = titer_matrix.reset_index()
titer_matrix = titer_matrix.drop(columns="level_1")
titer_matrix = titer_matrix.set_index("virus")

# Save the matrix.
titer_matrix.to_csv(matrix_fn)

## Get 2D coordinates of virus isolates
Run the R script "Fig3a_part2_virusCoords_racmacs.R" to compute the 2D coordinates (antigenic cartography) of the virus isolates using [racmacs](https://acorg.github.io/Racmacs/).

## Then run file "Fig3a_part3_seasonal_antigenic_cartography"