# Serum escape at key amino acids for children from 2010 and 2020 cohorts
Visualizing amino-acid-level escape scores at sites targeted by child sera against either Perth/2009 or HK/19. We focus on amino acids that were circulating at high frequency between 2010 and 2020. Results are shown in Figure 5C.

In [1]:
import altair as alt
import altair_saver

import pandas as pd

import polyclonal

import warnings
warnings.filterwarnings('ignore')

import numpy as np

from IPython.utils import io

In [2]:
import os
# Move the working directory up two levels so that the relative `results/...`
# and `figures/...` paths used in the cells below resolve (presumably this is
# the repository root -- confirm against the notebook's location).
# NOTE: this is not idempotent; re-running the cell moves up two more levels.
os.chdir('../../')

## Import escape data and format

In [3]:
# serum IDs of the pediatric cohort profiled against the HK/19 library
ped_hk19 = [3944, 2389, 2323, 2388, 3973, 4299, 4584, 2367]

# Build one tidy escape table per serum: keep mutations seen at least 5 times,
# retain only the columns plotted later, and tag each row with its serum,
# cohort, and library.
escape_dfs = []
for serum in ped_hk19:
    prob_escape = (
        pd.read_csv(f'results/antibody_escape/{serum}_avg.csv')
        .query("`times_seen` >= 5")
        .loc[:, ['site', 'wildtype', 'mutant', 'escape_mean']]
        .rename(columns={'escape_mean': 'escape'})
        .assign(serum=str(serum), cohort='2-5 years', library='hk19')
    )
    escape_dfs.append(prob_escape)

# stack the per-serum tables (row labels from the source CSVs are kept)
hk19_df = pd.concat(escape_dfs)

hk19_df.head()

Unnamed: 0,site,wildtype,mutant,escape,serum,cohort,library
12,-2,D,Y,0.0338,3944,2-5 years,hk19
19,1,Q,R,-0.0235,3944,2-5 years,hk19
22,2,K,N,-0.0178,3944,2-5 years,hk19
24,3,I,A,0.0821,3944,2-5 years,hk19
25,3,I,D,0.0703,3944,2-5 years,hk19


In [4]:
# define samples in each age cohort
# Maps a cohort label to the serum names belonging to it. The names are matched
# against the `name` column of the Perth/2009 escape table (renamed to `serum`
# below). Only the "2-4 years" group is kept for the plots in this notebook;
# the other groups are labelled and then filtered out.
sample_dict = {
    "2-4 years": [
        "age 2.1 (Vietnam)", 
        "age 2.2 (Vietnam)",
        "age 2.4 (Vietnam)",
        "age 2.5 (Vietnam)",
        "age 2.5b (Vietnam)",
        "age 3.3 (Vietnam)", 
        "age 3.3b (Vietnam)",
        "age 3.4 (Vietnam)", 
        "age 3.5 (Vietnam)",
    ],   
    "30-34 years": [
        "age 30.5 (Vietnam)",
        "age 31.5 (Vietnam)",
        "age 33.5 (Vietnam)",
    ],
    "misc_adult": [
        "age 21 (Seattle)",
        "age 53 (Seattle)",
        "age 64 (Seattle)",
        "age 65 (Seattle)",
    ],
    "ferret": [
        "ferret 1 (Pitt)",
        "ferret 2 (Pitt)",
        "ferret 3 (Pitt)",
        "ferret (WHO)",
    ]
}

# Load the merged Perth/2009 escape table, keep only the columns used below,
# and call the serum identifier `serum` to match the HK/19 table.
perth09_df = (
    pd.read_csv('results/perth2009/merged_escape.csv')
    .loc[:, ['name', 'site', 'wildtype', 'mutant', 'escape']]
    .rename(columns={'name': 'serum'})
)

# Function to convert '(HA2)X' to numeric
def convert_site_to_numeric(site, ha2_offset=329):
    """Translate an '(HA2)X' site label into a single sequential site number.

    Parameters
    ----------
    site
        Site label. Strings containing '(HA2)' are converted; anything else
        (plain site strings, ints, NaN, ...) is returned unchanged.
    ha2_offset : int
        Value added to the HA2 residue number. The default of 329 is the
        offset this notebook has always used (presumably the length of HA1 in
        the numbering scheme of the Perth/2009 data -- confirm before reuse
        with another strain).

    Returns
    -------
    int or original value
        ``X + ha2_offset`` for a parseable '(HA2)X' label, otherwise `site`
        exactly as it was passed in.
    """
    # Non-string inputs (e.g. ints when pandas parsed the column as numeric)
    # cannot carry an '(HA2)' tag; the substring test would raise TypeError.
    if not isinstance(site, str) or '(HA2)' not in site:
        return site

    try:
        return int(site.replace('(HA2)', '').strip()) + ha2_offset
    except ValueError:
        # Deliberate best-effort: leave labels that are not '(HA2)<int>' as-is.
        return site

# Rewrite the two columns in one step:
#   site   -> '(HA2)X' labels converted by convert_site_to_numeric
#   escape -> negative scores floored at 0
perth09_df = perth09_df.assign(
    site=perth09_df['site'].apply(convert_site_to_numeric),
    escape=perth09_df['escape'].clip(lower=0),
)

# add cohort label
def find_sample_type(sample_name):
    """Return the cohort label whose list in `sample_dict` contains `sample_name`.

    Cohorts are checked in the dictionary's order and the first match wins;
    `None` is returned when the name is not listed under any cohort.
    """
    return next(
        (
            cohort
            for cohort, members in sample_dict.items()
            if sample_name in members
        ),
        None,
    )

# Label every row with its cohort and library, and cast the site column to
# integers (done on the full table, before any rows are dropped).
perth09_df = perth09_df.assign(
    cohort=perth09_df['serum'].apply(find_sample_type),
    library='perth09',
    site=perth09_df['site'].astype(int),
)

# keep only the children's sera
perth09_df = perth09_df[perth09_df['cohort'].eq('2-4 years')]

perth09_df.head()

Unnamed: 0,serum,site,wildtype,mutant,escape,cohort,library
33960,age 2.1 (Vietnam),193,F,A,2.908,2-4 years,perth09
33961,age 2.1 (Vietnam),193,F,D,1.402,2-4 years,perth09
33962,age 2.1 (Vietnam),193,F,S,1.36,2-4 years,perth09
33963,age 2.1 (Vietnam),193,F,E,0.6624,2-4 years,perth09
33964,age 2.1 (Vietnam),193,F,G,0.3975,2-4 years,perth09


In [5]:
# Stack the HK/19 and Perth/09 child-serum tables into one long table; the
# `library` column tells the two apart. Row labels are not reset, so the
# combined index may contain duplicates.
df = pd.concat([hk19_df, perth09_df])

## Generate AA-level escape plots
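Note that, within each plotted site, we normalize the Perth/09 escape scores to a maximum value of 1 (dividing by the largest Perth/09 escape score at that site, taken over all sera and all mutations at the site) in order to facilitate comparison to HK/19 escape scores, which are left unscaled.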

In [6]:
# category colour range applied to the `library` colour encoding
colors = ['#7671B3', '#D86327']


def make_aa_plot(df, site, aa_list=None):
    """Plot per-serum escape scores for selected amino acids at one site.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format escape table with columns 'serum', 'site', 'wildtype',
        'mutant', 'escape', 'cohort' and 'library' ('hk19' or 'perth09').
    site
        Site to plot; rows are selected with ``df['site'] == site``.
    aa_list : list of str, optional
        Mutant amino acids to show. Each is placed at an integer x position
        equal to its index in this list (0, 1, ...). If omitted or empty, all
        mutants present at the site are shown in sorted order.

    Returns
    -------
    The Altair chart: one facet per library (ordered perth09, hk19), each
    showing jittered points coloured by library plus a horizontal rule at
    escape = 0.

    Notes
    -----
    * One wildtype row with escape 0 is added for the HK/19 library. It is
      attributed to the first HK/19 serum at the site, so it contributes a
      single point.
    * Perth/09 scores are divided by the largest Perth/09 score at this site
      (computed before `aa_list` is applied). If that maximum is missing or
      not positive, the scores are left unscaled rather than turned into NaN.
    * Jitter is drawn from NumPy's global random state, so point positions
      differ between runs unless the caller seeds it.
    """
    site_df = df.loc[df['site'] == site]

    # Add a wildtype entry (mutant == wildtype, escape = 0) for HK/19.
    hk19_rows = site_df[site_df['library'] == 'hk19']
    if hk19_rows.empty:
        wt_df = site_df.reset_index(drop=True)
    else:
        first_row = hk19_rows.iloc[0]
        wildtype_entry = pd.DataFrame([{
            'serum': first_row['serum'],
            'site': site,
            'wildtype': first_row['wildtype'],
            'mutant': first_row['wildtype'],
            'escape': 0,
            'cohort': first_row['cohort'],
            'library': 'hk19',
        }])
        wt_df = pd.concat([site_df, wildtype_entry], ignore_index=True)

    # Normalize just the Perth09 values to 1 (approximate range of HK19 values)
    normalized_df = wt_df.copy()
    is_perth09 = normalized_df['library'] == 'perth09'
    max_perth09_escape = normalized_df.loc[is_perth09, 'escape'].max()
    if pd.notna(max_perth09_escape) and max_perth09_escape > 0:
        normalized_df.loc[is_perth09, 'escape'] = (
            normalized_df.loc[is_perth09, 'escape'] / max_perth09_escape
        )

    if aa_list:
        aa_list = list(aa_list)
        normalized_df = normalized_df.loc[
            normalized_df['mutant'].isin(aa_list)
        ].copy()
    else:
        # No selection given: show every mutant present at the site.
        aa_list = sorted(normalized_df['mutant'].dropna().unique())

    # Altair cannot jitter a nominal axis directly, so give each amino acid an
    # integer x position and add Gaussian noise around it.
    mutant_to_numeric = {mutant: i for i, mutant in enumerate(aa_list)}
    normalized_df['numeric_mutant'] = normalized_df['mutant'].map(mutant_to_numeric)

    std_deviation = 0.05  # standard deviation of the jitter, in x-axis units
    normalized_df['jittered_numeric_mutant'] = (
        normalized_df['numeric_mutant']
        + np.random.normal(0, std_deviation, size=len(normalized_df))
    )

    # Pad the x domain by 0.6 beyond the first and last amino-acid position;
    # a lone amino acid is centred in [-1, 1].
    n_aa = len(aa_list)
    if n_aa > 1:
        x_domain = [-0.6, round(n_aa - 0.4, 1)]
    else:
        x_domain = [-1, 1]

    site_plot = (
        alt.Chart()
        .mark_circle(size=120, opacity=0.6)
        .encode(
            x=alt.X(
                "jittered_numeric_mutant:Q",
                title="mutant",
                # ticks sit on the integer positions; labels are the numbers
                axis=alt.Axis(
                    values=list(mutant_to_numeric.values()),
                    tickCount=n_aa,
                    labelAngle=0,
                ),
                scale=alt.Scale(domain=x_domain, clamp=True),
            ),
            y=alt.Y(
                "escape",
                title="escape score",
                scale=alt.Scale(domain=[-1.1, 1.1], clamp=True),
                axis=alt.Axis(values=[-1, -0.5, 0, 0.5, 1]),
            ),
            color=alt.Color('library:N', legend=None),
            detail=alt.Detail(['serum']),
            tooltip=['serum', 'library', 'escape'],
        )
        .properties(width=80, height=200)
    )

    # horizontal reference line at escape = 0
    zero_line = (
        alt.Chart(pd.DataFrame({'y': [0]}))
        .mark_rule(size=2, opacity=0.5, color='gray')
        .encode(y='y')
    )

    overlay = (
        alt.layer(site_plot, zero_line, data=normalized_df)
        .facet(
            facet=alt.Facet('library:N', sort=['perth09', 'hk19']),
            spacing=2,
            columns=2,
        )
        .configure_axis(grid=False, labelFontSize=14, titleFontSize=15)
        .configure_range(category=alt.RangeScheme(colors))
    )

    return overlay

### Define amino acids to plot for each cohort at each site
These will be combined to generate the plots in Figure 5C. These are plotted separately so that we can exclude plotting the wildtype amino acid for each respective library.

In [7]:
# Amino acids to plot per site, one dictionary per library.
# Keys are site numbers; values are the mutant amino acids passed to
# make_aa_plot as `aa_list`. The position of each amino acid in its list is the
# numeric x-axis position it gets in the plot (0, 1, ...).
aa_targets_perth09 = {
    159: ['Y', 'S'],
    160: ['T'],
    193: ['S', 'D'],
    50: ['K']
}

aa_targets_hk19 = {
    159: ['F', 'S'],
    160: ['K'],
    193: ['F', 'D'],
    50: ['K']
}

### Save plots for relevant AAs for each library
Because of issues with manually jittering the points, each amino acid had to be assigned a numeric x-value (0, 1, etc.), so those numbers are the x-axis labels on the plots. To avoid confusion, the HK/19 plots are saved with their amino acids, **in x-axis order**, in the file name; the Perth/09 file names omit them, and their order is the one given in `aa_targets_perth09`. The axis labels were then adjusted in Illustrator.

In [8]:
# Every plot goes to the same folder; create it up front so the first save
# does not fail on a fresh checkout.
aa_plot_dir = 'figures/sitewise_escape/figure_5/aa_plots'
os.makedirs(aa_plot_dir, exist_ok=True)

# Perth/09 amino acids: file name carries the site only.
for site, aa_list in aa_targets_perth09.items():
    aa_plot = make_aa_plot(df, site=site, aa_list=aa_list)
    aa_plot.save(f'{aa_plot_dir}/site_{site}-perth09.png', scale_factor=2.0)

# HK/19 amino acids: file name also lists the amino acids in x-axis order.
# Joining with '_' reproduces the previous names for one or two amino acids
# and also covers longer lists, which used to be skipped without any message.
for site, aa_list in aa_targets_hk19.items():
    if not aa_list:
        # nothing requested for this site; as before, no file is written
        continue

    aa_plot = make_aa_plot(df, site=site, aa_list=aa_list)
    aa_label = '_'.join(aa_list)
    aa_plot.save(f'{aa_plot_dir}/site_{site}-hk19_{aa_label}.png',
                 scale_factor=2.0)