# Correlations between DMS escape scores and validation neut assays for A/Hong Kong/45/2019
In this notebook, we generate plots showing the correlation between DMS escape scores for selected mutations, and fold-change-IC50s from neutralization assays validating those mutations. These validations included 9 different mutations and 7 sera. Visualization of the neutralization curves and calculation of fold-change-IC50 values can be found in `../../neut_assays/validations_hk19`. 

Final plots include a single correlation plot for all analyzed sera (`figure_3`) and correlation plots for each individual serum (`figure_S7` for full set, `figure_1` for 3944 + 197C only). These plots include calculations of the Pearson R correlation coefficient.

In [1]:
import altair as alt

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import altair_saver
import scipy as sp

import numpy as np

import os

In [2]:
# Move the working directory up two levels so that the relative paths used in
# later cells ('neut_assays/...', 'results/...', 'figures/...') resolve.
# Presumably this lands in the repository root -- confirm against the repo layout.
# NOTE(review): this call is not idempotent. Re-running the cell moves up another
# two levels each time, so it must be executed exactly once per kernel session.
os.chdir('../../')

### Read in IC50-fold-change data for validated mutations
See `neut_assays/validations_hk19` for code generating these IC50-fold-change measurements from neutralization curves.

In [3]:
# Load the measured IC50 fold changes for the validated mutations.
ics_raw = pd.read_csv('neut_assays/validations_hk19/ic50_fold_changes.csv')

# The T160K-Y159N double mutant cannot be matched to a single DMS mutation,
# so it is left out of the correlation.
is_single_mutant = ics_raw['variant'].ne('T160K-Y159N')

# Keep the single mutants and store every serum name as a string so that all
# sera share one datatype.
ics = ics_raw[is_single_mutant].assign(
    serum=lambda frame: frame['serum'].astype(str)
)

### Merge with DMS escape scores for each mutation

In [4]:
# sera for which validation measurements exist
sera = ics['serum'].unique().tolist()

# load the averaged escape table of every validated serum and tag each row
# with the serum it came from
models_list = [
    pd.read_csv(f'results/antibody_escape/{serum_name}_avg.csv').assign(serum=serum_name)
    for serum_name in sera
]
models = pd.concat(models_list)

# build mutation labels (wildtype + site + mutant) that match the `variant`
# column of `ics`, then keep only the columns needed for the merge
variant_labels = models["wildtype"] + models["site"].astype(str) + models["mutant"]
models = models.assign(variant=variant_labels)[['variant', 'serum', 'escape_mean']]

# Attach the DMS escape score to every measured fold change. The left merge keeps
# all validation rows; any value left missing afterwards is replaced by 0, and the
# score column is renamed to `escape`.
corr_df = (
    ics
    .merge(models, how="left", on=["variant", "serum"], validate="one_to_one")
    .fillna(0)
    .rename(columns={'escape_mean': 'escape'})
)

Add additional data to `corr_df`:
* A WT entry for each serum with log2 fold change of 0 and escape of 0
* Age cohort for each serum
* Pearson R correlation coefficient for each serum

In [5]:
# Build one 'WT' reference row per serum, with a log2 fold change of 0 and an
# escape score of 0. All rows are created in a single constructor call instead of
# being concatenated one by one onto an empty DataFrame: concatenating with an
# empty, object-dtype frame is deprecated in pandas and can turn the numeric
# columns into object dtype, which would break the correlation below.
wt_df = pd.DataFrame({
    'serum': corr_df['serum'].unique(),
    'variant': 'WT',
    'log2_fold_change_ic50': 0.0,
    'escape': 0.0,
})

# Append the 'WT' rows to the measured data.
# NOTE: re-running this cell appends the WT rows again; run it once per kernel.
corr_df = pd.concat([corr_df, wt_df], ignore_index=True)

# manually define cohorts
cohort_dict = {
    '3944': '2-5 years',
    '4584': '2-5 years',
    '2365': '15-20 years',
    '3856': '15-20 years',
    '3857': '15-20 years',
    '74C': '40-45 years',
    '197C': '40-45 years',
}

# sera missing from `cohort_dict` get a missing cohort value
corr_df['cohort'] = corr_df['serum'].map(cohort_dict)

# add Pearson R values to df: one coefficient per serum, computed over that
# serum's validated mutations plus its WT row and rounded to 3 decimals
r_dict = {}
sera = corr_df['serum'].unique().tolist()

# the measured fold-change column is the first column whose name contains 'ic'
ic_col = [col for col in corr_df.columns if 'ic' in col][0]

for serum in sera:
    serum_df = corr_df.loc[corr_df['serum'] == serum]
    x = serum_df[ic_col]
    y = serum_df["escape"]
    r, p = sp.stats.pearsonr(x, y)
    r_dict[serum] = round(r, 3)

# every row of a serum carries that serum's coefficient
corr_df['r'] = corr_df['serum'].map(r_dict)

corr_df.head()

Unnamed: 0,serum,variant,log2_fold_change_ic50,escape,cohort,r
0,2365,Y159N,1.15878,0.3106,15-20 years,0.987
1,2365,T160K,0.08919,0.0047,15-20 years,0.987
2,2365,K189E,2.199497,0.781,15-20 years,0.987
3,2365,S193D,1.856389,0.5509,15-20 years,0.987
4,2365,S145H,-0.300784,-0.0419,15-20 years,0.987


### Plot correlation for all sera
All sera are overlaid on a single plot, and the Pearson R value is calculated for the complete set of sera + mutations as a whole.

In [6]:
# name of the measured fold-change column (first column whose name contains 'ic')
ic_col = [col for col in corr_df.columns if 'ic' in col][0]

# Pearson correlation over all sera and mutations together, rounded to 3 decimals
x = corr_df[ic_col]
y = corr_df["escape"]
r, p = sp.stats.pearsonr(x, y)
r = round(r, 3)

# fixed orderings for the color (variant) and shape (serum) legends
variant_order = [
    'WT', 'E50K', 'S145H', 'S145K', 'Y159N',
    'T160K', 'T160S', 'K189E', 'S193D', 'S193Y',
]
serum_order = ['3944', '74C', '2365', '3856', '3857', '4584', '197C']

# colorblind-friendly palette, one color per entry of `variant_order`
custom_color_scheme = [
    '#333333', '#CC6677', '#999933', '#DDCC77', '#117733',
    '#882255', '#88CCEE', '#44AA99', '#332288', '#AA4499',
]

# encoding channels of the scatter plot
fold_change_channel = alt.X(
    ic_col,
    title=ic_col,
    scale=alt.Scale(domain=[-4, 4]),
)
escape_channel = alt.Y(
    'escape',
    title='escape score',
    scale=alt.Scale(domain=[-1, 1], clamp=True),
)
variant_color = alt.Color(
    'variant:N',
    scale=alt.Scale(range=custom_color_scheme, domain=variant_order),
)
serum_shape = alt.Shape('serum:N', scale=alt.Scale(domain=serum_order))

# scatter of measured fold change (x) against DMS escape score (y)
corrs = alt.Chart(corr_df).mark_point(filled=True, size=200).encode(
    x=fold_change_channel,
    y=escape_channel,
    color=variant_color,
    shape=serum_shape,
    tooltip=['variant', 'serum', ic_col, 'escape'],
)

# text layer showing the overall R value at a fixed pixel position
r_label = f"R={r}"
r_text = (
    alt.Chart()
    .mark_text(align='left', baseline='bottom', fontSize=14, fontWeight=300)
    .encode(
        x=alt.value(10),
        y=alt.value(20),
        text=alt.value([r_label]),
    )
)

# thin gray reference lines through y=0 and x=0
rule_style = {'size': 1, 'opacity': 0.5, 'color': 'gray'}
x_axis = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(**rule_style).encode(y='y')
y_axis = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(**rule_style).encode(x='x')

# layer everything and apply the shared axis / title styling
layered = corrs + r_text + x_axis + y_axis
chart = (
    layered
    .configure_axis(grid=False, labelFontSize=14, titleFontSize=15)
    .configure_title(fontSize=21, fontWeight='normal')
)

# write the figure to disk, then display it as the cell output
chart.save(
    'figures/validation_correlations/figure_3/hk19_full-serum-corr_ic50.png',
    scale_factor=2.0,
)

chart

### Plot correlation for each independent serum

In [7]:
# define ordering for plot (here the sera are listed grouped by age cohort)
variant_order = ['WT', 'E50K', 'S145H', 'S145K', 'Y159N', 'T160K', 'T160S', 'K189E', 'S193D', 'S193Y']
serum_order = ['3944', '4584', '2365', '3856', '3857', '74C', '197C']
cohort_order = ['2-5 years', '15-20 years', '40-45 years']

# re-define `corrs` chart to adjust plot sizing for facet
# No data is bound here; it is supplied by the layered chart below.
# Deliberately no `title` property: each panel is already labelled with its serum
# by the facet header, and a title taken from a `serum` loop variable left over
# from an earlier cell would put the same, arbitrary serum name on every panel.
corrs = (
    alt.Chart()
    .mark_point(filled=True, size=100)
    .encode(
        x=alt.X(ic_col, 
                title=ic_col,
                scale=alt.Scale(
                        domain=[-4, 4],
                ),
               ),
        y=alt.Y('escape', 
                title='escape score',
                scale=alt.Scale(
                    domain=[-1, 1],
                    clamp=True
                )
               ),
        color=alt.Color('variant:N', scale=alt.Scale(range=custom_color_scheme, domain=variant_order)),
        # shape encodes the age cohort here (the serum itself is the facet)
        shape=alt.Shape('cohort:N', scale=alt.Scale(domain=cohort_order)),
        tooltip=['variant', 'serum', ic_col, 'escape']
    )
    .properties(
        width=120,
        height=120
    )
)

# re-define `r-text` using the R value defined for each serum in corr_df
# and adjust font size and positioning for faceted chart
r_text = alt.Chart().mark_text(
    align='right',
    baseline='bottom',
    fontSize=12,
    fontWeight=300
).encode(
    x=alt.value(115),
    y=alt.value(115),
    text='r:N'
)

# plot faceted chart: one panel per serum, laid out in rows of 4
faceted_corrs = alt.layer(
    corrs, r_text, x_axis, y_axis, data=corr_df, 
).facet(
    facet=alt.Facet(
        'serum:N',
        sort=serum_order,
        header=alt.Header(
            titleFontSize=20,
            labelFontSize=14,
            labelPadding=2
    )
    ), 
    spacing=3,
    columns=4
).configure_axis(
    grid=False,
    labelFontSize=14,
    titleFontSize=10,
)

# write the figure to disk, then display it as the cell output
faceted_corrs.save(
    'figures/validation_correlations/figure_S7/hk19_faceted-corrs_ic50.svg',
    scale_factor=2.0
)

faceted_corrs

Repeat to generate 3944 + 197C correlation plots for Figure 1

In [8]:
# remove neutral S193Y to minimize curves shown in Fig 1
# `.copy()` gives an independent frame so the `r` column can be replaced below
# without touching (or warning about) `corr_df`.
corr_df_fig1 = corr_df[(corr_df['serum'].isin(['3944', '197C'])) &
                       (corr_df['variant'] != 'S193Y')].copy()

# re-calculate R values without S193Y
# The coefficients must be computed from, and written to, the filtered frame:
# the `r` column copied over from `corr_df` still includes S193Y.
ic_col = [col for col in corr_df_fig1.columns if 'ic' in col][0]

r_dict_fig1 = {}
for fig1_serum in corr_df_fig1['serum'].unique().tolist():
    fig1_serum_df = corr_df_fig1.loc[corr_df_fig1['serum'] == fig1_serum]
    fig1_r, fig1_p = sp.stats.pearsonr(fig1_serum_df[ic_col], fig1_serum_df["escape"])
    r_dict_fig1[fig1_serum] = round(fig1_r, 3)

corr_df_fig1['r'] = corr_df_fig1['serum'].map(r_dict_fig1)

# generate faceted plot (reuses the `corrs`, `r_text`, `x_axis` and `y_axis`
# layers and `serum_order` defined in earlier cells)
corrs_3944_197C = alt.layer(
    corrs, r_text, x_axis, y_axis, data=corr_df_fig1, 
).facet(
    facet=alt.Facet(
        'serum:N',
        sort=serum_order,
        header=alt.Header(
            titleFontSize=20,
            labelFontSize=14,
            labelPadding=2
    )
    ), 
    spacing=3,
    columns=4
).configure_axis(
    grid=False,
    labelFontSize=14,
    titleFontSize=10,
)

# write the figure to disk, then display it as the cell output
corrs_3944_197C.save(
    'figures/validation_correlations/figure_1/hk19_corrs_3944_197C.svg',
    scale_factor=2.0
)

corrs_3944_197C