# Correlations between DMS escape scores and validation neut assays for A/Perth/16/2009
In this notebook, we generate plots showing the correlation between DMS escape scores for selected mutations and the fold-change-IC50 values measured in neutralization assays that validate those mutations. These validations included 5 different mutations and 13 sera. Visualization of the neutralization curves and calculation of fold-change-IC50 values can be found in `../../neut_assays/validations_perth09`.

Because the A/Perth/16/2009 experiments did not include an external neutralization standard, the magnitude of escape scores varies between samples, and we cannot visualize all sera on a single correlation plot. 

Correlation plots for each individual serum are saved in `figures/validation_correlations/figure_4`. Each plot is annotated with the Pearson correlation coefficient (R) for that serum.

In [1]:
import altair as alt

import pandas as pd

import polyclonal

import warnings
warnings.filterwarnings('ignore')

import altair_saver
import scipy as sp

import numpy as np

In [2]:
import os

# Every path used below ('neut_assays/...', 'results/...', 'figures/...') is
# relative to the directory two levels above this notebook. Only move there if
# we are not already in it: an unconditional chdir climbs two more levels each
# time this cell is re-run, which silently breaks all later relative paths.
if not os.path.isdir('neut_assays'):
    os.chdir('../../')

### Read in IC50 data and DMS escape data for Perth/09 analysis

In [3]:
# Measured fold-change IC50 values from the validation neutralization assays.
# Serum identifiers are cast to strings so they can be matched against the
# serum names in the DMS escape table.
ics = (
    pd.read_csv('neut_assays/validations_perth09/ic50_fold_changes.csv')
    .assign(serum=lambda df: df['serum'].astype(str))
)

In [4]:
# get full dataset
escape_df = pd.read_csv('results/perth2009/merged_escape.csv')[['name', 'site', 'wildtype', 'mutant', 'escape']]
escape_df = escape_df.rename(columns={'name': 'serum'})

# set up dict for renaming validated samples
# (keys: names as they appear in the merged escape file; values: short names
# used for the same sera in the rest of this notebook)
sample_rename_dict = {
    "age 2.1 (Vietnam)": "age-2.1", 
    "age 2.2 (Vietnam)": 'age-2.2',
    "age 2.4 (Vietnam)": 'age-2.4',
    "age 2.5 (Vietnam)": 'age-2.5',
    "age 2.5b (Vietnam)": 'age-2.5-b',
    "age 3.3 (Vietnam)": 'age-3.3',
    "age 3.3b (Vietnam)": 'age-3.3-b',
    "age 3.4 (Vietnam)": 'age-3.4',
    "age 3.5 (Vietnam)": 'age-3.5',
    "age 30.5 (Vietnam)": 'age-30.5',
    "age 31.5 (Vietnam)": 'age-31.5',
    "age 33.5 (Vietnam)": 'age-33.5',
    "ferret 2 (Pitt)": 'ferret-Pitt2',
}

# rename sera and drop extra samples
# `.replace` leaves names that are not in the dict untouched (it does not turn
# them into NaN), so a `dropna` cannot remove the extra samples. Filter
# explicitly on the set of validated serum names instead; rows with a missing
# serum name are removed by the same filter.
escape_df['serum'] = escape_df['serum'].replace(sample_rename_dict)
validated_sera = set(sample_rename_dict.values())
escape_df = escape_df[escape_df['serum'].isin(validated_sera)].copy()

# set up dict for adding cohort label: cohort name -> sera belonging to it
sample_dict = {
    "2-4 years": [
        'age-2.1', 'age-2.2', 'age-2.4', 'age-2.5', 'age-2.5-b',
        'age-3.3', 'age-3.3-b', 'age-3.4', 'age-3.5',
    ],
    "30-33 years": ['age-30.5', 'age-31.5', 'age-33.5'],
    "ferret": ['ferret-Pitt2'],
}

# add cohort label
def find_sample_type(sample_name):
    """Return the cohort label for a serum name.

    Looks `sample_name` up in the member lists of `sample_dict` and returns
    the key of the first cohort that contains it, or None when the name is
    not listed under any cohort.
    """
    matching_cohorts = (
        cohort
        for cohort, members in sample_dict.items()
        if sample_name in members
    )
    return next(matching_cohorts, None)

# Label every row with its cohort; sera not listed in sample_dict get None.
escape_df['cohort'] = escape_df['serum'].map(find_sample_type)

# Function to convert '(HA2)X' to numeric
def convert_site_to_numeric(site, ha2_offset=329):
    """Convert an '(HA2)X' site label to a sequential integer site number.

    Parameters
    ----------
    site : str or any
        Site label. Labels containing '(HA2)' are converted; everything else
        is passed through unchanged.
    ha2_offset : int, default 329
        Value added to the HA2 position.
        NOTE(review): presumably the HA1 length in this numbering scheme, so
        that HA2 sites continue after HA1 - confirm.

    Returns
    -------
    int
        `X + ha2_offset` when `site` is a string of the form '(HA2)X' with an
        integer X.
    same as input
        `site` itself when it is not a string (e.g. already an int, or NaN),
        does not contain '(HA2)', or its remainder is not an integer.
    """
    # Non-string values (ints, NaN) cannot carry the '(HA2)' prefix; testing
    # `'(HA2)' in site` on them would raise TypeError, so pass them through.
    if not isinstance(site, str) or '(HA2)' not in site:
        return site
    try:
        return int(site.replace('(HA2)', '').strip()) + ha2_offset
    except ValueError:
        # Remainder is not an integer: keep the original label.
        return site

# Finish the escape table in one pass:
#   1. convert '(HA2)X' site labels with convert_site_to_numeric
#   2. floor escape scores at 0
#   3. build the 'variant' label as wildtype + site + mutant (e.g. F159G)
#   4. keep only the columns needed downstream
# Later keyword arguments of `assign` see the columns set by earlier ones, so
# 'variant' is built from the already-converted 'site'.
escape_df = (
    escape_df
    .assign(
        site=lambda df: df['site'].apply(convert_site_to_numeric),
        escape=lambda df: df['escape'].clip(lower=0),
        variant=lambda df: df['wildtype'] + df['site'].astype(str) + df['mutant'],
    )
    .loc[:, ['variant', 'serum', 'escape', 'cohort']]
)

escape_df.head()

Unnamed: 0,variant,serum,escape,cohort
0,F159G,age-30.5,3.482,30-33 years
1,F159N,age-30.5,2.451,30-33 years
2,F159H,age-30.5,2.334,30-33 years
3,F159T,age-30.5,1.514,30-33 years
4,F159S,age-30.5,1.195,30-33 years


### Merge IC50 and DMS escape data for validated mutations
We also add a WT entry (zero fold change, zero escape) for each serum and calculate the Pearson R coefficient for each serum.

In [5]:
# Merge model predictions with measured ICs.
# Left merge keeps every validated (variant, serum) pair from `ics`; pairs with
# no DMS measurement get NaN, which fillna turns into an escape score of 0.
corr_df = (
    ics.merge(
        escape_df,
        how="left",
        on=["variant", "serum"],
        validate="one_to_one",
    )
    .fillna(0)
)

# The blanket fillna(0) above also writes 0 into `cohort` for any pair that had
# no match in the DMS data, which would add a bogus "0" cohort to the plots.
# Re-derive the cohort from the serum name so every row carries a real label.
corr_df['cohort'] = corr_df['serum'].apply(find_sample_type)

# get ic column name (first column whose name contains 'ic')
ic_col = [col for col in corr_df.columns if 'ic' in col][0]
sera = corr_df['serum'].unique().tolist()

# One 'WT' reference row per serum (no fold change, no escape), built in a
# single constructor call. Growing a frame with concat inside a loop is
# quadratic, and starting from an empty object-dtype frame makes the resulting
# numeric dtypes depend on how the installed pandas treats empty frames.
wt_df = pd.DataFrame({
    'serum': sera,
    'variant': 'WT',
    'log2_fold_change_ic50': 0.0,
    'escape': 0.0,
})

# add cohort labels to WT df
wt_df['cohort'] = wt_df['serum'].apply(find_sample_type)

# Append the 'WT' rows to the original DataFrame
corr_df = pd.concat([corr_df, wt_df], ignore_index=True)

# add r values to df: Pearson R between measured fold change and DMS escape,
# computed separately for each serum and rounded to 3 decimals.
# NOTE: the loop variable is deliberately still named `serum`; the faceted
# plotting cell further down reads `serum` after this loop has finished.
r_dict = {}
for serum in sera:
    serum_df = corr_df.loc[corr_df['serum'] == serum]
    r, p = sp.stats.pearsonr(serum_df[ic_col], serum_df["escape"])
    r_dict[serum] = round(r, 3)

corr_df['r'] = corr_df['serum'].map(r_dict)
corr_df.head()

Unnamed: 0,serum,variant,log2_fold_change_ic50,escape,cohort,r
0,age-2.1,F193F,-0.100906,0.0,2-4 years,0.789
1,age-2.1,K189D,0.898353,2.336,2-4 years,0.789
2,age-2.1,F193D,0.606342,1.402,2-4 years,0.789
3,age-2.1,F159G,0.653439,0.1604,2-4 years,0.789
4,age-2.2,F193F,-0.038777,0.0,2-4 years,0.77


In [6]:
# get ic column name (first column whose name contains 'ic')
ic_col = [col for col in corr_df.columns if 'ic' in col][0]

# Pearson correlation between measured fold change and DMS escape, pooled
# across all sera, rounded to 3 decimals for display
x = corr_df[ic_col]
y = corr_df["escape"]
r, p = sp.stats.pearsonr(x, y)
r = round(r, 3)

# Fixed variant order so each variant always maps to the same colour
variant_order = ['WT', 'F159G', 'K189D', 'I192E', 'F193D', 'F193F']

# Define cb-friendly color scheme
custom_color_scheme = [
    '#333333', '#CC6677', '#332288', '#117733', '#88CCEE',
    '#882255', '#44AA99', '#DDCC77', '#999933', '#AA4499',
]

# Scatter of measured fold change vs DMS escape: colour = variant, shape = cohort
corrs = alt.Chart(corr_df).mark_point(filled=True, size=200).encode(
    x=alt.X(ic_col, title=ic_col, scale=alt.Scale()),
    y=alt.Y('escape', title='DMS escape score'),
    color=alt.Color(
        'variant:N',
        scale=alt.Scale(range=custom_color_scheme, domain=variant_order),
    ),
    shape=alt.Shape('cohort:N'),
    tooltip=['variant', 'serum', ic_col, 'escape'],
)

# Pooled R annotation at a fixed pixel position near the top-left corner
r_text = alt.Chart().mark_text(
    align='left',
    baseline='bottom',
    fontSize=14,
    fontWeight=400,
).encode(
    x=alt.value(10),
    y=alt.value(20),
    text=alt.value([f"R={r}"]),
)

# Layer the annotation over the scatter and apply shared styling
chart = (
    alt.layer(corrs, r_text)
    .configure_axis(grid=False, labelFontSize=14, titleFontSize=15)
    .configure_title(fontSize=21, fontWeight='normal')
)

chart

Because these experiments did not use an external neutralization standard, the magnitude of escape varies between experiments. This is likely why the correlation is relatively low when the validations are pooled: the sera cannot be analyzed all together on a single scale. We therefore chose not to include this full summary plot.

### Plot correlations for each independent serum

In [7]:
# Per-serum scatter layer: measured fold change vs DMS escape, colour = variant,
# shape = cohort. Data is attached when the layers are combined below.
# No chart title is set here. The previous `title=serum` read a loop variable
# left over from an earlier cell, i.e. one fixed serum name that does not
# describe the individual panels (and a NameError if that loop has not run).
# Each panel is already labelled with its serum by the facet header.
corrs = (
    alt.Chart()
    .mark_point(filled=True, size=100)
    .encode(
        x=alt.X(ic_col, title=ic_col),
        y=alt.Y('escape', title='DMS escape score'),
        color=alt.Color('variant:N', scale=alt.Scale(range=custom_color_scheme, domain=variant_order)),
        shape=alt.Shape('cohort:N'),
        tooltip=['variant', 'serum', ic_col, 'escape']
    )
    .properties(
        width=100,
        height=100
    )
)

# Per-serum Pearson R (the 'r' column of corr_df), drawn at a fixed pixel
# position near the bottom-right corner of each 100x100 panel.
r_text = alt.Chart().mark_text(
    align='right',
    baseline='bottom',
    fontSize=12,
    fontWeight=300
).encode(
    x=alt.value(95),
    y=alt.value(95),
    text='r:N'
)

# One panel per serum, five panels per row
faceted_corrs = alt.layer(
    corrs, r_text, data=corr_df, 
).facet(
    facet=alt.Facet(
        'serum:N',
        header=alt.Header(
            titleFontSize=20,
            labelFontSize=14,
            labelPadding=3
        )
    ),
    spacing=1,
    columns=5
).configure_axis(
    grid=False,
    labelFontSize=14,
    titleFontSize=15,
)

faceted_corrs.save(
    'figures/validation_correlations/figure_4/perth09_faceted-corrs_ic50.png',
    scale_factor=2.0
)

faceted_corrs