# Multidimensional scaling of serum escape profiles across age groups

In [1]:
import altair as alt

import pandas as pd

import itertools

import numpy

import sklearn.manifold

import warnings
warnings.filterwarnings('ignore')

from IPython.utils import io

In [2]:
import os

# All data paths below are relative to the directory two levels above this
# notebook. Only change directory when the data directory is not already
# reachable, so that re-running this cell does not climb a further two levels
# and break every subsequent relative read.
if not os.path.isdir('results/antibody_escape'):
    os.chdir('../../')

### Read beta and IC90 values
For this analysis, I'm using the escape scores from models averaged between libA and libB. We have data on the mean, median, and std deviation for beta and IC90 values between both models. I'm just working with the mean scores for now, and analyzing both beta and IC90 in case these yield different results. We also aggregate sitewise scores as a sum of AA-level escape scores.

In [3]:
# define samples in each age cohort
ped_sera = [2367, 3944, 2462, 2389, 2323, 2388, 2463, 3973, 4299, 4584]
teen_sera = [2343, 2350, 2365, 2380, 2382, 3866, 3856, 3857, 3862, 3895]
adult_sera = ['33C', '34C', '197C', '199C', '215C', '210C', '74C', '68C', '150C', '18C']

# get list of lists for samples divided by age group; `age_cohorts` is
# positionally matched to `serum_lists`
serum_lists = [ped_sera, teen_sera, adult_sera]
age_cohorts = ['0-5', '15-18', '40-45']
assert len(serum_lists) == len(age_cohorts), "one cohort label is needed per serum list"

# adjust this if we want more stringent filtering
min_times_seen = 5

df_list = []

# pair each cohort label with its sera directly rather than tracking a manual
# counter (and avoid naming the loop variable `list`, which shadows the builtin
# for the rest of the notebook)
for age_cohort, cohort_sera in zip(age_cohorts, serum_lists):
    for serum in cohort_sera:
        # reading in values from averaged libA and libB models
        avg_beta_df = pd.read_csv(f'results/antibody_escape/{serum}_avg.csv'
                                 ).query(f"`times_seen` >= {min_times_seen}")
        avg_ic90_df = pd.read_csv(f'results/antibody_escape/{serum}_icXX_avg.csv'
                                 ).query(f"`times_seen` >= {min_times_seen}")

        # get both ic90 and beta in same df, one row per (site, wildtype, mutant)
        full_df = (
            avg_beta_df
            .merge(avg_ic90_df,
                   how='left',
                   on=['site', 'wildtype', 'mutant'])
            [['site', 'wildtype', 'mutant', 'escape_mean',
              'log2 fold change IC90 mean']]
            .rename(columns={'log2 fold change IC90 mean': 'ic90_mean',
                             'escape_mean': 'beta_mean'})
            # ped / teen sera are defined as ints above, so store every serum name as str
            .assign(serum=str(serum), age_cohort=age_cohort)
        )

        # also get summed site scores to check AA-level vs site-level metrics
        full_df['sitewise_beta'] = full_df.groupby('site')['beta_mean'].transform('sum')
        full_df['sitewise_ic90'] = full_df.groupby('site')['ic90_mean'].transform('sum')

        df_list.append(full_df)

# concat to final df
escape_df = pd.concat(df_list).reset_index(drop=True)

escape_df.head()

Unnamed: 0,site,wildtype,mutant,beta_mean,ic90_mean,serum,age_cohort,sitewise_beta,sitewise_ic90
0,-2,D,Y,-0.0881,-0.127,2367,0-5,-0.0881,-0.127
1,1,Q,R,-0.0979,-0.1413,2367,0-5,-0.0979,-0.1413
2,2,K,N,0.0303,0.0437,2367,0-5,0.0303,0.0437
3,3,I,A,0.0382,0.0551,2367,0-5,-0.5305,-0.7658
4,3,I,D,0.065,0.0938,2367,0-5,-0.5305,-0.7658


### Calculate similarities between serum profiles, then convert to dissimilarities
We need to compute the similarity between all pairs of escape profiles in this data frame. Similarity is calculated as the dot product of the escape profiles for each pair of sera, testing both the mutation-level and site-level metrics. Each profile is normalized such that its dot product with itself is one.

The escape metric can also be raised to the power *p* to emphasize sites with large values; the default is *p* = 1. Note that even values of *p* discard the sign of the escape score, whereas odd values preserve it.

In [4]:
def escape_similarity(df, escape_metric='ic90', site_or_aa='site', p=1):
    """Compute the similarity between the escape profiles of all pairs of sera.

    Each serum's profile is the vector of its escape values (raised to the
    power `p`) over all sites or all (site, mutant) pairs, with entries that
    are missing for a serum filled with 0. Every profile is scaled to unit
    Euclidean length, so the similarity of two sera is the dot product of
    their profiles (cosine similarity) and each serum has similarity 1 with
    itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'serum' column and a 'site' column, plus
        'sitewise_<escape_metric>' when `site_or_aa` is 'site', or 'mutant'
        and '<escape_metric>_mean' when `site_or_aa` is 'aa'.
    escape_metric : str
        Name used to build the metric column, e.g. 'ic90' or 'beta'.
    site_or_aa : {'site', 'aa'}
        Compare site-level summed scores or mutation-level scores.
    p : number
        Power applied to each escape value before normalization.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by serum, in the order the sera
        first appear in `df`.

    Raises
    ------
    ValueError
        If `site_or_aa` is neither 'site' nor 'aa'.
    """
    sera = df['serum'].unique()

    if site_or_aa == 'site':
        metric_column = 'sitewise_' + escape_metric
        index_columns = 'site'
        # the sitewise value is repeated on every mutation row of a site, so
        # collapse to one row per (serum, site)
        profile_source = df[['serum', 'site', metric_column]].drop_duplicates()
    elif site_or_aa == 'aa':
        metric_column = escape_metric + '_mean'
        index_columns = ['site', 'mutant']
        profile_source = df
    else:
        raise ValueError(
            f"`site_or_aa` should be either 'site' or 'aa', not {site_or_aa!r}"
        )

    pivoted_df = (
        profile_source
        .assign(metric=lambda x: x[metric_column]**p)
        .pivot_table(index=index_columns,
                     columns='serum',
                     values='metric',
                     fill_value=0)
        # scale each serum's column to unit Euclidean (L2) norm
        .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
    )

    # order the columns like `sera` (pivot_table sorts them) so that the rows
    # and columns of the result line up with the labels attached below
    profiles = pivoted_df[list(sera)].to_numpy(dtype=float)

    # an all-zero profile has zero norm and turns into NaN when normalized
    assert not numpy.isnan(profiles).any(), "normalized escape profiles contain NaN"

    # all pairwise dot products at once: entry (i, j) is profile_i . profile_j
    similarity_matrix = profiles.T @ profiles

    return pd.DataFrame(similarity_matrix, columns=sera, index=sera)

Define function to compute dissimilarity $d$ from the similarity $s$. Options are:
* **one_minus:** $d = 1-s$
* **minus_log:** $d = -\ln(s)$

In [5]:
def dissimilarity(similarity, method='one_minus'):
    """Convert a similarity `s` into a dissimilarity `d`.

    `method` selects the conversion: 'one_minus' gives d = 1 - s and
    'minus_log' gives d = -ln(s). Any other value raises `ValueError`.
    """
    if method == 'one_minus':
        converted = 1 - similarity
        return converted

    if method == 'minus_log':
        log_similarity = numpy.log(similarity)
        return -log_similarity

    raise ValueError(f"invalid `method` {method}")

### Run multidimensional scaling and plot results

Set up a function to compute similarities and dissimilarities, then run MDS [as described here](https://scikit-learn.org/stable/auto_examples/manifold/plot_mds.html#sphx-glr-auto-examples-manifold-plot-mds-py). I'm collapsing these computations and the plotting into a single function for now so that it's easier to test different parameters.

In [6]:
def mds_and_plot(df,
                 escape_metric='ic90',
                 site_or_aa='site',
                 p=1,
                 dissimilarity_method='one_minus',
                 mds_random_state=1
                ):
    """Project serum escape profiles into 2D with MDS and return a scatter plot.

    Pairwise similarities are computed with `escape_similarity`, converted to
    dissimilarities with `dissimilarity`, and embedded in two dimensions with
    metric MDS on the precomputed dissimilarity matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Escape data with 'serum' and 'age_cohort' columns plus the columns
        required by `escape_similarity`.
    escape_metric, site_or_aa, p
        Passed through to `escape_similarity`.
    dissimilarity_method : str
        Passed to `dissimilarity` as `method`.
    mds_random_state : int
        Seed for the MDS initialization.

    Returns
    -------
    altair.Chart
        One circle per serum, colored by age cohort.

    Raises
    ------
    ValueError
        If any dissimilarity is NaN or infinite.
    """
    # compute similarities and dissimilarities, and get full list of sera
    similarities = escape_similarity(df, escape_metric, site_or_aa, p)
    # `dissimilarity` only uses arithmetic and numpy.log, so it can be applied
    # to the whole frame at once (no element-wise `applymap`, which is deprecated)
    dissimilarities = dissimilarity(similarities, method=dissimilarity_method)
    # take the serum order from the matrix itself so coordinates and labels cannot drift apart
    sera = list(similarities.index)

    # fail early with an explanation instead of sklearn's generic input error
    if not numpy.isfinite(dissimilarities.to_numpy(dtype=float)).all():
        raise ValueError(
            "dissimilarities contain NaN or infinite values "
            f"(dissimilarity_method={dissimilarity_method!r}, p={p}); "
            "with 'minus_log' this happens when a similarity is zero or negative"
        )

    # use MDS to project dissimilarities into 2D space, and get array of serum profile coordinates
    mds = sklearn.manifold.MDS(n_components=2,
                               metric=True,
                               max_iter=3000,
                               eps=1e-6,
                               random_state=mds_random_state,
                               dissimilarity='precomputed',
                               n_jobs=1)
    locs = mds.fit_transform(dissimilarities)

    # convert to pandas df with serum names
    locs_df = pd.DataFrame({'serum': sera, 'x_coord': locs[:, 0], 'y_coord': locs[:, 1]})

    # one row per serum for the age cohort mapping; taking the first row is
    # deterministic, unlike drawing a random row per group
    age_cohort_df = df[['serum', 'age_cohort']].drop_duplicates(subset='serum')

    # add age cohort column
    locs_df = locs_df.merge(age_cohort_df,
                            how='left',
                            on='serum',
                           )

    # visualize with altair
    mds_plot = (
        alt.Chart(locs_df)
        .encode(
            x=alt.X("x_coord",
                    scale=alt.Scale(padding=5),
                   ),
            y=alt.Y("y_coord",
                    scale=alt.Scale(padding=5),
                   ),
            tooltip=['serum', 'age_cohort'],
            color=alt.Color('age_cohort:N'
                           ).scale(scheme='set2'),
            detail='serum',
        )
        .mark_circle(size=200, opacity=0.7)
        .configure_axis(
            grid=False,
            title=None,
            labelFontSize=12
        )
        .configure_legend(
            titleFontSize=15,
            labelFontSize=13
        )
    )

    return mds_plot

### ic90 and beta return identical plots

In [7]:
# IC90-based profiles; every other argument is left at its default.
mds_and_plot(
    escape_df,
    escape_metric='ic90',
)

In [8]:
# Beta-based profiles; every other argument is left at its default.
mds_and_plot(
    escape_df,
    escape_metric='beta',
)

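There doesn't appear to be a difference between IC90 and beta escape scores for MDS. This is consistent with previous analysis, where I found that beta values were consistently 70% of IC90 values: because each profile is normalized to unit length before similarities are computed, a constant scale factor between the two metrics cancels out. So for all other tests I'm just moving forward with IC90.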

## emphasizing sites with large values helps separate adults from children
Here we see that adults start to separate when p=2 and we're comparing by sites, or when p=3 and we're analyzing AA-level escape.

### Sitewise analysis, increasing p

In [9]:
# Site-level profiles, p=1, d = 1 - s, MDS seed 7.
mds_and_plot(
    escape_df,
    site_or_aa='site',
    p=1,
    dissimilarity_method='one_minus',
    mds_random_state=7,
)

In [10]:
# Site-level profiles, p=2, d = 1 - s, MDS seed 7.
mds_and_plot(
    escape_df,
    site_or_aa='site',
    p=2,
    dissimilarity_method='one_minus',
    mds_random_state=7,
)

In [11]:
# Site-level profiles, p=3, d = 1 - s, MDS seed 7.
mds_and_plot(
    escape_df,
    site_or_aa='site',
    p=3,
    dissimilarity_method='one_minus',
    mds_random_state=7,
)

### AA-level analysis, increasing p

In [12]:
# Mutation-level (AA) profiles, p=1, d = 1 - s, MDS seed 8.
mds_and_plot(
    escape_df,
    site_or_aa='aa',
    p=1,
    dissimilarity_method='one_minus',
    mds_random_state=8,
)

In [13]:
# Mutation-level (AA) profiles, p=2, d = 1 - s, MDS seed 8.
mds_and_plot(
    escape_df,
    site_or_aa='aa',
    p=2,
    dissimilarity_method='one_minus',
    mds_random_state=8,
)

In [14]:
# Mutation-level (AA) profiles, p=3, d = 1 - s, MDS seed 8.
mds_and_plot(
    escape_df,
    site_or_aa='aa',
    p=3,
    dissimilarity_method='one_minus',
    mds_random_state=8,
)

### Compare AA and site with increased p:

In [15]:
# Same arguments as the site-level p=2 plot in the previous section,
# repeated here for comparison with the AA-level plot in the next cell.
mds_and_plot(
    escape_df,
    site_or_aa='site',
    p=2,
    dissimilarity_method='one_minus',
    mds_random_state=7,
)

In [16]:
# Same arguments as the AA-level p=3 plot in the previous section,
# repeated here for comparison with the site-level plot in the previous cell.
mds_and_plot(
    escape_df,
    site_or_aa='aa',
    p=3,
    dissimilarity_method='one_minus',
    mds_random_state=8,
)

If we analyze on sites, we start to get 2-3 distinct populations (upping to p=3 or higher makes that upper left set of points separate more obviously into 2 clusters). One population is a mix of teenagers and young children, plus 4/10 adults. The other 6 adults separate out into their own cluster. There are also a few outliers in the young children, which we were expecting based on escape profiles.

For AA-level analysis, it looks more like increasing heterogeneity with age. The young children are mostly clustered together in the lower right, whereas teenagers and adults have a much wider spread of points.

## profiles cluster more tightly with minus_log as dissimilarity method
I'm not clear on how to interpret the difference in results from one_minus vs minus_log, but:

In [17]:
# Site-level profiles, p=1, d = -ln(s); the MDS seed is left at the function default.
mds_and_plot(
    escape_df,
    site_or_aa='site',
    p=1,
    dissimilarity_method='minus_log',
)

In [18]:
# Site-level profiles, p=2, d = -ln(s); the MDS seed is left at the function default.
mds_and_plot(
    escape_df,
    site_or_aa='site',
    p=2,
    dissimilarity_method='minus_log',
)

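With $d=-\ln(s)$ there is a limit on how large *p* can be: trying `p=3` returns `ValueError: Input contains NaN, infinity or a value too large for dtype('float64').` This is presumably because some pairwise similarities become zero or negative at `p=3` (odd powers keep the sign of negative escape scores), and $-\ln(s)$ is undefined for $s \le 0$ (TODO: confirm).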

In [19]:
# Mutation-level (AA) profiles, p=1, d = -ln(s); the MDS seed is left at the function default.
mds_and_plot(
    escape_df,
    site_or_aa='aa',
    p=1,
    dissimilarity_method='minus_log',
)

In [20]:
# Mutation-level (AA) profiles, p=2, d = -ln(s); the MDS seed is left at the function default.
mds_and_plot(
    escape_df,
    site_or_aa='aa',
    p=2,
    dissimilarity_method='minus_log',
)

## Scratch code (commented out; not executed)

In [21]:
# def escape_similarity_aa(df, metric_column, p=1):
#     """Compute similarity between all pairs of conditions in `df`."""
#     df = df[['serum', 'site', 'mutant', metric_column]].drop_duplicates()
    
#     sera = df['serum'].unique()
#     similarities = []
#     pivoted_df = (
#         df.assign(metric=lambda x: x[metric_column]**p)
#         .pivot_table(index=['site', 'mutant'],
#                      columns='serum',
#                      values='metric',
#                      fill_value=0)
#         # normalize such that each value is between 0 and 1
#         .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
#     )

#     for ser1, ser2 in itertools.product(sera, sera):
#         similarity = (
#             pivoted_df
#             .assign(similarity=lambda x: x[ser1] * x[ser2])
#             ['similarity']
#         )
#         assert similarity.notnull().all()
#         similarities.append(similarity.sum())
        
#     return pd.DataFrame(numpy
#                         .array(similarities)
#                         .reshape(len(sera), len(sera)),
#                         columns=sera, index=sera)

In [22]:
# def escape_similarity_site(df, metric_column, p=1):
#     """Compute similarity between all pairs of conditions in `df`."""
#     df = df[['serum', 'site', metric_column]].drop_duplicates()
    
#     sera = df['serum'].unique()
#     similarities = []
#     pivoted_df = (
#         df.assign(metric=lambda x: x[metric_column]**p)
#         .pivot_table(index='site',
#                      columns='serum',
#                      values='metric',
#                      fill_value=0)
#         # normalize such that each value is between 0 and 1
#         .transform(lambda x: x / numpy.linalg.norm(x, axis=0))
#     )

#     for ser1, ser2 in itertools.product(sera, sera):
#         similarity = (
#             pivoted_df
#             .assign(similarity=lambda x: x[ser1] * x[ser2])
#             ['similarity']
#         )
#         assert similarity.notnull().all()
#         similarities.append(similarity.sum())
        
#     return pd.DataFrame(numpy
#                         .array(similarities)
#                         .reshape(len(sera), len(sera)),
#                         columns=sera, index=sera)

In [23]:
# dissimilarity_method = 'one_minus'

# similarities = escape_similarity_aa(full_df, 'sitewise_beta')
# dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method=dissimilarity_method))
# sera = full_df['serum'].unique()
# n = len(sera)

# mds = sklearn.manifold.MDS(n_components=2,
#                            metric=True,
#                            max_iter=3000,
#                            eps=1e-6,
#                            random_state=5, 
#                            dissimilarity='precomputed',
#                            n_jobs=1)
# locs = mds.fit_transform(dissimilarities)

In [24]:
# def draw_pie(dist, xpos, ypos, size, ax, colors, alpha, circle_color):
#     """Based on this: https://stackoverflow.com/q/56337732"""
#     # for incremental pie slices
#     cumsum = numpy.cumsum(dist)
#     cumsum = cumsum / cumsum[-1]
#     pie = [0] + cumsum.tolist()

#     assert len(colors) == len(dist)
#     for r1, r2, color in zip(pie[:-1], pie[1:], colors):
#         angles = numpy.linspace(2 * numpy.pi * r1, 2 * numpy.pi * r2)
#         x = [0] + numpy.cos(angles).tolist()
#         y = [0] + numpy.sin(angles).tolist()

#         xy = numpy.column_stack([x, y])

#         ax.scatter([xpos], [ypos], marker=xy, s=size, facecolors=color, alpha=alpha, edgecolors='none')
#         ax.scatter(xpos, ypos, marker='o', s=size, edgecolors=circle_color,
#                    facecolors='none', alpha=alpha)

#     return ax

# color_scheme_df = (full_df
#                    .groupby(['serum', 'age_cohort'], group_keys=False)
#                    .apply(lambda df: df.sample(1))
#                    .sort_index()
#                    [['serum', 'age_cohort']]
#                   )

# color_dict = {
#     '0-5': '#E69F00',
#     '15-18': '#56B4E9',
#     '40-45': '#009E73'
    
# }

# color_scheme_df['serum'] = color_scheme_df['serum'].astype(str)
# color_scheme_df['color'] = color_scheme_df['age_cohort'].map(color_dict)

# dists = [[1] for serum in sera]
# serum_to_color = color_scheme_df[['serum', 'color']].set_index('serum')['color'].to_dict()
# colors = [[serum_to_color[serum]] for serum in sera]

# default_circle_color = 'none'
# default_label_color = 'black'

# circle_colors = []

# for serum in sera:
#     circle_colors.append(default_circle_color)
    
# # plot the multidimensional scaling result
# plot_size = 4
# fig, ax = plt.subplots(figsize=(plot_size, plot_size))
# xs = locs[:, 0]
# ys = locs[:, 1]
# for x, y, dist, color, circle_color in zip(xs, ys, dists, colors, circle_colors):
#     draw_pie(dist, x, y,
#              size=100,
#              ax=ax,
#              colors=color,
#              alpha=0.7,
#              circle_color=circle_color,
#              )
# ax.set_aspect('equal', adjustable='box')  # same distance on both axes
# ax.set_xticks([])  # no x-ticks
# ax.set_yticks([])  # no y-ticks
# ax.margins(0.09)  # increase padding from axes

# plt.show(fig)
# plt.close(fig)

In [25]:
# dissimilarity_method = 'one_minus'
# # dissimilarity_method = 'minus_log'

# # similarities = escape_similarity_site(full_df, 'sitewise_ic90', p=2)
# similarities = escape_similarity_aa(full_df, 'ic90_mean', p=3)
# dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method=dissimilarity_method))
# sera = full_df['serum'].unique()
# n = len(sera)

# mds = sklearn.manifold.MDS(n_components=2,
#                            metric=True,
#                            max_iter=3000,
#                            eps=1e-6,
#                            random_state=12, # this is variable between samples in example, follow up
#                            dissimilarity='precomputed',
#                            n_jobs=1)
# locs = mds.fit_transform(dissimilarities)

In [26]:
# def mds_and_plot(df,
#                  escape_metric='ic90', 
#                  site_or_aa='site', 
#                  p=1,
#                  dissimilarity_method='one_minus',
#                  mds_random_state=1
#                 ):
    
#     # compute similarities and dissimilarities, and get full list of sera
#     similarities = escape_similarity(df, escape_metric, site_or_aa, p)
#     dissimilarities = similarities.applymap(lambda x: dissimilarity(x, method=dissimilarity_method))
#     sera = df['serum'].unique()
    
#     # use MDS to project dissimilarities into 2D space, and get array of serum profile coordinates
#     mds = sklearn.manifold.MDS(n_components=2,
#                                metric=True,
#                                max_iter=3000,
#                                eps=1e-6,
#                                random_state=mds_random_state, 
#                                dissimilarity='precomputed',
#                                n_jobs=1)
#     locs = mds.fit_transform(dissimilarities)
    
    
    
    
    
    
    
#     # following plotting is pulled from RBD MAP notebook, and is overly complex for what I'm doing here
#     # will edit later:
    
#     # assign colors to age groups
#     color_scheme_df = (df
#                        .groupby(['serum', 'age_cohort'], group_keys=False)
#                        .apply(lambda df: df.sample(1))
#                        .sort_index()
#                        [['serum', 'age_cohort']]
#                       )
    
#     color_dict = {
#         '0-5': '#E69F00',
#         '15-18': '#56B4E9',
#         '40-45': '#009E73'

#     }

#     color_scheme_df['serum'] = color_scheme_df['serum'].astype(str)
#     color_scheme_df['color'] = color_scheme_df['age_cohort'].map(color_dict)

#     dists = [[1] for serum in sera]
#     serum_to_color = color_scheme_df[['serum', 'color']].set_index('serum')['color'].to_dict()
#     colors = [[serum_to_color[serum]] for serum in sera]

#     default_circle_color = 'none'
#     default_label_color = 'black'

#     circle_colors = []

#     for serum in sera:
#         circle_colors.append(default_circle_color)

#     # plot the multidimensional scaling result
#     plot_size = 4
#     fig, ax = plt.subplots(figsize=(plot_size, plot_size))
#     xs = locs[:, 0]
#     ys = locs[:, 1]
#     for x, y, dist, color, circle_color in zip(xs, ys, dists, colors, circle_colors):
#         draw_pie(dist, x, y,
#                  size=100,
#                  ax=ax,
#                  colors=color,
#                  alpha=0.7,
#                  circle_color=circle_color,
#                  )
#     ax.set_aspect('equal', adjustable='box')  # same distance on both axes
#     ax.set_xticks([])  # no x-ticks
#     ax.set_yticks([])  # no y-ticks
#     ax.margins(0.09)  # increase padding from axes

    
#     plt.show(fig)
#     plt.close(fig)   