In [1]:
import altair as alt

import pandas as pd

import itertools

import numpy

import warnings
# Install a blanket 'ignore' filter: every warning category is silenced from
# here on, including any pandas or deprecation warnings later cells may raise.
warnings.filterwarnings('ignore')

from IPython.utils import io

import glob

In [2]:
import os
# Move the working directory three levels up so that the relative paths used
# by the cells below ('results/...', 'scratch_notebooks/...') resolve.
# NOTE(review): a relative chdir is not idempotent -- re-running this cell
# climbs another three levels. Presumably the target is the repository root;
# confirm, and run this cell only once per kernel session.
os.chdir('../../../')

# HK19 DMS summary df for MDS analysis

In [3]:
# Serum identifiers for each age cohort. Pediatric and teen IDs are written as
# ints and the remaining IDs as strings.
ped_sera = [
    2367, 3944, 2462, 2389, 2323,
    2388, 3973, 4299, 4584,
]
teen_sera = [
    2350, 2365, 2380, 2382,
    3866, 3856, 3857, 3862,
]
adult_sera = [
    '33C', '34C', '197C', '199C', '215C',
    '210C', '74C', '68C', '150C', '18C',
]
# NOTE(review): 2462 is also listed in ped_sera, so this serum appears under
# two cohort labels -- confirm that is intended.
infant = ['2462']
# misc_adults = ['AUSAB-05', 'AUSAB-08', 'AUSAB-16', 'AUSAB-07', 'AUSAB-11', 'AUSAB-13']
ferrets = ['ferret_1', 'ferret_2', 'ferret_3']

# Single mapping of cohort label -> sera; the two parallel lists used further
# down are derived from it so their order always agrees.
sera_by_cohort = {
    '0-5 years': ped_sera,
    '15-18 years': teen_sera,
    '40-45 years': adult_sera,
    'infant': infant,
    'ferret': ferrets,
}
serum_lists = [*sera_by_cohort.values()]
age_cohorts = [*sera_by_cohort]

# Minimum `times_seen` a mutation needs to be kept; raise for more stringent filtering.
min_times_seen = 5

# Build one long-format frame of per-mutation escape values across all sera.
# `df_list` is (re)initialised here so re-running the cell does not accumulate
# duplicate frames.
df_list = []

# zip() truncates silently, so fail loudly if the cohort labels and the sample
# lists ever drift out of sync.
if len(serum_lists) != len(age_cohorts):
    raise ValueError(
        f'{len(serum_lists)} serum lists but {len(age_cohorts)} age cohort labels'
    )

# Pair each cohort label with its sera directly instead of tracking a manual
# index. The loop variable is deliberately not called `list`: that name
# shadowed the builtin and broke any later cell calling list(...).
for age_cohort, cohort_sera in zip(age_cohorts, serum_lists):
    for serum in cohort_sera:
        serum = str(serum)  # ped / teen sera are defined as ints

        avg_df = (
            # values from the averaged libA and libB models
            pd.read_csv(f'results/antibody_escape/{serum}_icXX_avg.csv')
            .query(f"`times_seen` >= {min_times_seen}")
            [['site', 'wildtype', 'mutant', 'log2 fold change IC90 mean']]
            .rename(columns={'log2 fold change IC90 mean': 'ic90_mean'})
            .assign(
                serum=serum,
                age_cohort=age_cohort,
                # summed and mean site scores, broadcast to every mutation row,
                # to check AA-level vs site-level metrics
                sitewise_ic90_sum=lambda df: df.groupby('site')['ic90_mean'].transform('sum'),
                sitewise_ic90_mean=lambda df: df.groupby('site')['ic90_mean'].transform('mean'),
            )
        )

        df_list.append(avg_df)

# concat to final df
escape_df = pd.concat(df_list).reset_index(drop=True)

escape_df.head()

Unnamed: 0,site,wildtype,mutant,ic90_mean,serum,age_cohort,sitewise_ic90_sum,sitewise_ic90_mean
0,-2,D,Y,-0.127,2367,0-5 years,-0.127,-0.127
1,1,Q,R,-0.1413,2367,0-5 years,-0.1413,-0.1413
2,2,K,N,0.0437,2367,0-5 years,0.0437,0.0437
3,3,I,A,0.0551,2367,0-5 years,-0.7658,-0.042544
4,3,I,D,0.0938,2367,0-5 years,-0.7658,-0.042544


In [4]:
# Persist the full HK19 escape table. The row index is written as well (the
# pandas default, spelled out here), so the file gains a leading unnamed column.
escape_df.to_csv(
    'scratch_notebooks/figure_drafts/umap_analysis/hk19_escape_df_full.csv',
    index=True,
)

### Generate escape df for just significant sites

In [5]:
# define site list
site_list = [50, 82, 103, 121, 122, 124, 131, 135, 137, 138, 145, 156, 157,
             159, 160, 186, 188, 189, 192, 193, 220, 224, 244, 276]

# Keep only mutations at the selected sites.
# NOTE(review): site_list holds ints, so this assumes the `site` column was
# parsed as integers -- confirm, otherwise nothing will match.
escape_df_filtered = escape_df[escape_df['site'].isin(site_list)]

# Write the *filtered* frame. Writing the unfiltered `escape_df` here would
# make this file a duplicate of hk19_escape_df_full.csv.
escape_df_filtered.to_csv('scratch_notebooks/figure_drafts/umap_analysis/hk19_escape_df_filt_sites.csv')

# Repeat for Perth09 data

In [6]:
# Cohort label -> serum names for the Perth09 data. Membership is tested with
# exact string equality, so the names must be spelled exactly as they appear
# in the data being annotated.
sample_dict = {
    # Vietnam, ages 2.1 - 3.5
    "vietnam_ped": [
        "age 2.1 (Vietnam)", "age 2.2 (Vietnam)", "age 2.4 (Vietnam)",
        "age 2.5 (Vietnam)", "age 2.5b (Vietnam)", "age 3.3 (Vietnam)",
        "age 3.3b (Vietnam)", "age 3.4 (Vietnam)", "age 3.5 (Vietnam)",
    ],
    # Vietnam, ages 30.5 - 33.5
    "vietnam_adult": [
        "age 30.5 (Vietnam)", "age 31.5 (Vietnam)", "age 33.5 (Vietnam)",
    ],
    # Seattle, ages 21 - 65
    "misc_adult": [
        "age 21 (Seattle)", "age 53 (Seattle)",
        "age 64 (Seattle)", "age 65 (Seattle)",
    ],
    "ferret": [
        "ferret 1 (Pitt)", "ferret 2 (Pitt)", "ferret 3 (Pitt)",
        "ferret (WHO)",
    ],
}

# Load the merged Perth09 escape table, keep the per-mutation columns, and
# attach the summed and mean escape of each (serum name, site) group to every
# row belonging to that group.
perth09_escape = (
    pd.read_csv('results/perth2009/merged_escape.csv')
    [['name', 'site', 'wildtype', 'mutant', 'escape']]
    .assign(
        site_escape_sum=lambda df: df.groupby(['name', 'site'])['escape'].transform('sum'),
        site_escape_mean=lambda df: df.groupby(['name', 'site'])['escape'].transform('mean'),
    )
)

# add cohort definition
def find_sample_type(sample_name):
    """Return the cohort label whose sample list contains `sample_name`.

    Cohorts are checked in the insertion order of `sample_dict`; the first
    match wins. Returns None when the name is not listed in any cohort.
    """
    return next(
        (cohort for cohort, members in sample_dict.items() if sample_name in members),
        None,
    )

# Label every row with its cohort (looked up from the serum name), then rename
# the `name` column to `serum`.
perth09_escape = (
    perth09_escape
    .assign(cohort=lambda df: df['name'].apply(find_sample_type))
    .rename(columns={'name': 'serum'})
)

perth09_escape.head()

Unnamed: 0,serum,site,wildtype,mutant,escape,site_escape_sum,site_escape_mean,cohort
0,age 30.5 (Vietnam),159,F,G,3.482,17.619377,0.880969,vietnam_adult
1,age 30.5 (Vietnam),159,F,N,2.451,17.619377,0.880969,vietnam_adult
2,age 30.5 (Vietnam),159,F,H,2.334,17.619377,0.880969,vietnam_adult
3,age 30.5 (Vietnam),159,F,T,1.514,17.619377,0.880969,vietnam_adult
4,age 30.5 (Vietnam),159,F,S,1.195,17.619377,0.880969,vietnam_adult


In [7]:
# Persist the full Perth09 escape table. The row index is written as well (the
# pandas default, spelled out here), so the file gains a leading unnamed column.
perth09_escape.to_csv(
    'scratch_notebooks/figure_drafts/umap_analysis/perth09_escape_df_full.csv',
    index=True,
)

In [8]:
# define site list
site_list = [50, 82, 103, 121, 122, 124, 131, 135, 137, 138, 145, 156, 157,
             159, 160, 186, 188, 189, 192, 193, 220, 224, 244, 276]

# Keep only mutations at the selected sites.
# NOTE(review): site_list holds ints, so this assumes the `site` column was
# parsed as integers -- confirm, otherwise nothing will match.
perth09_escape_filtered = perth09_escape[perth09_escape['site'].isin(site_list)]

# Write the *filtered* frame. Writing the unfiltered `perth09_escape` here
# would make this file a duplicate of perth09_escape_df_full.csv.
perth09_escape_filtered.to_csv('scratch_notebooks/figure_drafts/umap_analysis/perth09_escape_df_filt_sites.csv')