# Analyze the counts of the different variants
Here we analyze the counts of different variants.

First, import Python modules:

In [1]:
import os
import pickle

import altair as alt

import Bio.SeqIO

import dms_variants.codonvarianttable
import dms_variants.utils

import pandas as pd

import yaml

In [2]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

Change working directory to top directory of repo:

In [3]:
os.chdir('../')

Read configuration:

In [4]:
with open('config.yaml') as f:
    config = yaml.safe_load(f)

Read the barcode counts and barcode runs:

In [5]:
barcode_runs = (
    pd.read_csv(config["barcode_runs"])
    .assign(antibody=lambda x: x["antibody"].fillna("no antibody"))
)

barcode_counts = pd.read_csv(config["barcode_counts"])

Read the parental gene sequence:

In [6]:
geneseq = str(Bio.SeqIO.read(config['gene_sequence_codon'], 'fasta').seq)
assert len(geneseq) % 3 == 0

Get the primary target:

In [7]:
primary_target = config["gene"]

## Create codon-variant table and add counts:
Create a [dms_variants codon-variant table](https://jbloomlab.github.io/dms_variants/codonvariant_sim_data.html):

In [8]:
variants = dms_variants.codonvarianttable.CodonVariantTable(
    barcode_variant_file=config["virus_variants_w_neut_standard"],
    geneseq=geneseq,
    substitutions_are_codon=True,
    substitutions_col="codon_substitutions",
    allowgaps=True,
    primary_target=config["gene"],
)

Now add the sample counts:

In [9]:
variants.add_sample_counts_df(barcode_counts)

Save the pickled codon-variant table:

In [10]:
with open(config["codon_variant_table_pickle"], "wb") as f:
    pickle.dump(variants, f)

## Average mutations per variant
Compute the average mutations per variant:

In [49]:
avg_muts = (
    variants.numCodonMutsByType(
        variant_type="all",
        libraries=variants.libraries,
    )
    .merge(barcode_runs, validate="many_to_one")
    .drop(columns=["num_muts_count", "antibody_concentration", "replicate", "fastq_R1", "notes"])
)

Edit dataframe to only include data for talk, with clearer labels

In [50]:
avg_muts.head()

Unnamed: 0,library,sample,mutation_type,count,number,date,virus_batch,sample_type,antibody,library_sample
0,libA,220121_1_no-antibody_control_100x_1,nonsynonymous,7789129,2.741342,220121,1,no-antibody_control,no antibody,libA_220121_1_no-antibody_control_100x_1
1,libA,220121_1_no-antibody_control_100x_1,synonymous,7789129,0.172177,220121,1,no-antibody_control,no antibody,libA_220121_1_no-antibody_control_100x_1
2,libA,220121_1_no-antibody_control_100x_1,stop,7789129,0.010285,220121,1,no-antibody_control,no antibody,libA_220121_1_no-antibody_control_100x_1
3,libA,220121_1_no-antibody_control_100x_1,deletion,7789129,3.4e-05,220121,1,no-antibody_control,no antibody,libA_220121_1_no-antibody_control_100x_1
4,libA,220121_1_no-antibody_control_100x_2_rt1,nonsynonymous,5869174,2.738756,220121,1,no-antibody_control,no antibody,libA_220121_1_no-antibody_control_100x_2_rt1


In [51]:
# edit names for virus meeting, drop to just 220224 samples

# avg_muts_reformatted = avg_muts.duplicate()

short_names = {
#     '220121_1_no-antibody_control_50x_1': 'no-Ab_1',
#     '220121_1_no-antibody_control_50x_2_rt1': 'no-Ab_2_RT-1',
#     '220121_1_no-antibody_control_50x_2_rt2': 'no-Ab_2_RT-2',
    '220224_1_antibody_AUSAB-07_0.000158_1_rt1': 'serum-IC65.00',
#     '220224_1_antibody_AUSAB-07_0.000158_1_rt2': 'serum-IC65_1_RT-2',
#     '220224_1_antibody_AUSAB-07_0.000158_2': 'serum-IC65_2',
    '220224_1_antibody_AUSAB-07_0.000395_1': 'serum-IC90.00',
#     '220224_1_antibody_AUSAB-07_0.000395_2': 'serum-IC90_2',
    '220224_1_antibody_AUSAB-07_0.00158_1': 'serum-IC99.00',
#     '220224_1_antibody_AUSAB-07_0.00158_2': 'serum-IC99_2',
    '220224_1_antibody_AUSAB-07_0.00632_1': 'serum-IC99.90',
#     '220224_1_antibody_AUSAB-07_0.00632_2': 'serum-IC99.90_2',
    '220224_1_antibody_AUSAB-07_0.0158_1': 'serum-IC99.99',
#     '220224_1_antibody_AUSAB-07_0.0158_2_rt1': 'serum-IC99.99_2_RT-1',
#     '220224_1_antibody_AUSAB-07_0.0158_2_rt2': 'serum-IC99.99_2_RT-2',
    '220224_1_no-antibody_control_670ng_1': 'no-Ab',
#     '220224_1_no-antibody_control_670ng_2': 'no-Ab_2',
    '220224_1_no-antibody_control_plasmid': 'plasmid'
}



avg_muts['sample'] = avg_muts['sample'].map(short_names)
avg_muts_reformatted = avg_muts.dropna(subset=['sample'])

avg_muts_reformatted['condition'] = avg_muts_reformatted['sample'].str.split('_').str[0]                     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [52]:
avg_muts_reformatted.head()

Unnamed: 0,library,sample,mutation_type,count,number,date,virus_batch,sample_type,antibody,library_sample,condition
48,libA,serum-IC65.00,nonsynonymous,1892557,2.722493,220224,1,antibody,AUSAB-07,libA_220224_1_antibody_AUSAB-07_0.000158_1_rt1,serum-IC65.00
49,libA,serum-IC65.00,synonymous,1892557,0.168752,220224,1,antibody,AUSAB-07,libA_220224_1_antibody_AUSAB-07_0.000158_1_rt1,serum-IC65.00
50,libA,serum-IC65.00,stop,1892557,0.010324,220224,1,antibody,AUSAB-07,libA_220224_1_antibody_AUSAB-07_0.000158_1_rt1,serum-IC65.00
51,libA,serum-IC65.00,deletion,1892557,3.1e-05,220224,1,antibody,AUSAB-07,libA_220224_1_antibody_AUSAB-07_0.000158_1_rt1,serum-IC65.00
60,libA,serum-IC90.00,nonsynonymous,2552542,2.712283,220224,1,antibody,AUSAB-07,libA_220224_1_antibody_AUSAB-07_0.000395_1,serum-IC90.00


Make an interactive chart of average mutations per variant.
You can select which groupings to show, and click on mutation type to only show that mutation type:

In [28]:
small_df = avg_muts_reformatted.loc[((avg_muts_reformatted['sample'] == 'plasmid') | (avg_muts_reformatted['sample'] == 'no-Ab_1'))]
small_df

Unnamed: 0,library,sample,mutation_type,count,number,date,virus_batch,sample_type,antibody,library_sample,condition
108,libA,no-Ab_1,nonsynonymous,2452009,2.739215,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_670ng_1,no-Ab
109,libA,no-Ab_1,synonymous,2452009,0.173318,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_670ng_1,no-Ab
110,libA,no-Ab_1,stop,2452009,0.010307,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_670ng_1,no-Ab
111,libA,no-Ab_1,deletion,2452009,4.9e-05,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_670ng_1,no-Ab
116,libA,plasmid,nonsynonymous,9795560,3.449037,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_plasmid,plasmid
117,libA,plasmid,synonymous,9795560,0.190656,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_plasmid,plasmid
118,libA,plasmid,stop,9795560,0.050777,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_plasmid,plasmid
119,libA,plasmid,deletion,9795560,0.000459,220224,1,no-antibody_control,no antibody,libA_220224_1_no-antibody_control_plasmid,plasmid


In [53]:
# mut_types_sort = ["nonsynonymous", "synonymous", "deletion", "stop"]

# selections = [
#     alt.selection_single(
#         fields=[col],
#         bind=alt.binding_select(
#             options=[None] + avg_muts_reformatted[col].unique().tolist(),
#             labels=["all"] + [str(x) for x in avg_muts_reformatted[col].unique()],
#             name=col,
#         )
#     )
#     for col in ["library", "date", "sample_type", "virus_batch", "antibody"]
# ]

mutation_type_selection = alt.selection_multi(fields=["mutation_type"], bind="legend")

avg_muts_chart = (
    alt.Chart(
        avg_muts_reformatted.assign(order=lambda x: x["mutation_type"]
                        .map(lambda m: mut_types_sort.index(m))
                        ),
    )
    .encode(
        x=alt.X(
            "number",
            title="average mutations per variant",
            axis=alt.Axis(grid=False),
        ),
        y=alt.Y(
            "sample",
            title=None,
            sort=['plasmid', 'no-Ab']
        ),
        color=alt.Color(
            "mutation_type",
            scale=alt.Scale(domain=mut_types_sort),
        ),
        order=alt.Order("order", sort="descending"),
        tooltip=[alt.Tooltip(c, format=".3g") if c in ["count", "number"] else c
                 for c in avg_muts.columns if c != "sample"],
    )
    .mark_bar()
    .properties(height=alt.Step(13), width=300)
    .configure_axis(labelLimit=500)
    .add_selection(mutation_type_selection, *selections)
    .transform_filter(mutation_type_selection)
)

for selection in selections:
    avg_muts_chart = avg_muts_chart.transform_filter(selection)

avg_muts_chart
avg_muts_chart.save('scratch_notebooks/scratch_figs/muts_per_variant.pdf')

In [22]:
mut_types_sort = ["nonsynonymous", "synonymous", "deletion", "stop"]

selections = [
    alt.selection_single(
        fields=[col],
        bind=alt.binding_select(
            options=[None] + avg_muts_reformatted[col].unique().tolist(),
            labels=["all"] + [str(x) for x in avg_muts_reformatted[col].unique()],
            name=col,
        )
    )
    for col in ["library", "date", "sample_type", "virus_batch", "antibody"]
]

mutation_type_selection = alt.selection_multi(fields=["mutation_type"], bind="legend")

avg_muts_chart = (
    alt.Chart(
        avg_muts_reformatted.assign(order=lambda x: x["mutation_type"]
                        .map(lambda m: mut_types_sort.index(m))
                        ),
    )
    .encode(
        x=alt.X(
            "number",
            title="average mutations per variant",
            axis=alt.Axis(grid=False),
        ),
        y=alt.Y(
            "sample",
            title=None,
        ),
        color=alt.Color(
            "mutation_type",
            scale=alt.Scale(domain=mut_types_sort),
        ),
        order=alt.Order("order", sort="descending"),
        tooltip=[alt.Tooltip(c, format=".3g") if c in ["count", "number"] else c
                 for c in avg_muts.columns if c != "sample"],
    )
    .mark_bar()
    .properties(height=alt.Step(13), width=300)
    .configure_axis(labelLimit=500)
    .add_selection(mutation_type_selection, *selections)
    .transform_filter(mutation_type_selection)
)

for selection in selections:
    avg_muts_chart = avg_muts_chart.transform_filter(selection)

avg_muts_chart

## Mutation frequencies along the gene
Get counts of amino-acid mutations per-site along the gene:

In [64]:
    variants.mutCounts(
        variant_type="all",
        mut_type="aa",
        libraries=variants.libraries,
    )

Unnamed: 0,library,sample,mutation,count,mutation_type,site
0,libA,220121_1_no-antibody_control_100x_1,K297I,208154,nonsynonymous,297
1,libA,220121_1_no-antibody_control_100x_1,L263H,156635,nonsynonymous,263
2,libA,220121_1_no-antibody_control_100x_1,L263S,113951,nonsynonymous,263
3,libA,220121_1_no-antibody_control_100x_1,V366M,81112,nonsynonymous,366
4,libA,220121_1_no-antibody_control_100x_1,R402S,76351,nonsynonymous,402
...,...,...,...,...,...,...
359095,libA,220224_1_no-antibody_control_plasmid,Y530R,0,nonsynonymous,530
359096,libA,220224_1_no-antibody_control_plasmid,Y530S,0,nonsynonymous,530
359097,libA,220224_1_no-antibody_control_plasmid,Y530T,0,nonsynonymous,530
359098,libA,220224_1_no-antibody_control_plasmid,Y530V,0,nonsynonymous,530


In [65]:
# read reference site numbering
site_numbering_map = pd.read_csv(config["site_numbering_map"])

site_freqs = (
    variants.mutCounts(
        variant_type="all",
        mut_type="aa",
        libraries=variants.libraries,
    )
    .query("count > 0")
    .rename(columns={"site": "sequential_site"})
    .merge(
        site_numbering_map,
        how="left",
        on="sequential_site",
        validate="many_to_one",
    )
    .assign(
        wildtype=lambda x: x["mutation"].str[0],
        mutant=lambda x: x["mutation"].str[-1],
        reference_site=lambda x: x["reference_site"].astype(str),
    )
    .assign(mutant=lambda x: x["mutant"] + "=" + x["count"].map("{:.2g}".format))
    .sort_values("count", ascending=False)
    .groupby(
        ["library", "sample", "sequential_site", "reference_site", "wildtype"],
        observed=True,
        as_index=False,
    )
    .aggregate(
        count=pd.NamedAgg("count", "sum"),
        mutants=pd.NamedAgg("mutant", ", ".join),
    )
    .merge(
        variants.n_variants_df(libraries=variants.libraries, primary_target_only=True)
        .rename(columns={"count": "n_variants"})
    )
    .assign(percent=lambda x: 100 * x["count"] / x["n_variants"])
    .merge(barcode_runs[["library", "sample", "library_sample", "sample_type"]])
    .drop(columns=["library", "sample"])
)

In [56]:
site_freqs.head()

Unnamed: 0,sequential_site,reference_site,wildtype,count,mutants,n_variants,percent,library_sample,sample_type
0,1,-19,M,3,I=3,7789129,3.9e-05,libA_220121_1_no-antibody_control_100x_1,no-antibody_control
1,3,-17,A,4,T=4,7789129,5.1e-05,libA_220121_1_no-antibody_control_100x_1,no-antibody_control
2,11,-9,A,2,S=2,7789129,2.6e-05,libA_220121_1_no-antibody_control_100x_1,no-antibody_control
3,13,-7,V,9,L=9,7789129,0.000116,libA_220121_1_no-antibody_control_100x_1,no-antibody_control
4,14,-6,A,6,S=6,7789129,7.7e-05,libA_220121_1_no-antibody_control_100x_1,no-antibody_control


In [66]:
# edit names for virus meeting, drop to just 220224 samples

# avg_muts_reformatted = avg_muts.duplicate()

short_names = {
#     '220121_1_no-antibody_control_50x_1': 'no-Ab_1',
#     '220121_1_no-antibody_control_50x_2_rt1': 'no-Ab_2_RT-1',
#     '220121_1_no-antibody_control_50x_2_rt2': 'no-Ab_2_RT-2',
    'libA_220224_1_antibody_AUSAB-07_0.000158_1_rt1': 'AUSAB-07_IC65.00',
#     '220224_1_antibody_AUSAB-07_0.000158_1_rt2': 'serum-IC65_1_RT-2',
#     '220224_1_antibody_AUSAB-07_0.000158_2': 'serum-IC65_2',
    'libA_220224_1_antibody_AUSAB-07_0.000395_1': 'AUSAB-07_IC90.00',
#     '220224_1_antibody_AUSAB-07_0.000395_2': 'serum-IC90_2',
    'libA_220224_1_antibody_AUSAB-07_0.00158_1': 'AUSAB-07_IC99.00',
#     '220224_1_antibody_AUSAB-07_0.00158_2': 'serum-IC99_2',
    'libA_220224_1_antibody_AUSAB-07_0.00632_1': 'AUSAB-07_IC99.90',
#     '220224_1_antibody_AUSAB-07_0.00632_2': 'serum-IC99.90_2',
    'libA_220224_1_antibody_AUSAB-07_0.0158_1': 'AUSAB-07_IC99.99',
#     '220224_1_antibody_AUSAB-07_0.0158_2_rt1': 'serum-IC99.99_2_RT-1',
#     '220224_1_antibody_AUSAB-07_0.0158_2_rt2': 'serum-IC99.99_2_RT-2',
    'libA_220224_1_no-antibody_control_670ng_1': 'no-Ab_infection',
#     '220224_1_no-antibody_control_670ng_2': 'no-Ab_2',
    'libA_220224_1_no-antibody_control_plasmid': 'plasmid'
}



site_freqs['library_sample'] = site_freqs['library_sample'].map(short_names)
site_freqs = site_freqs.dropna(subset=['library_sample'])

# d['condition'] = avg_muts_reformatted['sample'].str.split('_').str[0]  

Now make chart.
You can mouse over points to see details.
Note sites are numbered on the x-axis in sequential numbering, but the mouseovers also show the reference-based numbering:

In [70]:
zoom_brush = alt.selection_interval(
    encodings=['x'], mark=alt.BrushConfig(stroke='black', strokeWidth=2),
)
zoom_bar = (
    alt.Chart(site_freqs[["sequential_site"]].drop_duplicates())
    .mark_rect(color='lightgrey')
    .encode(x=alt.X("sequential_site", title=None, scale=alt.Scale(nice=False, zero=False)))
    .add_selection(zoom_brush)
    .properties(width=500, height=15, title='site zoom bar')
)

site_freqs_base = (
    alt.Chart()
    .encode(
        x=alt.X("sequential_site", scale=alt.Scale(nice=False, zero=False)),
        y=alt.Y('percent', title='% variants with mutation'),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c in {"percent", "count", "n_variants"} else c
            for c in site_freqs.columns
        ],
        color="sample_type",
    )
    .properties(height=125, width=375)
)

site_freqs_chart = (
    alt.layer(
        site_freqs_base.mark_point(filled=True, size=25),
        site_freqs_base.mark_line(size=0.4),
        data=site_freqs,
    )
    .add_selection(zoom_brush)
    .facet(facet=alt.Facet("library_sample", title=None, sort=['plasmid', 'no-Ab_infection']), columns=2)
    .transform_filter(zoom_brush)
)
    
site_freqs_chart = site_freqs_chart

site_freqs_zoom_chart = (zoom_bar & site_freqs_chart).configure_axis(grid=False)

site_freqs_zoom_chart.save('scratch_notebooks/scratch_figs/AUSAB-07_selection.pdf')

## To what extent do individual variants dominate counts?
We look to see how much individual variants dominate the counts, only looking at the primary target (not any neutralization standards):

In [None]:
variant_counts = (
    variants.variant_count_df
    .query("target == @primary_target")
    [["library", "sample", "barcode", "count", "aa_substitutions"]]
    .merge(barcode_runs.drop(columns=["fastq_R1", "notes", "antibody_concentration"]))
    .assign(percent=lambda x: 100 * x["count"] / x.groupby("library_sample")["count"].transform("sum"))
    .sort_values(["library", "sample", "count"], ascending=[True, True, False])
)

Get the top 25 variants, and the 10th and 90th percentiles:

In [None]:
top_n = 25

variant_counts_top_n = (
    variant_counts
    .groupby("library_sample")
    .head(n=top_n)
    .merge(
        (
            variant_counts
            .groupby("library_sample", as_index=False)
            .aggregate(
                percentile_10=pd.NamedAgg("percent", lambda s: s.quantile(0.1)),
                percentile_90=pd.NamedAgg("percent", lambda s: s.quantile(0.9)),
                min_percent=pd.NamedAgg("percent", "min"),
                max_percent=pd.NamedAgg("percent", "max"),
            )
        ),
        validate="many_to_one",
    )
)

Now plot these data.
The points show the top variants, and can be moused over for details.
The bars show the 10th to 90th percentiles, and the lines the full span.
Note that these are the **percent** frequencies of variants, not the fractional frequencies:

In [None]:
variant_selector = alt.selection_single(
    on="mouseover",
    empty="none",
    fields=["barcode", "library"],
)

variant_counts_top_n_base = (
    alt.Chart(variant_counts_top_n)
    .encode(
        y=alt.Y("library_sample", title=None),
        color=alt.Color(
            "sample_type",
            scale=alt.Scale(domain=variant_counts_top_n["sample_type"].unique()),
        ),
    )
    .mark_point(filled=True)
    .properties(height=alt.Step(15), width=275)
)

variant_counts_top_n_points = (
    variant_counts_top_n_base
    .encode(
        x=alt.X("percent", title="percent of library"),
        tooltip=[
            alt.Tooltip(c, format=".3g") if c.startswith("percent") else c
            for c in variant_counts_top_n.columns
            if c not in ["library_sample", "min_percent", "max_percent"]
        ],
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.5)),
        strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
        size=alt.condition(variant_selector, alt.value(40), alt.value(25)),
    )
    .mark_point(filled=True, stroke="black")
)

variant_counts_top_n_bars = (
    variant_counts_top_n_base
    .encode(
        alt.X("percentile_10"),
        alt.X2("percentile_90"),
    )
    .mark_bar(size=11)
)

variant_counts_top_n_rule = (
    variant_counts_top_n_base
    .encode(
        alt.X("min_percent"),
        alt.X2("max_percent"),
    )
    .mark_rule()
)

variant_counts_top_n_chart = (
    (variant_counts_top_n_points + variant_counts_top_n_bars + variant_counts_top_n_rule)
    .configure_axis(labelLimit=500)
    .add_selection(variant_selector, *selections)
)
for selection in selections:
    variant_counts_top_n_chart = variant_counts_top_n_chart.transform_filter(selection)

variant_counts_top_n_chart

## Pairwise correlation between variant counts
Compute the pairwise correlations between variant counts for each library:

In [None]:
corrs = (
    dms_variants.utils.tidy_to_corr(
        df=variant_counts,
        sample_col="sample",
        label_col="barcode",
        value_col="count",
        group_cols=["library"],
    )
    .assign(r2=lambda x: x["correlation"]**2)
    .drop(columns="correlation")
    .sort_values(["library", "sample_1", "sample_2"])
)

# add other properties and flag samples that share these
suffixes = ["_1", "_2"]
cols_to_add =  ["antibody", "virus_batch", "sample_type", "date"]
for suffix in suffixes:
    corrs = (
        corrs.merge(
            barcode_runs[["library", "sample", *cols_to_add]],
            left_on=["library", f"sample{suffix}"],
            right_on=["library", "sample"],
            validate="many_to_one",
            suffixes=suffixes,
        )
        .drop(columns="sample")
    )
for col in cols_to_add:
    corrs = (
        corrs
        .assign(
            equal=lambda x: x[f"{col}_1"] == x[f"{col}_2"],
            **{col: lambda x: x[f"{col}_1"].where(x["equal"], pd.NA)},
        )
        .drop(columns=[f"{col}{suffix}" for suffix in suffixes])
    )

In [None]:
corr_charts = []
for library, library_corr in corrs.groupby("library"):
    corr_chart = (
        alt.Chart(library_corr)
        .encode(
            alt.X("sample_1", title=None),
            alt.Y("sample_2", title=None),
            color=alt.Color("r2", scale=alt.Scale(zero=True)),
            tooltip=[
                "library",
                "sample_1",
                "sample_2",
                alt.Tooltip("r2", format=".3f")
            ],
        )
        .mark_rect(stroke="black")
        .properties(width=alt.Step(15), height=alt.Step(15), title=library)
        .add_selection(*selections)
    )
    for selection in selections:
        corr_chart = corr_chart.transform_filter(selection)
    corr_charts.append(corr_chart)

corr_chart = (
    alt.vconcat(*corr_charts)
    .configure_axis(labelLimit=500)

)

corr_chart

Try plotting on a log scale, to better resolve differences in pairwise correlations.

In [None]:
corr_charts = []
for library, library_corr in corrs.groupby("library"):
    corr_chart = (
        alt.Chart(library_corr)
        .encode(
            alt.X("sample_1", title=None),
            alt.Y("sample_2", title=None),
            color=alt.Color("r2", scale=alt.Scale(type='log')),
            tooltip=[
                "library",
                "sample_1",
                "sample_2",
                alt.Tooltip("r2", format=".3f")
            ],
        )
        .mark_rect(stroke="black")
        .properties(width=alt.Step(15), height=alt.Step(15), title=library)
        .add_selection(*selections)
    )
    for selection in selections:
        corr_chart = corr_chart.transform_filter(selection)
    corr_charts.append(corr_chart)

corr_chart = (
    alt.vconcat(*corr_charts)
    .configure_axis(labelLimit=500)

)

corr_chart