# Counts of variants

This notebook analyzes the counts of the different variants.

Import Python modules:

In [1]:
import os

import Bio.SeqIO

import altair as alt

import dms_variants.codonvarianttable

import pandas as pd

import yaml

In [2]:
# The cells below open `config.yaml` via a relative path, so the working
# directory must be the one that contains it (presumably two levels above this
# notebook -- confirm for your checkout).
# Only change directory when `config.yaml` is not already in the working
# directory: an unconditional `os.chdir('../../')` climbs two more levels every
# time this cell is re-run.
if not os.path.isfile("config.yaml"):
    os.chdir('../../')

In [3]:
# Disable Altair's max-rows limit so charts can be built from larger data frames.
# The return value is bound to `_`, presumably so the cell does not echo it.
_ = alt.data_transformers.disable_max_rows()

Get configuration information:

In [4]:
# NOTE: `config.yaml` is opened relative to the current working directory.
# When running this notebook interactively (instead of through a pipeline that
# sets the working directory), `os.chdir` to the appropriate directory first.

with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

Read information on the barcode runs:

In [5]:
# One row per barcode run, read from the path given in the config.
barcode_runs = pd.read_csv(config["processed_barcode_runs"])

# Sanity check: the number of distinct `library_sample` values must equal the
# number of rows, since later cells read one CSV per `library_sample`.
assert barcode_runs["library_sample"].nunique() == len(barcode_runs)

In [6]:
# Stack the per-sample barcode-count CSVs from both counts directories named in
# the config, tagging every row with a boolean `valid` column that records which
# directory it came from (True for `barcode_counts_dir`, False for
# `barcode_counts_invalid_dir`).
counts = pd.concat(
    [
        pd.read_csv(os.path.join(config[dir_key], f"{library_sample}.csv")).assign(
            valid=is_valid
        )
        for library_sample in barcode_runs["library_sample"]
        for dir_key, is_valid in (
            ("barcode_counts_dir", True),
            ("barcode_counts_invalid_dir", False),
        )
    ]
)

In [11]:
# Per-sample barcode fates, annotated with the barcode-run metadata.
# `valid` flags rows whose fate is exactly "valid barcode"; `not_valid` is its
# negation.
fates = (
    pd.concat(
        [
            pd.read_csv(
                os.path.join(config["barcode_fates_dir"], f"{library_sample}.csv")
            )
            for library_sample in barcode_runs["library_sample"]
        ]
    )
    .merge(barcode_runs, on=["library", "sample"], validate="many_to_one")
    .drop(columns=["fastq_R1", "notes"])
    .assign(valid=lambda df: df["fate"].eq("valid barcode"))
    .assign(not_valid=lambda df: ~df["valid"])
)

# Columns that each get a drop-down filter in the interactive charts.
selection_cols = [
    "exclude_after_counts",
    "antibody",
    "virus_batch",
    "sample_type",
    "date",
    "library",
]

# One Altair point selection per column, bound to a drop-down whose first entry
# (`None`, labelled "all") applies no filter and whose remaining entries are the
# non-null unique values of that column in `fates`.
selections = []
for col in selection_cols:
    col_values = fates[col].dropna().unique()
    dropdown = alt.binding_select(
        options=[None] + col_values.tolist(),
        labels=["all"] + [str(x) for x in col_values],
        name=col,
    )
    selections.append(alt.selection_point(fields=[col], bind=dropdown))

Get which libraries each barcode maps to:

In [7]:
# For every (barcode, target) pair, record which libraries contain the barcode:
# a comma-separated list of the library names and the number of distinct libraries.
barcodes_by_library = (
    pd.read_csv(config["codon_variants"])
    .groupby(["barcode", "target"], as_index=False)
    .agg(
        libraries_w_barcode=("library", lambda libs: ", ".join(libs.unique())),
        n_libraries_w_barcode=("library", "nunique"),
    )
)

# Tabulate how many barcodes fall into each target / library-combination group.
display(
    barcodes_by_library.groupby(["target", "libraries_w_barcode"]).agg(
        n_barcodes=("barcode", "count")
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_barcodes
target,libraries_w_barcode,Unnamed: 2_level_1
gene,libA,66580
gene,"libA, libB",1
gene,libB,85133
neut_standard,"libA, libB",26


Now look at the overall barcode counts for each sample and see how many map to the expected library or to some other library.
Having many barcodes that map to a different library can be an indication of contamination, unless there is a lot of expected overlap between the two libraries (which would be indicated in the table above):

In [8]:
# Total the barcode counts for each sample by the set of libraries the barcode
# belongs to, and express each total as a fraction of that sample's counts.
# Only barcodes present in `barcodes_by_library` survive the first (inner) merge,
# so fractions are relative to those counts.
counts_by_library = (
    counts.merge(barcodes_by_library, on="barcode", validate="many_to_one")
    .groupby(
        ["library", "sample", "libraries_w_barcode", "target", "n_libraries_w_barcode"],
        as_index=False,
    )
    .agg(n_counts=("count", "sum"))
    .assign(
        frac_counts=lambda df: df["n_counts"].div(
            df.groupby(["library", "sample"])["n_counts"].transform("sum")
        ),
    )
    # attach the barcode-run metadata (merges on the columns the frames share)
    .merge(barcode_runs)
    .assign(
        # "gene" targets are labelled by their library combination; any other
        # target keeps its own name as the category
        category=lambda df: df["target"].mask(
            df["target"] == "gene", df["libraries_w_barcode"]
        )
    )
    .drop(
        columns=[
            "fastq_R1",
            "notes",
            "antibody_concentration",
            "target",
            "libraries_w_barcode",
        ]
    )
)

Plot which libraries overall barcode counts map to for each sample:

In [12]:
# Category order for the legend and bar stacking: sorted by the number of
# libraries sharing the barcode, then by category name.
ordered_cats = (
    counts_by_library.sort_values(["n_libraries_w_barcode", "category"])["category"]
    .drop_duplicates()
    .tolist()
)

# Clicking a legend entry filters the chart to that category.
category_selection = alt.selection_point(fields=["category"], bind="legend")

# Horizontal stacked bars: one bar per `library_sample`, segments are the
# fraction of counts in each category.
counts_by_library_chart = (
    alt.Chart(
        counts_by_library.assign(
            # numeric stacking order that follows `ordered_cats`
            order=lambda df: df["category"].map(ordered_cats.index)
        )
    )
    .encode(
        x=alt.X("frac_counts", scale=alt.Scale(domain=[0, 1])),
        y=alt.Y("library_sample", title=None),
        color=alt.Color("category", scale=alt.Scale(domain=ordered_cats)),
        order="order",
        tooltip=[
            alt.Tooltip(c, format=".2g") if c in {"n_counts", "frac_counts"} else c
            for c in counts_by_library.columns
            if c not in {"library_sample"}
        ],
    )
    .mark_bar()
    .properties(width=250, height=alt.Step(15))
    .configure_axis(labelLimit=500)
    .add_params(*selections, category_selection)
)

# Apply the legend filter first, then every drop-down filter.
for selection in [category_selection, *selections]:
    counts_by_library_chart = counts_by_library_chart.transform_filter(selection)

counts_by_library_chart

In [27]:
# Preview the neutralization-standard rows for the 230312 antibody samples.
# This filters `counts_by_library` directly instead of displaying `neut_counts`,
# because `neut_counts` is first assigned in a later cell and would raise a
# NameError here on a fresh kernel.
counts_by_library.loc[
    (counts_by_library['category'] == 'neut_standard')
    & (counts_by_library['date'] == 230312)
    & (counts_by_library['sample_type'] == 'antibody')
]

Unnamed: 0,library,sample,n_libraries_w_barcode,n_counts,frac_counts,date,virus_batch,sample_type,antibody,replicate,exclude_after_counts,library_sample,category
305,libA,230312_1_antibody_2367_0.00034_1,2,4351381,0.779162,230312,1,antibody,2367.0,1,no,libA_230312_1_antibody_2367_0.00034_1,neut_standard
308,libA,230312_1_antibody_2367_0.00034_2,2,492347,0.255159,230312,1,antibody,2367.0,2,no,libA_230312_1_antibody_2367_0.00034_2,neut_standard
312,libA,230312_1_antibody_2367_0.00034_3,2,68975,0.036411,230312,1,antibody,2367.0,3,no,libA_230312_1_antibody_2367_0.00034_3,neut_standard
316,libA,230312_1_antibody_2367_0.00034_4,2,6364,0.003517,230312,1,antibody,2367.0,4,no,libA_230312_1_antibody_2367_0.00034_4,neut_standard
319,libA,230312_1_antibody_2367_0.00068_1,2,6324147,0.869239,230312,1,antibody,2367.0,1,no,libA_230312_1_antibody_2367_0.00068_1,neut_standard
322,libA,230312_1_antibody_2367_0.00068_2,2,1220178,0.395235,230312,1,antibody,2367.0,2,no,libA_230312_1_antibody_2367_0.00068_2,neut_standard
326,libA,230312_1_antibody_2367_0.00068_3,2,120789,0.063865,230312,1,antibody,2367.0,3,no,libA_230312_1_antibody_2367_0.00068_3,neut_standard
330,libA,230312_1_antibody_2367_0.00068_4,2,13213,0.006874,230312,1,antibody,2367.0,4,no,libA_230312_1_antibody_2367_0.00068_4,neut_standard
334,libA,230312_1_antibody_2367_0.00136_1,2,10963658,0.972131,230312,1,antibody,2367.0,1,no,libA_230312_1_antibody_2367_0.00136_1,neut_standard
337,libA,230312_1_antibody_2367_0.00136_2,2,4779790,0.777252,230312,1,antibody,2367.0,2,no,libA_230312_1_antibody_2367_0.00136_2,neut_standard


In [80]:
# Neutralization-standard counts for the 230312 antibody samples.
# The frame is built in a single chain with `.assign` so that no column is ever
# set on a slice of `counts_by_library` (that pattern triggers pandas'
# SettingWithCopyWarning).
neut_counts = (
    counts_by_library.loc[
        (counts_by_library['category'] == 'neut_standard')
        & (counts_by_library['date'] == 230312)
        & (counts_by_library['sample_type'] == 'antibody')
    ]
    .assign(
        # sample names look like `230312_1_antibody_2367_0.00034_1`; the fifth
        # `_`-separated field is the selection concentration
        selection_concentration=lambda df: df['sample']
        .apply(lambda name: name.split('_')[4])
        .astype(float),
        neut_std='spike-in',
    )
    .loc[:, ['frac_counts', 'replicate', 'selection_concentration', 'neut_std']]
)

neut_counts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 305 to 359
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   frac_counts              16 non-null     float64
 1   replicate                16 non-null     int64  
 2   selection_concentration  16 non-null     float64
 3   neut_std                 16 non-null     object 
dtypes: float64(2), int64(1), object(1)
memory usage: 640.0+ bytes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [81]:
# Hard-coded H6 neutralization-standard values for the 2367 serum selections:
# fraction of counts from the standard at each selection concentration.
h6_2367_frac_counts = [0.061, 0.19, 0.60, 0.79]
h6_2367_ab_conc = [0.00034, 0.00068, 0.00136, 0.00272]

# Same column layout as the spike-in table so the two can be concatenated.
h6_2367 = pd.DataFrame(
    {
        'selection_concentration': h6_2367_ab_conc,
        'frac_counts': h6_2367_frac_counts,
    }
).assign(replicate=1, neut_std='H6')

h6_2367

Unnamed: 0,selection_concentration,frac_counts,replicate,neut_std
0,0.00034,0.061,1,H6
1,0.00068,0.19,1,H6
2,0.00136,0.6,1,H6
3,0.00272,0.79,1,H6


In [82]:
# Stack the spike-in rows on top of the H6 rows (original row indices are kept)
# and write the combined table to CSV without the index column.
neut_counts_full = pd.concat(objs=[neut_counts, h6_2367], axis=0)
neut_counts_full.to_csv(
    'scratch_notebooks/230313_get-spike-in-barcodes/neut_counts_h6_spikein.csv',
    index=False,
)

In [83]:
# Display the combined spike-in / H6 table that the previous cell wrote to CSV.
neut_counts_full

Unnamed: 0,frac_counts,replicate,selection_concentration,neut_std
305,0.779162,1,0.00034,spike-in
308,0.255159,2,0.00034,spike-in
312,0.036411,3,0.00034,spike-in
316,0.003517,4,0.00034,spike-in
319,0.869239,1,0.00068,spike-in
322,0.395235,2,0.00068,spike-in
326,0.063865,3,0.00068,spike-in
330,0.006874,4,0.00068,spike-in
334,0.972131,1,0.00136,spike-in
337,0.777252,2,0.00136,spike-in


In [91]:
# Replicate number -> amount of spike-in (presumably ng per well, per the legend
# title used when these values are plotted -- confirm).
spikein_mapping = {
    1: 5,
    2: 0.5,
    3: 0.05,
    4: 0.005
}

# Fail loudly if the column holds anything other than the replicate numbers
# above. Without this check, re-running the cell (or an unexpected replicate)
# would silently turn the values into NaN.
assert neut_counts['replicate'].isin(list(spikein_mapping)).all(), (
    "'replicate' has values not in spikein_mapping; was this cell already run?"
)

# Use `.assign` to get a new frame rather than setting a column in place on a
# frame derived from another frame (avoids SettingWithCopyWarning).
neut_counts = neut_counts.assign(
    replicate=lambda df: df['replicate'].map(spikein_mapping)
)

In [95]:
# The `replicate` column now holds the spike-in amounts mapped in the previous
# cell, so give it the name the chart below encodes (`ng_spike-in`).
# The previous key, 'ng_spike-in_added', is a column that no visible cell
# creates; `rename` silently ignores missing keys, so on a fresh kernel the
# column would stay `replicate` and the chart would have no `ng_spike-in` field.
neut_counts = neut_counts.rename(columns={'replicate': 'ng_spike-in'})
neut_counts

Unnamed: 0,frac_counts,ng_spike-in,selection_concentration,neut_std
305,0.779162,5.0,0.00034,spike-in
308,0.255159,0.5,0.00034,spike-in
312,0.036411,0.05,0.00034,spike-in
316,0.003517,0.005,0.00034,spike-in
319,0.869239,5.0,0.00068,spike-in
322,0.395235,0.5,0.00068,spike-in
326,0.063865,0.05,0.00068,spike-in
330,0.006874,0.005,0.00068,spike-in
334,0.972131,5.0,0.00136,spike-in
337,0.777252,0.5,0.00136,spike-in


In [101]:
# Restrict to the RNA spike-in rows; this filtered frame is what gets plotted
# (previously it was computed but the chart was built from `neut_counts`).
neut_counts_spikein = neut_counts.loc[neut_counts['neut_std'] == 'spike-in']

# Scatter of the fraction of counts from the neutralization standard against
# selection concentration, colored by the amount of spike-in.
frac_neut_standard_chart = (
    alt.Chart(neut_counts_spikein, title='RNA spike-in as neut standard')
    .encode(
        x=alt.X("selection_concentration", 
                title="2367 serum selection concentration"
               ),
        y=alt.Y(
            "frac_counts",
            title="fraction counts from neutralization standard",
            # symlog keeps values near zero visible while compressing the top
            scale=alt.Scale(type="symlog", constant=0.02, domainMax=1),
        ),
        # `:N` treats the spike-in amount as a nominal (categorical) field
        color=alt.Color('ng_spike-in:N', 
                        legend=alt.Legend(orient="right", title='ng spike-in per well'))
    )
    .mark_point(filled=True, size=50, opacity=0.7)
    .properties(width=250, height=250)
)

frac_neut_standard_chart

In [97]:
# Show the `h6_2367` table again; it is the data plotted in the next cell.
h6_2367

Unnamed: 0,selection_concentration,frac_counts,replicate,neut_std
0,0.00034,0.061,1,H6
1,0.00068,0.19,1,H6
2,0.00136,0.6,1,H6
3,0.00272,0.79,1,H6


In [99]:
# Scatter of the H6 neutralization-standard fraction against the 2367 serum
# selection concentration.
frac_h6 = (
    alt.Chart(h6_2367, title='H6 as neut standard')
    .mark_point(filled=True, size=50, opacity=0.7)
    .encode(
        alt.X("selection_concentration", title="2367 serum selection concentration"),
        alt.Y(
            "frac_counts",
            title="fraction counts from neutralization standard",
            scale=alt.Scale(type="symlog", constant=0.02, domainMax=1),
        ),
    )
    .properties(width=250, height=250)
)

frac_h6

In [102]:
# Hard-coded H6 neutralization-standard values for the 1C04 mAb selections:
# fraction of counts from the standard at each selection concentration.
h6_1c04_frac_counts = [0.0023, 0.0095, 0.055, 0.14, 0.24]
h6_1c04_ab_conc = [0.05, 0.1, 0.2, 0.4, 0.8]

h6_1c04 = pd.DataFrame(
    {
        'selection_concentration': h6_1c04_ab_conc,
        'frac_counts': h6_1c04_frac_counts,
    }
).assign(replicate=1, neut_std='H6')

h6_1c04

Unnamed: 0,selection_concentration,frac_counts,replicate,neut_std
0,0.05,0.0023,1,H6
1,0.1,0.0095,1,H6
2,0.2,0.055,1,H6
3,0.4,0.14,1,H6
4,0.8,0.24,1,H6


In [103]:
# Scatter of the H6 neutralization-standard fraction against the 1C04 mAb
# selection concentration. This rebinds `frac_h6`, replacing the earlier chart
# object of the same name.
frac_h6 = (
    alt.Chart(h6_1c04, title='H6 as neut standard')
    .mark_point(filled=True, size=50, opacity=0.7)
    .encode(
        alt.X("selection_concentration", title="1C04 mAb selection concentration"),
        alt.Y(
            "frac_counts",
            title="fraction counts from neutralization standard",
            scale=alt.Scale(type="symlog", constant=0.02, domainMax=1),
        ),
    )
    .properties(width=250, height=250)
)

frac_h6