# Basic Analysis and Visualization of the Spatial GIFT-seq dataset

In [None]:
import gc
import os
# os.chdir("/data1/lareauc/users/varelaa/giftwrap/notebooks")
os.chdir("/home/varelaa/quick_analysis/")
import sys
sys.path.append("./")
from figures import *

import anndata as ad
import giftwrap as gw
import scanpy as sc
import squidpy as sq
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spatialdata as sd
import spatialdata_io as sio
import matplotlib as mpl
from scipy.stats import gaussian_kde
from shapely import Polygon
from geopandas import GeoDataFrame
from shapely import Polygon
from spatialdata import polygon_query
from spatialdata.models import ShapesModel
from spatialdata.transformations import Identity
#!pip install adjustText
import adjustText
mpl.rcParams['figure.dpi'] = 300
RESOLUTION=16  # in um

# Load our datasets

In [None]:
# # First, our GIFT-seq dataset
# dual_probe_gdata = gw.read_h5_file("/data1/lareauc/projects/gapfill/analysis/20250708_visiumHD_benchmark/DualPanel_GIFTWRAP_WTA_FILTER/counts.1.h5")
# gapfill_gdata = gw.read_h5_file("/data1/lareauc/projects/gapfill/analysis/20250708_visiumHD_benchmark/GapFill_GIFTWRAP_WITH_WTA_TEST_UPDATE/counts.1.h5")
#
# # Next the WTAs
# dual_probe_wta = sio.visium_hd(
#     "/data1/lareauc/projects/gapfill/analysis/20250708_visiumHD_benchmark/CL_DualPanel_WTA/outs",
#      dataset_id=''
# )
# gapfill_wta = sio.visium_hd(
#     "/data1/lareauc/projects/gapfill/analysis/20250708_visiumHD_benchmark/CL_GapFill_WTA/outs",
#      dataset_id=''
# )

def _read_visium_hd_via_zarr(outs_dir, zarr_path):
    """Parse a Visium HD ``outs`` directory, store it as Zarr, and reload it.

    The object returned by ``sio.visium_hd`` is written to ``zarr_path``
    (overwriting an existing store), released, and the store is then read
    back with ``sd.read_zarr``; the re-read object is returned.
    """
    parsed = sio.visium_hd(outs_dir, dataset_id='')
    parsed.write(zarr_path, overwrite=True)
    # Drop the freshly parsed object before loading the on-disk copy
    del parsed
    gc.collect()
    return sd.read_zarr(zarr_path)


# GIFT-seq count matrices (paths are relative to the working directory set above)
dual_probe_gdata = gw.read_h5_file("20250708_visiumHD_benchmark/DualPanel_GIFTWRAP_WTA_FILTER/counts.1.h5")
# gapfill_gdata = gw.read_h5_file("20250708_visiumHD_benchmark/GapFill_GIFTWRAP_WITH_WTA_TEST_UPDATE/counts.1.h5")
gapfill_gdata = gw.read_h5_file("20250708_visiumHD_benchmark/GapFill_GIFTWRAP_WITH_WTA_TEST_UPDATE_IGO/counts.1.h5")
# dual_probe_gdata= gw.read_h5_file("20250708_visiumHD_benchmark/GapFill_GIFTWRAP_WITH_WTA_TEST_UPDATE_IGO/counts.1.h5")

# Matching WTA sections, each round-tripped through a local Zarr store
dual_probe_wta = _read_visium_hd_via_zarr(
    "20250708_visiumHD_benchmark/CL_DualPanel_WTA/outs",
    "./dual_probe_wta.zarr",
)
gapfill_wta = _read_visium_hd_via_zarr(
    "20250708_visiumHD_benchmark/CL_GapFill_WTA/outs",
    "./gapfill_wta.zarr",
)

# gapfill_gdata = gw.read_h5_file("/data1/lareauc/projects/gapfill/analysis/20250226_visiumHD/20250224_M81S_GF_CL_GIFTWRAP_REDUX/counts.1.h5")
# gapfill_wta = sio.visium_hd(
#     "/data1/lareauc/projects/gapfill/analysis/20250226_visiumHD/Visium_CL_WTA/outs",
#      dataset_id=''
# )

# From: https://github.com/clareaulab/gift_reproducibility/blob/main/figure_CL_dual/data/3cl_predicted_genotypes.csv
celltype_genotypes_df = pd.read_csv("cell_line_data.csv", index_col=0)
celltype_genotypes_df

In [None]:
# Use the 'X_pcr_threshold_5' layer as the main count matrix of both datasets
for _gdata in (dual_probe_gdata, gapfill_gdata):
    _gdata.X = _gdata.layers['X_pcr_threshold_5']

In [None]:
# Build the lookup tables used by the genotyping plots:
#   annotated_genotypes - probe names present in the annotation table
#   wt_alleles / alt_alleles - per-probe sequences from the annotation table
#   celltype_annotated - cell line -> probe -> "HET" / "REF" / "ALT" (bulk call)
#   celltype_genotypes - cell line -> probe -> list of one or two observed alleles
annotated_genotypes = celltype_genotypes_df.name.unique().tolist()

_cell_lines = ("HEL", "K562", "SET2")
_bulk_call_labels = {
    'heterozygous': "HET",
    'homozygous_ref': "REF",
    'homozygous_alt': "ALT",
}

wt_alleles = {}
alt_alleles = {}
celltype_genotypes = {line: {} for line in _cell_lines}
celltype_annotated = {line: {} for line in _cell_lines}

for _, record in celltype_genotypes_df.iterrows():
    probe_name = record['name']
    wt_alleles[probe_name] = record["gapfill_from_transcriptome"]
    alt_alleles[probe_name] = record["gap_probe_sequence"]

    # Bulk calls outside the three known categories leave no annotation
    bulk_label = _bulk_call_labels.get(record['genotype_from_bulk'])
    if bulk_label is not None:
        celltype_annotated[record['cell_type']][probe_name] = bulk_label

    # Column "0" always contributes an allele; column "1" only when filled in
    observed = [record["0"]]
    celltype_genotypes[record['cell_type']][probe_name] = observed
    second_allele = record["1"]
    if not (pd.isna(second_allele) or second_allele == ""):
        observed.append(second_allele)

In [None]:
# Manually create polygons for cell lines
def numpy_to_shapely(poly):
    """Turn an iterable of (x, y) pairs into a shapely ``Polygon``."""
    vertices = []
    for x, y in poly:
        vertices.append((x, y))
    return Polygon(vertices)

# Hand-picked outline of each cell line on the dual-probe section
dual_probe_polygons = {
    line: numpy_to_shapely(np.array(corners))
    for line, corners in {
        "HEL": [[9_000, 43_000], [35_000, 45_000], [35_000, 57_000], [9_000, 57_000]],
        "K562": [[9_000, 58_000], [22_000, 58_000], [22_000, 75_000], [9_000, 75_000]],
        "SET2": [[23_000, 58_000], [40_000, 58_000], [40_000, 75_000], [23_000, 75_000]],
    }.items()
}

_dual_bin_tables = ('square_002um', 'square_008um', 'square_016um')

# Every bin starts unlabelled ...
for _table in _dual_bin_tables:
    dual_probe_wta.tables[_table].obs['cell_line'] = 'N/A'

# ... and bins returned by the polygon query take that polygon's cell line
for cell_line, poly in dual_probe_polygons.items():
    filtered = polygon_query(
        dual_probe_wta,
        polygon=poly,
        target_coordinate_system="",
    )
    for _table in _dual_bin_tables:
        _inside = filtered.tables[_table].obs_names
        dual_probe_wta.tables[_table].obs.loc[_inside, 'cell_line'] = cell_line

# Hand-picked outline of each cell line on the gap-fill section
gapfill_polygons = {
    line: numpy_to_shapely(np.array(corners))
    for line, corners in {
        # Reordered last two vertices
        "HEL": [[10_000, 12_000], [40_000, 12_000], [40_000, 28_000], [10_000, 28_000]],
        "K562": [
            [10_000, 27_000], [19_000, 27_000], [19_000, 37_000],
            [19_500, 38_000], [19_000, 40_000], [10_000, 42_000],
            [10_000, 40_000],
        ],
        "SET2": [
            [33_500, 26_500], [19_500, 37_000], [19_000, 40_000],
            [18_000, 43_000], [33_500, 44_500],
        ],
    }.items()
}

_gapfill_bin_tables = ('square_002um', 'square_008um', 'square_016um')

# Every bin starts unlabelled ...
for _table in _gapfill_bin_tables:
    gapfill_wta.tables[_table].obs['cell_line'] = 'N/A'

# ... and bins returned by the polygon query take that polygon's cell line
for cell_line, poly in gapfill_polygons.items():
    filtered = polygon_query(
        gapfill_wta,
        polygon=poly,
        target_coordinate_system="",
    )
    for _table in _gapfill_bin_tables:
        _inside = filtered.tables[_table].obs_names
        gapfill_wta.tables[_table].obs.loc[_inside, 'cell_line'] = cell_line

#
# gapfill_polygons = {
#     "HEL": numpy_to_shapely(np.array([[5_000, 5_000], [35_000, 5_000], [35_000, 19_000], [5_000, 19_000]])),
#     "K562": numpy_to_shapely(np.array([[5_000, 20_000], [22_000, 20_000], [22_000, 40_000], [5_000, 40_000]])),
#     "SET2": numpy_to_shapely(np.array([[18_000, 19_000], [40_000, 19_000], [40_000, 40_000], [18_000, 40_000]])),
# }
#
# gapfill_wta.tables['square_002um'].obs['cell_line'] = 'N/A'
# gapfill_wta.tables['square_008um'].obs['cell_line'] = 'N/A'
# gapfill_wta.tables['square_016um'].obs['cell_line'] = 'N/A'
# for cell_line, poly in gapfill_polygons.items():
#     filtered = polygon_query(
#         gapfill_wta,
#         polygon=poly,
#         target_coordinate_system="",
#     )
#     gapfill_wta.tables['square_002um'].obs.loc[filtered.tables['square_002um'].obs_names, 'cell_line'] = cell_line
#     gapfill_wta.tables['square_008um'].obs.loc[filtered.tables['square_008um'].obs_names, 'cell_line'] = cell_line
#     gapfill_wta.tables['square_016um'].obs.loc[filtered.tables['square_016um'].obs_names, 'cell_line'] = cell_line

In [None]:
# Restrict the dual-probe section to the tissue area (tables are not filtered)
_dual_crop_min = np.array([0, 30_000])
_dual_crop_max = np.array([40_000, 80_000])
dual_probe_wta = dual_probe_wta.query.bounding_box(
    axes=("x", "y"),
    min_coordinate=_dual_crop_min,
    max_coordinate=_dual_crop_max,
    target_coordinate_system="",
    filter_table=False,
)
dual_probe_wta

In [None]:
# Restrict the gap-fill section to the tissue area (tables are not filtered)
_gapfill_crop_min = np.array([0, 0])
_gapfill_crop_max = np.array([40_000, 45_000])
gapfill_wta = gapfill_wta.query.bounding_box(
    axes=("x", "y"),
    min_coordinate=_gapfill_crop_min,
    max_coordinate=_gapfill_crop_max,
    target_coordinate_system="",
    filter_table=False,
)
gapfill_wta

In [None]:
# Draw the cropped dual-probe section with plot_HE
# (helper not defined in this notebook; presumably from the star-imported `figures` module)
plot_HE(dual_probe_wta)

In [None]:
# Draw the cropped gap-fill section with plot_HE
# (helper not defined in this notebook; presumably from the star-imported `figures` module)
plot_HE(gapfill_wta)

In [None]:
# Overlay the manual cell-line labels (bins at RESOLUTION um) on the hires image
_dual_shapes_key = f'_square_{RESOLUTION:03d}um'
_dual_layers = dual_probe_wta.pl.render_images("_hires_image", alpha=0.8)
_dual_layers = _dual_layers.pl.render_shapes(element=_dual_shapes_key, color='cell_line', na_color=None)
ax = _dual_layers.pl.show(
    coordinate_systems="",
    figsize=(25, 25),
    na_in_legend=False,
    title="Dual Probe Selection",
    return_ax=True,
)

# Hide the coordinate ticks and give the axes generic spatial names
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel("Spatial 1")
ax.set_ylabel("Spatial 2")

ax

In [None]:
# Overlay the manual cell-line labels (bins at RESOLUTION um) on the hires image
_gapfill_shapes_key = f'_square_{RESOLUTION:03d}um'
_gapfill_layers = gapfill_wta.pl.render_images("_hires_image", alpha=0.8)
_gapfill_layers = _gapfill_layers.pl.render_shapes(element=_gapfill_shapes_key, color='cell_line', na_color=None)
ax = _gapfill_layers.pl.show(
    coordinate_systems="",
    figsize=(25, 25),
    na_in_legend=False,
    title="GapFill Probe Selection",
    return_ax=True,
)

# Hide the coordinate ticks and give the axes generic spatial names
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel("Spatial 1")
ax.set_ylabel("Spatial 2")

ax

In [None]:
dual_probe_gdata

In [None]:
gapfill_gdata

In [None]:
dual_probe_wta

In [None]:
gapfill_wta

# Genotyping

We will do basic preprocessing and then call genotypes on each GIFT-seq dataset.

## First, the dual-probe dataset

In [None]:
# The dual-probe design should produce no gap-filled sequence, so any var row
# with a non-empty 'gapfill' is reported and then removed.
_gapfill_column = dual_probe_gdata.var['gapfill']
probes_with_gapfill = dual_probe_gdata.var[_gapfill_column != ''].index

# Report how many probes and how many UMIs the filter touches
_n_probes_total = dual_probe_gdata.var.shape[0]
print(f"Number of probes with gapfill: {len(probes_with_gapfill)} / {_n_probes_total}")
umis_with_gapfill = np.sum(dual_probe_gdata[:, probes_with_gapfill].X)
print(f"Number of UMIs with gapfill: {umis_with_gapfill} / {dual_probe_gdata.X.sum()}")

# Keep only the rows whose 'gapfill' is empty
dual_probe_gdata = dual_probe_gdata[:, dual_probe_gdata.var['gapfill'] == '']
dual_probe_gdata

In [None]:
# dual_probe_gdata.X = dual_probe_gdata.layers['X_pcr_threshold_5']

Since the dual-probe dataset isn't formatted correctly for genotyping, we will recreate the AnnData so that it looks more like a vanilla GIFT-seq dataset.

In [None]:
# Rebuild dual_probe_gdata so that each mutant/WT probe pair looks like one
# probe with two "gapfill" alleles.
# Resulting column layout: [0bp probes..., (WT, mutant) for pair 0, (WT, mutant) for pair 1, ...]

# Select all the 0bp control probes, these will not be used for genotyping
zerobp_probes = dual_probe_gdata.var[dual_probe_gdata.var.probe.str.contains("0bp")].probe.values
# Entries are (mutant probe name, WT probe name, mutant allele, WT allele)
mut_wt_pairs = []
# Now for each non 0bp probe, we will find the mutant probe and its associated WT probe
for probe in dual_probe_gdata.var.probe.values:
    if "0bp" in probe:
        continue
    if ">" in probe:
        # Names look like "<prefix><ref>><alt>": the last character before ">" is the
        # reference base and the text after ">" is the allele this probe targets
        orig = probe.split(">")[0]
        alt = probe.split(">")[1]
        if orig[-1] == alt:  # This was a WT probe
            continue
        # The matching WT probe is named with the reference base on both sides of ">"
        wt_probe = f"{orig}>{orig[-1]}"
        mut_wt_pairs.append((probe, wt_probe, alt, orig[-1]))
    else:
        # Any name without ">" lands here, including the probes paired by hand below
        print(f"Unexpected probe name: {probe}")

# Manually add the probes that were not named correctly
mut_wt_pairs.append(("BCR-ABL c.fusion", "BCR-ABL null", 'fusion', 'null'))
mut_wt_pairs.append(("TP53 c.405insC", "TP53 c.405", "insC", "ref"))

# Report probes that are neither 0bp controls nor part of any pair (they are not copied over)
print(f"Dropping the following probes that could not be paired: {set(dual_probe_gdata.var.probe) - set(zerobp_probes) - set([a for a, b,c,d in mut_wt_pairs]) - set([b for a, b,c,d in mut_wt_pairs])}")
# Create a new AnnData with just the probes we want
new_var = dict(
    probe=[],
    gene=[],
    gapfill=[],
)
# One column per 0bp probe plus two columns (WT, mutant) per pair
new_X = np.zeros((dual_probe_gdata.n_obs, 2 * len(mut_wt_pairs) + zerobp_probes.shape[0]), dtype=np.int32)
# Only layers whose key contains 'X' and whose last "_"-separated token is an integer below 25 are carried over
new_layers = {k: np.zeros((dual_probe_gdata.n_obs, 2 * len(mut_wt_pairs) + zerobp_probes.shape[0]), dtype=np.int32) for k in dual_probe_gdata.layers.keys() if 'X' in k and int(k.split("_")[-1]) < 25}
# 0bp controls are copied unchanged into the leading columns
for i, probe in enumerate(zerobp_probes):
    # First var row carrying this probe name
    idx = dual_probe_gdata.var.index[dual_probe_gdata.var.probe == probe][0]
    new_var['probe'].append(probe)
    new_var['gene'].append(dual_probe_gdata.var.loc[idx, 'gene'])
    new_var['gapfill'].append(dual_probe_gdata.var.loc[idx, 'gapfill'])
    # .toarray() assumes X and the layers are sparse matrices
    new_X[:, i] = dual_probe_gdata[:, idx].X.toarray().flatten()
    for k in list(new_layers.keys()):
        new_layers[k][:, i] = dual_probe_gdata[:, idx].layers[k].toarray().flatten()
# Each pair fills two adjacent columns after the 0bp block: WT counts first, mutant counts second
for j, (mut, wt, mut_genotype, wt_genotype) in enumerate(mut_wt_pairs):
    wt_idx = dual_probe_gdata.var.index[dual_probe_gdata.var.probe == wt][0]
    mut_idx = dual_probe_gdata.var.index[dual_probe_gdata.var.probe == mut][0]
    # Both rows are filed under the mutant probe's name; 'gapfill' tells the alleles apart
    new_var['probe'].append(mut)
    new_var['gene'].append(dual_probe_gdata.var.loc[wt_idx, 'gene'])
    new_var['gapfill'].append(wt_genotype)
    new_X[:, len(zerobp_probes) + 2 * j] = dual_probe_gdata[:, wt_idx].X.toarray().flatten()
    new_var['probe'].append(mut)
    new_var['gene'].append(dual_probe_gdata.var.loc[wt_idx, 'gene'])
    new_var['gapfill'].append(mut_genotype)
    new_X[:, len(zerobp_probes) + 2 * j + 1] = dual_probe_gdata[:, mut_idx].X.toarray().flatten()
    for k in list(new_layers.keys()):
        new_layers[k][:, len(zerobp_probes) + 2 * j] = dual_probe_gdata[:, wt_idx].layers[k].toarray().flatten()
        new_layers[k][:, len(zerobp_probes) + 2 * j + 1] = dual_probe_gdata[:, mut_idx].layers[k].toarray().flatten()

# The var index becomes "<probe>_<gapfill>"
var_df = pd.DataFrame(new_var)
var_df['probe_gapfill'] = var_df['probe'] + "_" + var_df['gapfill']
var_df = var_df.set_index('probe_gapfill')
# obs and uns are copied from the original object; X, var and layers are the rebuilt ones
dual_probe_gdata = ad.AnnData(
    X=new_X,
    obs=dual_probe_gdata.obs.copy(),
    var=var_df,
    uns=dual_probe_gdata.uns.copy(),
    layers=new_layers
)
dual_probe_gdata

In [None]:
# Genotype: run giftwrap's genotype caller on the rebuilt dual-probe data
# and keep the object it returns
dual_probe_gdata = gw.tl.call_genotypes(
    dual_probe_gdata
)
dual_probe_gdata

In [None]:
# Join with the spatialdata: combine the cropped WTA section with the
# genotyped GIFT-seq data; the result is what all plots below operate on
dual_probe_sdata = gw.sp.join_with_wta(dual_probe_wta, dual_probe_gdata)
dual_probe_sdata

In [None]:
# Summary statistics at RESOLUTION um with include_0bp=True
# (presumably this counts the 0bp control probes - confirm in `figures`)
print_summary_stats(dual_probe_sdata, RESOLUTION, include_0bp=True)

In [None]:
# Same summary with include_0bp=False
print_summary_stats(dual_probe_sdata, RESOLUTION, include_0bp=False)

## Now, compute the genotyping efficiency metrics

In [None]:
# Plot the WTA library size
plot_library_size(dual_probe_sdata, table='', resolution=RESOLUTION)

In [None]:
# Plot the GIFT-seq library size
plot_library_size(dual_probe_sdata, table='gf', resolution=RESOLUTION)

In [None]:
# GIFT-seq ('gf') library size again, this time with include_0bp=True
plot_library_size(dual_probe_sdata, table='gf', resolution=RESOLUTION, include_0bp=True)

In [None]:
# Sites-genotyped plot for the dual-probe data with the helper's default options
plot_sites_genotyped(dual_probe_sdata, resolution=RESOLUTION)

In [None]:
# Same plot with at_least_one=True
# NOTE(review): presumably restricts to bins with at least one genotyped site - confirm in `figures`
plot_sites_genotyped(dual_probe_sdata, resolution=RESOLUTION, at_least_one=True)

In [None]:
# Per-bin library-size comparison for the dual-probe data (default options)
compare_library_size_per_bin(dual_probe_sdata, resolution=RESOLUTION)

In [None]:
# Same comparison with include_0bp=True
compare_library_size_per_bin(dual_probe_sdata, resolution=RESOLUTION, include_0bp=True)

In [None]:
# Relative-efficiency plot with count cut-offs of 30,000 (0bp) and 20,000 (gf)
# NOTE(review): the exact meaning of the two minimum counts lives in `figures` - confirm there
plot_relative_efficiency(dual_probe_sdata, resolution=RESOLUTION, min_0bp_count=30_000, min_gf_count=20_000)

In [None]:
# Genotype UMI comparison between K562 and HEL on the dual-probe data,
# using the annotation lookups built from cell_line_data.csv
plot_genotype_umi_comparison(dual_probe_sdata, 'K562', 'HEL',
    annotated_genotypes,
    celltype_genotypes,
    wt_alleles,
    alt_alleles,
    RESOLUTION
)

In [None]:
# Genotype UMI comparison between K562 and SET2 on the dual-probe data,
# using the annotation lookups built from cell_line_data.csv
plot_genotype_umi_comparison(dual_probe_sdata, 'K562', 'SET2',
    annotated_genotypes,
    celltype_genotypes,
    wt_alleles,
    alt_alleles,
    RESOLUTION
)

In [None]:
# Genotype UMI comparison between SET2 and HEL on the dual-probe data,
# using the annotation lookups built from cell_line_data.csv
plot_genotype_umi_comparison(dual_probe_sdata, 'SET2', 'HEL',
    annotated_genotypes,
    celltype_genotypes,
    wt_alleles,
    alt_alleles,
    RESOLUTION
)

In [None]:
# Pseudobulk genotype table for JAK2 c.1849G>T (alleles "G" / "T" in the
# dual-probe data), given the genotype label of each cell line
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
pseudobulk_genotype_table(dual_probe_sdata, 'JAK2 c.1849G>T', "G", "T", _jak2_labels)

In [None]:
# dual_probe_sdata.tables['gf_square_002um'].X = dual_probe_sdata.tables['gf_square_002um'].layers['X_pcr_threshold_5']  # Adjust PCR threshold
# dual_probe_sdata.tables['gf_square_008um'].X = dual_probe_sdata.tables['gf_square_008um'].layers['X_pcr_threshold_5']
# dual_probe_sdata.tables['gf_square_016um'].X = dual_probe_sdata.tables['gf_square_016um'].layers['X_pcr_threshold_5']

In [None]:
# Pseudobulk accuracy for JAK2 c.1849G>T as a function of the PCR threshold
# (up to 25); K562 and HEL are the two lines not labelled HET
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
_jak2_non_het_lines = ['K562', 'HEL']
genotype_psuedobulk_accuracy_by_pcr(
    dual_probe_sdata, 'JAK2 c.1849G>T', "G", "T",
    _jak2_labels, _jak2_non_het_lines, max_threshold=25,
)

In [None]:
# JAK2 c.1849G>T, "G" allele (the reference base in the probe name)
plot_library_specific_probe(dual_probe_sdata, 'JAK2 c.1849G>T', "G", RESOLUTION)

In [None]:
# JAK2 c.1849G>T, "T" allele (the alternate base in the probe name)
plot_library_specific_probe(dual_probe_sdata, 'JAK2 c.1849G>T', "T", RESOLUTION)

In [None]:
# Per-cell-line genotype barplots for JAK2 c.1849G>T ("G" vs "T") at RESOLUTION um
genotype_cell_line_barplots(dual_probe_sdata, 'JAK2 c.1849G>T', "G", "T", RESOLUTION)

In [None]:
# Genotype accuracy barplot for JAK2 c.1849G>T with the helper's default filtering
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
genotype_accuracy_barplot(dual_probe_sdata, 'JAK2 c.1849G>T', "G", "T", _jak2_labels, RESOLUTION)

In [None]:
# Same accuracy barplot with filter_na=False
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
genotype_accuracy_barplot(
    dual_probe_sdata, 'JAK2 c.1849G>T', "G", "T",
    _jak2_labels, RESOLUTION, filter_na=False,
)

In [None]:
# For every annotated probe draw two panels side by side:
# per-cell-line genotype counts (left) and genotyping accuracy (right).
for probe in dual_probe_gdata.var.probe.unique():
    if probe not in annotated_genotypes:
        continue
    # Dual-probe alleles are the single bases around ">" in the probe name
    wt_allele = "" if ">" not in probe else probe.split(">")[0][-1]
    alt_allele = "" if ">" not in probe else probe.split(">")[1]
    # Genotype label per cell line: two listed alleles -> HET, otherwise WT
    # when the transcriptome (WT) allele is the one listed, else ALT
    ct_dict = {}
    for cell_line in ("HEL", "K562", "SET2"):
        listed_alleles = celltype_genotypes[cell_line][probe]
        if len(listed_alleles) > 1:
            ct_dict[cell_line] = "HET"
        elif wt_alleles[probe] in listed_alleles:
            ct_dict[cell_line] = "WT"
        else:
            ct_dict[cell_line] = "ALT"
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    genotype_cell_line_barplots(dual_probe_sdata, probe, wt_allele, alt_allele, RESOLUTION, ax=axes[0])
    genotype_accuracy_barplot(dual_probe_sdata, probe, wt_allele, alt_allele, ct_dict, RESOLUTION, filter_na=True, ax=axes[1])
    plt.suptitle(probe)
    plt.tight_layout()
    plt.show()
    # Close the figure rather than clearing it: clf() after show() leaves the
    # figure registered (or creates a fresh empty one), so figures pile up
    # over the loop
    plt.close(fig)

## Now some basic visualization and analysis

In [None]:
# Sort probes by capture rate and plot each on the spatial coords.
# The rebuilt dual-probe table stores every probe name on two var rows (WT and
# mutant allele), so the names are de-duplicated first; otherwise each probe
# would be plotted twice here and again in the imputed-genotype loop, which
# reuses `probes`.
probes = dual_probe_gdata.var.probe[~dual_probe_gdata.var.probe.str.contains("0bp")].unique()
# Capture rate = total counts over all var rows carrying that probe name
probes = sorted(probes, key=lambda x: dual_probe_gdata[:, dual_probe_gdata.var.probe == x].X.sum(), reverse=True)

for probe in probes:
    print(f"Plotting {probe}")
    plot_genotypes(dual_probe_sdata, probe, resolution=RESOLUTION)
    plt.show()
    plt.clf()

In [None]:
# Cluster: run giftwrap's spatial/expression co-clustering recipe on the WTA
# table at RESOLUTION um, using 1000 highly variable genes.
# NOTE(review): the gap-fill counterpart of this call also passes
# coordinate_system="" - confirm the default used here matches the ""
# coordinate system used everywhere else in this notebook.
dual_probe_sdata = gw.sp.recipe_spatial_expression_coclustering(
    dual_probe_sdata,
    table_name=f"square_{RESOLUTION:03d}um",
    n_highly_variable_genes=1000
)
dual_probe_sdata

In [None]:
# Impute genotypes using the 'spatio_expression_coclustering' cluster labels
# at the RESOLUTION um table, with hold_out=0.05
# NOTE(review): hold_out is presumably the fraction of calls withheld for evaluation - confirm in giftwrap
dual_probe_sdata = gw.sp.impute_genotypes(
    dual_probe_sdata,
    cluster_key='spatio_expression_coclustering',
    resolution=f'square_{RESOLUTION:03d}um',
    hold_out=0.05
)
dual_probe_sdata

In [None]:
# Plot imputed genotypes: same probe order as the raw-genotype plots
# (`probes` is the sorted list built in the earlier cell), now with imputed=True
for probe in probes:
    print(f"Plotting {probe}")
    plot_genotypes(dual_probe_sdata, probe, resolution=RESOLUTION, imputed=True)
    plt.show()
    plt.clf()

# Now, the GapFill Dataset

## First, we will do some basic preprocessing

In [None]:
# Filter gap-fills with min_cells=10, then call genotypes.
# NOTE(review): the return value of filter_gapfills is discarded, so this
# relies on the filter acting in place - confirm against the giftwrap API.
gw.pp.filter_gapfills(gapfill_gdata, min_cells=10)
gapfill_gdata = gw.tl.call_genotypes(
    gapfill_gdata
)
# gapfill_gdata.X = gapfill_gdata.X - gapfill_gdata.layers['X_pcr_threshold_5']  To isolate filtered out umis
gapfill_gdata

In [None]:
# Tally of the JAK2 c.1849G>T values stored in obsm['genotype']
gapfill_gdata.obsm['genotype']['JAK2 c.1849G>T'].value_counts()

In [None]:
# Combine the cropped gap-fill WTA section with the genotyped GIFT-seq data;
# the result is what all gap-fill plots below operate on
gapfill_sdata = gw.sp.join_with_wta(gapfill_wta, gapfill_gdata)
gapfill_sdata

In [None]:
# Summary statistics at RESOLUTION um with include_0bp=True
print_summary_stats(gapfill_sdata, RESOLUTION, include_0bp=True)

In [None]:
# Same summary with include_0bp=False
print_summary_stats(gapfill_sdata, RESOLUTION, include_0bp=False)

## Now, compute the genotyping efficiency metrics

In [None]:
# Plot the WTA library size
plot_library_size(gapfill_sdata, table='', resolution=RESOLUTION)

In [None]:
# Plot the GIFT-seq library size
plot_library_size(gapfill_sdata, table='gf', resolution=RESOLUTION)

In [None]:
# GIFT-seq ('gf') library size again, this time with include_0bp=True
plot_library_size(gapfill_sdata, table='gf', resolution=RESOLUTION, include_0bp=True)

In [None]:
# Sites-genotyped plot for the gap-fill data with the helper's default options
plot_sites_genotyped(gapfill_sdata, resolution=RESOLUTION)

In [None]:
# Same plot with at_least_one=True
# NOTE(review): presumably restricts to bins with at least one genotyped site - confirm in `figures`
plot_sites_genotyped(gapfill_sdata, resolution=RESOLUTION, at_least_one=True)

In [None]:
# Per-bin library-size comparison for the gap-fill data (default options)
compare_library_size_per_bin(gapfill_sdata, resolution=RESOLUTION)

In [None]:
# Same comparison with include_0bp=True
compare_library_size_per_bin(gapfill_sdata, resolution=RESOLUTION, include_0bp=True)

In [None]:
# Relative-efficiency plot with the same cut-offs as the dual-probe call (30,000 0bp / 20,000 gf)
plot_relative_efficiency(gapfill_sdata, resolution=RESOLUTION, min_0bp_count=30_000, min_gf_count=20_000)

In [None]:
# Genotype UMI comparison between K562 and HEL on the gap-fill data,
# using the annotation lookups built from cell_line_data.csv
plot_genotype_umi_comparison(gapfill_sdata, 'K562', 'HEL',
                             annotated_genotypes,
                             celltype_genotypes,
                             wt_alleles,
                             alt_alleles,
                             RESOLUTION
                             )

In [None]:
# Genotype UMI comparison between K562 and SET2 on the gap-fill data,
# using the annotation lookups built from cell_line_data.csv
plot_genotype_umi_comparison(gapfill_sdata, 'K562', 'SET2',
                             annotated_genotypes,
                             celltype_genotypes,
                             wt_alleles,
                             alt_alleles,
                             RESOLUTION
                             )

In [None]:
# Genotype UMI comparison between SET2 and HEL on the gap-fill data,
# using the annotation lookups built from cell_line_data.csv
plot_genotype_umi_comparison(gapfill_sdata, 'SET2', 'HEL',
                             annotated_genotypes,
                             celltype_genotypes,
                             wt_alleles,
                             alt_alleles,
                             RESOLUTION
                             )

In [None]:
# Pseudobulk genotype table for JAK2 c.1849G>T; in the gap-fill data the
# alleles are the filled sequences "CAC" and "AAC"
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
pseudobulk_genotype_table(gapfill_sdata, 'JAK2 c.1849G>T', "CAC", "AAC", _jak2_labels)

In [None]:
# Pseudobulk accuracy for JAK2 c.1849G>T as a function of the PCR threshold
# (up to 25); K562 and HEL are the two lines not labelled HET
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
_jak2_non_het_lines = ['K562', 'HEL']
genotype_psuedobulk_accuracy_by_pcr(
    gapfill_sdata, 'JAK2 c.1849G>T', "CAC", "AAC",
    _jak2_labels, _jak2_non_het_lines, max_threshold=25,
)

In [None]:
# PCR-threshold accuracy sweep (up to 15) for every annotated gap-fill probe,
# evaluated on the cell lines that are not heterozygous for that probe
for probe in gapfill_gdata.var.probe.unique():
    if probe not in annotated_genotypes:
        continue
    wt_allele, alt_allele = wt_alleles[probe], alt_alleles[probe]
    # Two listed alleles -> HET; otherwise WT if the WT allele is listed, else ALT
    ct_dict = {}
    for cell_line in ("HEL", "K562", "SET2"):
        listed_alleles = celltype_genotypes[cell_line][probe]
        if len(listed_alleles) > 1:
            ct_dict[cell_line] = "HET"
        elif wt_allele in listed_alleles:
            ct_dict[cell_line] = "WT"
        else:
            ct_dict[cell_line] = "ALT"
    non_het_lines = [name for name, label in ct_dict.items() if label != "HET"]
    genotype_psuedobulk_accuracy_by_pcr(gapfill_sdata, probe, wt_allele, alt_allele, ct_dict, non_het_lines, max_threshold=15)
    print(probe)
    print("---------------------------")

In [None]:
from IPython.display import display

# Show the pseudobulk genotype table of every annotated gap-fill probe
for probe in gapfill_gdata.var.probe.unique():
    # "a|b|c..." style names are matched to the annotation table as "b c";
    # names without "|" are looked up unchanged
    name_parts = probe.split("|")
    probe_norm = " ".join(name_parts[1:3]) if len(name_parts) > 1 else probe
    if probe_norm not in annotated_genotypes:
        continue
    wt_allele, alt_allele = wt_alleles[probe_norm], alt_alleles[probe_norm]
    # Two listed alleles -> HET; otherwise WT if the WT allele is listed, else ALT
    ct_dict = {}
    for cell_line in ("HEL", "K562", "SET2"):
        listed_alleles = celltype_genotypes[cell_line][probe_norm]
        if len(listed_alleles) > 1:
            ct_dict[cell_line] = "HET"
        elif wt_allele in listed_alleles:
            ct_dict[cell_line] = "WT"
        else:
            ct_dict[cell_line] = "ALT"
    # The helper receives the original (un-normalised) probe name
    df = pseudobulk_genotype_table(gapfill_sdata, probe, wt_allele, alt_allele, ct_dict)
    display(df)
    print(probe)
    print("---------------------------")

In [None]:
# Per-cell-line genotype barplots for JAK2 c.1849G>T ("CAC" vs "AAC") at RESOLUTION um
genotype_cell_line_barplots(gapfill_sdata, 'JAK2 c.1849G>T', "CAC", "AAC", RESOLUTION)

In [None]:
# Genotype accuracy barplot for JAK2 c.1849G>T with the helper's default filtering
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
genotype_accuracy_barplot(gapfill_sdata, 'JAK2 c.1849G>T', "CAC", "AAC", _jak2_labels, RESOLUTION)

In [None]:
# Same accuracy barplot with filter_na=False
_jak2_labels = {"K562": "WT", "HEL": "ALT", "SET2": "HET"}
genotype_accuracy_barplot(
    gapfill_sdata, 'JAK2 c.1849G>T', "CAC", "AAC",
    _jak2_labels, RESOLUTION, filter_na=False,
)

In [None]:
# For every annotated probe draw two panels side by side:
# per-cell-line genotype counts (left) and genotyping accuracy (right).
for probe in gapfill_gdata.var.probe.unique():
    if probe not in annotated_genotypes:
        continue
    wt_allele = wt_alleles[probe]
    alt_allele = alt_alleles[probe]
    # Genotype label per cell line: two listed alleles -> HET, otherwise WT
    # when the WT allele is the one listed, else ALT
    ct_dict = {}
    for cell_line in ("HEL", "K562", "SET2"):
        listed_alleles = celltype_genotypes[cell_line][probe]
        if len(listed_alleles) > 1:
            ct_dict[cell_line] = "HET"
        elif wt_allele in listed_alleles:
            ct_dict[cell_line] = "WT"
        else:
            ct_dict[cell_line] = "ALT"
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    genotype_cell_line_barplots(gapfill_sdata, probe, wt_allele, alt_allele, RESOLUTION, ax=axes[0])
    genotype_accuracy_barplot(gapfill_sdata, probe, wt_allele, alt_allele, ct_dict, RESOLUTION, filter_na=True, ax=axes[1])
    plt.suptitle(probe)
    plt.tight_layout()
    plt.show()
    # Close the figure rather than clearing it: clf() after show() leaves the
    # figure registered (or creates a fresh empty one), so figures pile up
    # over the loop
    plt.close(fig)

## Now some basic visualization and analysis

In [None]:
# Order the non-0bp probes from most to least captured, then draw each one
_gf_probe_names = gapfill_gdata.var.probe
_is_control = _gf_probe_names.str.contains("0bp")
probes = list(_gf_probe_names[~_is_control].unique())
# Capture = total counts over every var row that carries the probe name
probes.sort(key=lambda x: gapfill_gdata[:, gapfill_gdata.var.probe == x].X.sum(), reverse=True)

for probe in probes:
    print(f"Plotting {probe}")
    plot_genotypes(gapfill_sdata, probe, resolution=RESOLUTION)
    plt.show()
    plt.clf()

In [None]:
# Cluster: run giftwrap's spatial/expression co-clustering recipe on the WTA
# table at RESOLUTION um, using 1000 highly variable genes and the ""
# coordinate system (the one used for every query and plot in this notebook)
gapfill_sdata = gw.sp.recipe_spatial_expression_coclustering(
    gapfill_sdata,
    table_name=f"square_{RESOLUTION:03d}um",
    n_highly_variable_genes=1000,
    coordinate_system=""
)
gapfill_sdata

In [None]:
# Impute genotypes using the 'spatio_expression_coclustering' cluster labels
# at the RESOLUTION um table, with hold_out=0.05
# NOTE(review): hold_out is presumably the fraction of calls withheld for evaluation - confirm in giftwrap
gapfill_sdata = gw.sp.impute_genotypes(
    gapfill_sdata,
    cluster_key='spatio_expression_coclustering',
    resolution=f'square_{RESOLUTION:03d}um',
    hold_out=0.05
)
gapfill_sdata

In [None]:
# Plot imputed genotypes: same probe order as the raw-genotype plots
# (`probes` is the sorted list built in the earlier cell), now with imputed=True
for probe in probes:
    print(f"Plotting {probe}")
    plot_genotypes(gapfill_sdata, probe, resolution=RESOLUTION, imputed=True)
    plt.show()
    plt.clf()

In [None]:
# Final comparison of the two chemistries: both joined datasets plus the
# annotation lookups are handed to the boxplot helper at RESOLUTION um
boxplot_of_dualprobe_vs_gapfill(
    dual_probe_sdata,
    gapfill_sdata,
    annotated_genotypes,
    celltype_genotypes,
    wt_alleles,
    alt_alleles,
    RESOLUTION
)