In [None]:
# Import packages
import warnings
from pathlib import Path

import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
import skimage
import yaml
from matplotlib.colors import LinearSegmentedColormap
from phenoscapes.feature_extraction import extract_features
from phenoscapes.montage import generate_overview_montage
from phenoscapes.sc import convert_to_h5ad, plot_summary
from phenoscapes.utils import annotate_img, get_metadata, scale_image
from skimage import io
from skimage.color import label2rgb
from tqdm import tqdm

# Silence every warning for the rest of the session: `Warning` is the base class
# of all warning categories, so nothing (deprecations, pandas/anndata copy
# warnings, ...) is shown. NOTE(review): consider narrowing the category so that
# genuine problems in the cells below are not hidden.
warnings.simplefilter(action="ignore", category=Warning)
# Seed NumPy's legacy global random number generator.
np.random.seed(0)

import os

import matplotlib
from numba import config as config_numba

# Set numba's CPU_NAME config value to "generic" (overrides the detected host
# CPU model). NOTE(review): presumably to avoid host-specific code on
# heterogeneous cluster nodes — confirm.
config_numba.CPU_NAME = "generic"

In [None]:
# Root folder of the experiment; every derived folder below is a child of it.
dir_output = "/cluster/project/treutlein/DATA/imaging/4i_Data/Brain_ECM_4i_2_v2/"

# Destination folder for combined AnnData objects (created if missing).
ann_output = Path(dir_output) / "anndata_combined"
ann_output.mkdir(parents=True, exist_ok=True)

dir_bg_subtracted = Path(dir_output) / "bg_subtracted"
ann_dir = Path(dir_output) / "anndata"

# Sample overview table; the CSV is resolved relative to the working directory.
overview = pd.read_csv("updated_brain_ECM_2.csv")

# Condition and day fused into one label (no separator).
overview["id"] = overview["Condition"] + overview["Day"].astype(str)

# Space-separated "<Number> <Block> <Condition> <Day>" label.
# NOTE(review): the column key is spelled "unqiue_id" (sic); it is kept as-is
# because other code may read it under that name.
overview["unqiue_id"] = (
    overview["Number"].astype(str) + " " + overview["Block"].astype(str)
    + " " + overview["Condition"]
    + " " + overview["Day"].astype(str)
)

# Per-sample morphometric feature tables (<sample>.csv).
morpho_dir = Path(dir_output) / "feature_tables_morphometrics_cells"

# Per-sample AnnData folders (<sample>.h5ad), one folder per compartment.
ann_dir_cytoplasma = Path(dir_output) / "anndata_mean_cytoplasma"
ann_dir_ecm_niche = Path(dir_output) / "anndata_mean_ecm_niche"
ann_dir_nuclei = Path(dir_output) / "anndata_mean_cell_nuclei"

# Sample identifiers to load, grouped by the day annotated in the original list.
# Each identifier is "R<number>_<suffix>"; the number is parsed out later and
# matched against the overview table.
samples = (
    # Day 7
    ["R066_0", "R062_0", "R073_0", "R072_0"]
    # Day 15
    + ["R026_0", "R022_0", "R024_0", "R049_0", "R055_0", "R044_0", "R057_0"]
    # Day 21
    + ["R076_1", "R087_0", "R082_0", "R108_1", "R112_1", "R113_1", "R115_0"]
)


# Stain names grouped under "ecm_genes".
# NOTE(review): "COL4A1" and "Col4A1" differ only in letter case — confirm both
# spellings are really needed.
ecm_genes = [
    "COL4A1", "Col4A1", "Col2A1", "Fibronectin", "Laminin",
    "WNT5a", "IGFBP2", "HAPLN1", "VCAN", "SFRP2",
]

# Stain names that are kept from the cytoplasm AnnData tables when the
# compartments are combined (used as a whitelist on the variable names).
cytoplamsa_membrane = [
    "Cytokeratin", "TUBB3", "pVIM", "B-cat", "Human", "ITGA5B1",
    "Vinculin", "Nes", "WNT5a", "IGFBP2", "Arl13b", "Piezo1",
    "Scribble", "FLNA", "RSPO3", "n-Cad", "CDH1", "ITGB5",
    "GPR177WLS", "DLL1", "VIM", "JAG1", "GPC3", "NUMB",
    "Vangl2", "SFRP2", "PROM1CD133",
]

# Stain names that are kept from the nuclei AnnData tables when the
# compartments are combined (used as a whitelist on the variable names).
nuclear = [
    "OTX2", "PAX37", "OCT4", "PH3S0", "SOX2", "SOX10",
    "NR2F1COUP=TFI", "TBR1", "HES1", "GSX2", "Gli3", "PAX3",
    "RAX", "Pax6", "Sox21", "HES4", "Vangl2",
]

In [None]:
# Get protein intensities from the different compartments.
#
# For each compartment (cytoplasm, nuclei):
#   1. load the per-sample AnnData file and keep objects with area > 50,
#   2. keep only objects that also appear in the morphometrics table,
#   3. annotate every object with its sample metadata and morphometric features,
#   4. concatenate all samples and keep only the whitelisted stains,
#   5. suffix variable names with the compartment and give observations a
#      sample-unique name.
# Finally the compartment tables are joined along the variable axis.
anndatas_compartements = []
# The loop variable is deliberately not called `ann_dir`: that name is a
# module-level path defined in the configuration cell and must not be clobbered.
for compartment_dir, compartement, good_stains in zip(
    [ann_dir_cytoplasma, ann_dir_nuclei],
    ["cytoplasm", "nuclei"],
    [cytoplamsa_membrane, nuclear],
):
    anndatas = []
    for sample in samples:
        # "R066_0" -> 66
        number = int(sample.split("_")[0].split("R")[1])

        # Samples without an entry in the overview table are skipped.
        sample_rows = overview[overview["Number"] == number]
        if sample_rows.empty:
            continue
        sample_meta = sample_rows.iloc[0]  # first matching row wins

        adata_org = ad.read_h5ad(Path(compartment_dir, f"{sample}.h5ad"))
        adata_org = adata_org[adata_org.obs["area"] > 50]

        df = pd.read_csv(Path(morpho_dir, f"{sample}.csv"))
        df = df[df["label"].isin(adata_org.obs["ID"])]

        # Materialise the view so that the .obs assignments below act on a real
        # AnnData object instead of relying on implicit copy-on-write.
        adata_org = adata_org[adata_org.obs["ID"].isin(df["label"])].copy()

        # Both tables must list the same objects in the same order, because the
        # morphometric columns are attached positionally below.
        assert np.array_equal(
            np.array(df["label"]), np.array(adata_org.obs["ID"])
        ), f"label/ID mismatch between morphometrics and AnnData for {sample}"

        adata_org.obs["Day"] = sample_meta["Day"]
        adata_org.obs["Condition"] = sample_meta["Condition"]
        adata_org.obs["Block"] = sample_meta["Block"]
        adata_org.obs["sample_num"] = sample_meta["Number"]
        adata_org.obs["sample"] = sample

        # Attach every morphometric column (overwrites same-named obs columns).
        for feature in df.columns:
            adata_org.obs[feature] = np.array(df[feature])

        anndatas.append(adata_org)

    if not anndatas:
        raise ValueError(
            f"No sample in `samples` matched the overview table for {compartement}"
        )

    adata = ad.concat(anndatas)
    # Keep only the stains whitelisted for this compartment; copy so that the
    # name assignments below do not operate on a view.
    adata = adata[:, adata.var_names.isin(good_stains)].copy()

    # Unique id for vars
    adata.var_names = [f"{name}_{compartement}" for name in adata.var_names]

    # Unique id for obs
    adata.obs["unique_id"] = adata.obs["ID"].astype(str) + adata.obs["sample"].astype(
        str
    )
    adata.obs_names = adata.obs["unique_id"]
    anndatas_compartements.append(adata)

# Join the compartments side by side; merge="same" keeps only annotations that
# are identical in both compartment tables.
adata = ad.concat(anndatas_compartements, axis=1, merge="same")
assert (len(cytoplamsa_membrane) + len(nuclear)) == len(
    adata.var_names
), "some whitelisted stains are missing from the combined AnnData"
adata.obs["sample_num_str"] = adata.obs["sample_num"].astype(str)

In [None]:
# Plot WLS and YAP expression as split violins (one half per condition).

# Font type 42 in the PDF/PS backends.
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["ps.fonttype"] = 42

color_pallete_perturbation = {"Matrigel": "#17ad97", "No Matrix": "#4d4d4d"}

# WLS values per object together with condition and day.
wls_time_course = pd.DataFrame(
    {
        "Log Expression": adata[:, "GPR177WLS_cytoplasm"].X[:, 0],
        "Condition": np.array(adata.obs["Condition"]),
        "Day": np.array(adata.obs["Day"]),
    }
)
# Keep Day == 15 only.
# NOTE(review): the output file name says "day_16" while the filter uses 15 —
# confirm this is intended.
# NOTE(review): the comparison is numeric; the clustering cell later casts
# obs["Day"] to str, so re-running this cell afterwards yields an empty frame.
wls_time_course = wls_time_course.loc[wls_time_course["Day"] == 15].assign(
    protein="WLS"
)

# YAP values come from a pre-computed table of another experiment.
yap_wls_time_course = pd.read_csv(
    "/cluster/project/treutlein/USERS/gutgi/gitlab/morphodynamics-of-human-brain-organoid-patterning/4i_analysis/Brain_ECM_1/DAPI/yap_mean_day_16.csv"
)
yap_wls_time_course = pd.concat([yap_wls_time_course, wls_time_course])

fig, ax = plt.subplots(figsize=(2, 2))
sns.despine(left=True, bottom=True, right=True)

violin_ax = sns.violinplot(
    data=yap_wls_time_course,
    x="protein",
    y="Log Expression",
    hue="Condition",
    split=True,
    gap=0.05,
    cut=0,
    palette=color_pallete_perturbation,
    ax=ax,
)
violin_ax.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize=16)
ax.set(ylim=(0, 8))
fig.savefig("violin_plot_Yap1_WLS_log_mean_expression_day_16.pdf", bbox_inches="tight")
# The figure is not closed here, so it stays open after the cell finishes.

In [None]:
# Plot YAP1 (pre-computed day-11 table) next to WLS as split violins.

# Font type 42 in the PDF/PS backends.
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["ps.fonttype"] = 42

color_pallete_perturbation = {"Matrigel": "#17ad97", "No Matrix": "#4d4d4d"}

# WLS values per object together with condition and day.
wls_time_course = pd.DataFrame(
    {
        "Log Expression": adata[:, "GPR177WLS_cytoplasm"].X[:, 0],
        "Condition": np.array(adata.obs["Condition"]),
        "Day": np.array(adata.obs["Day"]),
    }
)
# Keep Day == 15 only.
# NOTE(review): the axis label says "day 16" while the filter uses 15 — confirm.
# NOTE(review): the comparison is numeric; the clustering cell later casts
# obs["Day"] to str, so re-running this cell afterwards yields an empty frame.
wls_time_course = wls_time_course.loc[wls_time_course["Day"] == 15].assign(
    protein="WLS \n day 16"
)

# YAP values come from a pre-computed table of another experiment; every row
# is labelled as YAP1 day 11.
yap_wls_time_course = pd.read_csv(
    "/cluster/project/treutlein/USERS/gutgi/gitlab/morphodynamics-of-human-brain-organoid-patterning/4i_analysis/Brain_ECM_1/DAPI/yap_mean_day_11.csv"
).assign(protein="YAP1 \n day 11")
yap_wls_time_course = pd.concat([yap_wls_time_course, wls_time_course])

fig, ax = plt.subplots(figsize=(2, 2))
sns.despine(left=True, bottom=True, right=True)

violin_ax = sns.violinplot(
    data=yap_wls_time_course,
    x="protein",
    y="Log Expression",
    hue="Condition",
    split=True,
    gap=0.05,
    cut=0,
    palette=color_pallete_perturbation,
    ax=ax,
)
violin_ax.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize=16)
fig.savefig(
    "violin_plot_Yap1_day_11_WLS_day_16_log_mean_expression.svg", bbox_inches="tight"
)
plt.close(fig)

In [None]:
# Plot expression of all proteins: one split violin per day and condition,
# saved as one SVG file per variable.
color_pallete_perturbation = {"Matrigel": "#17ad97", "No Matrix": "#4d4d4d"}

# Make sure the output folder exists; savefig does not create directories.
violin_dir = Path("figures/expression_violin_plots")
violin_dir.mkdir(parents=True, exist_ok=True)

# Loop-invariant annotation columns, extracted once.
condition_values = np.array(adata.obs["Condition"])
day_values = np.array(adata.obs["Day"])

for protein in adata.var_names:
    # "Human_cytoplasm" is labelled and saved as "Human_ITGB5_cytoplasm";
    # every other variable keeps its own name.
    label = "Human_ITGB5_cytoplasm" if protein == "Human_cytoplasm" else protein
    y_column = f"Normalized expression \n {label}"

    time_course = pd.DataFrame(
        {
            y_column: adata[:, protein].X[:, 0],
            "Condition": condition_values,
            "Day": day_values,
        }
    )

    fig, ax = plt.subplots(figsize=(5, 3))
    sns.despine(left=True, bottom=True, right=True)

    sns.violinplot(
        time_course,
        x="Day",
        y=y_column,
        split=True,
        palette=color_pallete_perturbation,
        gap=0.05,
        hue="Condition",
        cut=0,
        ax=ax,
    ).legend(loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize=16)

    fig.savefig(violin_dir / f"violin_plot_{label}.svg", bbox_inches="tight")
    # Close explicitly so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Dimensionality reduction and clustering of the combined data set.
# NOTE: this cell modifies `adata` in place (clipping, scaling, Day -> str), so
# it is not idempotent; re-run the loading cell before re-running it.

# Clip every variable (column of X) to its own 5th-98th percentile range.
lower_perc = np.percentile(adata.X, 5, axis=0)
upper_perc = np.percentile(adata.X, 98, axis=0)
for col_idx, (perc_low, perc_high) in enumerate(zip(lower_perc, upper_perc)):
    adata.X[:, col_idx] = adata.X[:, col_idx].clip(perc_low, perc_high)

# Store Day as string from here on.
adata.obs["Day"] = adata.obs["Day"].astype(str)

# Scale -> PCA -> neighbour graph on the first 10 PCs -> Leiden -> UMAP.
sc.pp.scale(adata)
sc.tl.pca(adata)
sc.pl.pca_variance_ratio(adata)
sc.pp.neighbors(adata, n_pcs=10, use_rep="X_pca")
sc.tl.leiden(adata, resolution=0.9)
sc.tl.umap(adata, min_dist=0.2)

# Persist the processed object next to the other experiment outputs.
adata.write_h5ad(Path(dir_output) / "brain_ecm_4i_2_v2_mean.h5ad")