In [None]:
# Import packages
import warnings
from pathlib import Path

import anndata as ad
import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
import skimage
import yaml

# P value
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
from phenoscapes.feature_extraction import extract_features
from phenoscapes.montage import generate_overview_montage
from phenoscapes.sc import convert_to_h5ad, plot_summary
from phenoscapes.utils import annotate_img, get_metadata, scale_image
from scipy.stats import f_oneway, fisher_exact, mannwhitneyu, ranksums
from skimage import io
from skimage.color import label2rgb
from tqdm import tqdm

# Silence every warning category (Warning is the base class) for the whole
# notebook. NOTE(review): this also hides pandas/anndata copy warnings.
warnings.simplefilter(action="ignore", category=Warning)
# Seed NumPy's legacy global random state.
np.random.seed(0)

import os

import matplotlib
from numba import config as config_numba
from statsmodels.stats.multitest import multipletests

# Override numba's CPU name setting with "generic".
# NOTE(review): presumably to avoid host-specific code generation on the
# cluster - confirm.
config_numba.CPU_NAME = "generic"

In [None]:
# --- Directories -------------------------------------------------------------
# NOTE(review): absolute cluster path; the notebook only runs where this mount
# exists.
dir_output = "/cluster/project/treutlein/DATA/imaging/4i_Data/Brain_ECM_4i_2_v2/"

# Output directory for combined AnnData objects (created if missing).
ann_output = Path(dir_output) / "anndata_combined"
ann_output.mkdir(parents=True, exist_ok=True)

# Input directories below the dataset root.
dir_bg_subtracted = Path(dir_output) / "bg_subtracted"
ann_dir = Path(dir_output) / "anndata"
morpho_dir = Path(dir_output) / "feature_tables_morphometrics_cells"
ann_dir_cytoplasma = Path(dir_output) / "anndata_mean_cytoplasma"
ann_dir_ecm_niche = Path(dir_output) / "anndata_mean_ecm_niche"
ann_dir_nuclei = Path(dir_output) / "anndata_mean_cell_nuclei"

# --- Sample overview table ---------------------------------------------------
overview = pd.read_csv("updated_brain_ECM_2.csv")
day_as_text = overview["Day"].astype(str)
# "<Condition><Day>", no separator.
overview["id"] = overview["Condition"] + day_as_text
# "<Number> <Block> <Condition> <Day>", space separated.
# NOTE(review): the column name is spelled "unqiue_id"; kept as-is because
# renaming it would change the table's schema.
overview["unqiue_id"] = (
    overview["Number"].astype(str)
    + " "
    + overview["Block"].astype(str)
    + " "
    + overview["Condition"]
    + " "
    + day_as_text
)

# --- Samples to analyse (file stems "<R-number>_<suffix>") --------------------
samples = [
    # Day 7
    "R066_0", "R062_0", "R073_0", "R072_0",
    # Day 15
    "R026_0", "R022_0", "R024_0", "R049_0", "R055_0", "R044_0", "R057_0",
    # Day 21
    "R076_1", "R087_0", "R082_0", "R108_1", "R112_1", "R113_1", "R115_0",
]

# --- Stain names kept per compartment ----------------------------------------
# NOTE(review): "COL4A1" and "Col4A1" are both listed - presumably two
# spellings of the same stain across tables; confirm.
ecm_genes = [
    "COL4A1", "Col4A1", "Col2A1", "Fibronectin", "Laminin",
    "WNT5a", "IGFBP2", "HAPLN1", "VCAN", "SFRP2",
]

cytoplamsa_membrane = [
    "Cytokeratin", "TUBB3", "pVIM", "B-cat", "Human", "ITGA5B1", "Vinculin",
    "Nes", "WNT5a", "IGFBP2", "Arl13b", "Piezo1", "Scribble", "FLNA",
    "RSPO3", "n-Cad", "CDH1", "ITGB5", "GPR177WLS", "DLL1", "VIM",
    "JAG1", "GPC3", "NUMB", "Vangl2", "SFRP2", "PROM1CD133",
]

nuclear = [
    "OTX2", "PAX37", "OCT4", "PH3S0", "SOX2", "SOX10", "NR2F1COUP=TFI",
    "TBR1", "HES1", "GSX2", "Gli3", "PAX3", "RAX", "Pax6", "Sox21",
    "HES4", "Vangl2",
]

In [None]:
# Build one AnnData holding the cytoplasm/membrane and nuclear stain
# intensities of every cell:
#   1. per compartment, load each sample, keep objects with area > 50 that also
#      appear in the morphometrics table, and attach sample metadata and
#      morphometric features to `obs`;
#   2. stack the samples, keep only the stains listed for the compartment and
#      suffix the stain names with the compartment;
#   3. join the two compartments column-wise on the shared cell ids.
anndatas_compartements = []
for ann_dir, compartement, good_stains in zip(
    [ann_dir_cytoplasma, ann_dir_nuclei],
    ["cytoplasm", "nuclei"],
    [cytoplamsa_membrane, nuclear],
):
    anndatas = []
    for sample in samples:
        # "R066_0" -> 66: integer between the leading "R" and the first "_".
        number = int(sample.split("_")[0].split("R")[1])

        # Samples without a row in the overview table are skipped.
        sample_rows = overview[overview["Number"] == number]
        if sample_rows.empty:
            continue
        sample_meta = sample_rows.iloc[0]

        adata_org = ad.read_h5ad(Path(ann_dir, f"{sample}.h5ad"))
        adata_org = adata_org[adata_org.obs["area"] > 50]
        df = pd.read_csv(Path(morpho_dir, f"{sample}.csv"))
        df = df[df["label"].isin(adata_org.obs["ID"])]

        # Keep only cells present in both tables. The explicit copy gives an
        # AnnData that owns its data, so the obs writes below do not go through
        # a view.
        adata_org = adata_org[adata_org.obs["ID"].isin(df["label"])].copy()
        # Features are copied positionally below, so both tables must list the
        # same cells in the same order. array_equal also handles a length
        # mismatch, which a plain `==` comparison does not.
        assert np.array_equal(
            np.array(df["label"]), np.array(adata_org.obs["ID"])
        ), f"morphometrics labels and AnnData IDs are misaligned for {sample}"

        for column in ("Day", "Condition", "Block"):
            adata_org.obs[column] = sample_meta[column]
        adata_org.obs["sample_num"] = sample_meta["Number"]
        adata_org.obs["sample"] = sample

        # Morphometric features, copied by position (see the assert above).
        for feature in df.columns:
            adata_org.obs[feature] = np.array(df[feature])

        anndatas.append(adata_org)

    adata = ad.concat(anndatas)
    stain_vector = [x in good_stains for x in adata.var_names]
    adata = adata[:, stain_vector].copy()

    # Unique id for vars
    adata.var_names = np.array(adata.var_names) + "_" + compartement

    # Unique id for obs: "<ID><sample>"
    adata.obs["unique_id"] = adata.obs["ID"].astype(str) + adata.obs["sample"].astype(
        str
    )
    adata.obs_names = adata.obs["unique_id"]
    anndatas_compartements.append(adata)

adata = ad.concat(anndatas_compartements, axis=1, merge="same")
# Assert that all samples are there + all proteins
assert (len(cytoplamsa_membrane) + len(nuclear)) == len(
    adata.var_names
), "not every listed stain was found in the loaded tables"
assert len(adata.obs["sample"].unique()) == len(
    samples
), "not every listed sample was loaded"

adata.obs["sample_num_str"] = adata.obs["sample_num"].astype(str)

# WLS values of the day-15 cells, shaped like the external YAP table so the two
# can be stacked.
wls_time_course = pd.DataFrame()
wls_time_course["Log Expression"] = adata[:, "GPR177WLS_cytoplasm"].X[:, 0]
wls_time_course["Condition"] = np.array(adata.obs["Condition"])
wls_time_course["Day"] = np.array(adata.obs["Day"])
# .copy() so the column assignment below targets an independent frame instead
# of a filtered slice.
wls_time_course = wls_time_course[wls_time_course["Day"] == 15].copy()
wls_time_course["protein"] = "WLS"

# NOTE(review): the YAP table's file name says day 16 while WLS is filtered to
# day 15 - confirm the two are meant to be compared.
yap_wls_time_course = pd.read_csv("Brain_ECM_1/DAPI/yap_mean_day_16.csv")

yap_wls_time_course = pd.concat([yap_wls_time_course, wls_time_course])

In [None]:
# Fig. 6b: Matrigel vs. No Matrix, tested separately for YAP1 and for WLS.
# scipy's mannwhitneyu is the Mann-Whitney U test, i.e. the Wilcoxon rank-sum
# test named in the row labels below.
in_matrigel = yap_wls_time_course["Condition"] == "Matrigel"
in_no_matrix = yap_wls_time_course["Condition"] == "No Matrix"

# YAP1: Matrigel vs. No Matrix
protein_yap = "YAP1"
is_yap = yap_wls_time_course["protein"] == protein_yap
yap_mat = yap_wls_time_course[is_yap & in_matrigel]
yap_no_mat = yap_wls_time_course[is_yap & in_no_matrix]
U_yap, p_yap = mannwhitneyu(yap_mat["Log Expression"], yap_no_mat["Log Expression"])

# WLS: Matrigel vs. No Matrix
protein_wls = "WLS"
is_wls = yap_wls_time_course["protein"] == protein_wls
wls_mat = yap_wls_time_course[is_wls & in_matrigel]
wls_no_mat = yap_wls_time_course[is_wls & in_no_matrix]
U_wls, p_wls = mannwhitneyu(wls_mat["Log Expression"], wls_no_mat["Log Expression"])

# One row per test; padj is the Benjamini-Hochberg adjustment over these two
# p-values only.
p_vals_figure_6b = pd.DataFrame(
    {"p-value": [p_yap, p_wls]},
    index=["Wilcoxon rank-sum test Yap1", "Wilcoxon rank-sum test WLS"],
)
p_vals_figure_6b["comparison"] = "Mat_NoMat"
p_vals_figure_6b["Figure"] = "Fig. 6b"
p_vals_figure_6b["padj"] = multipletests(
    p_vals_figure_6b["p-value"], method="fdr_bh"
)[1]

In [None]:
# Fig. 5e: for each protein and each day, compare Matrigel vs. No Matrix with a
# Mann-Whitney U (Wilcoxon rank-sum) test, then adjust all p-values together
# with Benjamini-Hochberg.
# Reads the `adata` built from the cytoplasm and nuclei tables (var names
# suffixed "_cytoplasm" / "_nuclei").
proteins = ["SOX10_nuclei", "SFRP2_cytoplasm", "Pax6_nuclei"]

# Per-cell annotations are the same for every protein; fetch them once.
cell_conditions = np.array(adata.obs["Condition"])
cell_days = np.array(adata.obs["Day"])

# Collect labels and p-values and build the frame once, instead of growing it
# with pd.concat on every iteration.
test_labels = []
test_p_values = []
for protein in proteins:
    time_course = pd.DataFrame()
    time_course[f"{protein}"] = adata[:, protein].X[:, 0]
    time_course["Condition"] = cell_conditions
    time_course["Day"] = cell_days

    # "SOX10_nuclei" -> "SOX10"
    protein_name = protein.split("_")[0]
    for day in np.unique(time_course["Day"]):
        time_course_day = time_course[time_course["Day"] == day]
        time_course_day_matrix = time_course_day[
            time_course_day["Condition"] == "Matrigel"
        ]
        time_course_day_no_matrix = time_course_day[
            time_course_day["Condition"] == "No Matrix"
        ]
        _, p_prot = mannwhitneyu(
            time_course_day_matrix[protein], time_course_day_no_matrix[protein]
        )
        test_labels.append(f"Wilcoxon rank-sum test {protein_name} Day {day}")
        test_p_values.append(p_prot)

p_vals_figure_5_e = pd.DataFrame({"p-value": test_p_values}, index=test_labels)
p_vals_figure_5_e["comparison"] = "Mat_NoMat"
p_vals_figure_5_e["Figure"] = "Fig. 5e"
p_vals_figure_5_e["padj"] = multipletests(
    p_vals_figure_5_e["p-value"], method="fdr_bh"
)[1]

In [None]:
# Build the AnnData of ECM stains read from the per-sample tables in
# `ann_dir_ecm_niche` (var names suffixed "_ecm"), annotated with sample
# metadata and morphometrics, then restrict it to the cells that are also
# present in the saved cytoplasm/nuclei AnnData.
anndatas_compartements = []
for ann_dir, compartement, good_stains in zip(
    [ann_dir_ecm_niche], ["ecm"], [ecm_genes]
):
    anndatas = []
    for sample in samples:
        # "R066_0" -> 66: integer between the leading "R" and the first "_".
        number = int(sample.split("_")[0].split("R")[1])

        # Samples without a row in the overview table are skipped.
        sample_rows = overview[overview["Number"] == number]
        if sample_rows.empty:
            continue
        sample_meta = sample_rows.iloc[0]

        adata_org = ad.read_h5ad(Path(ann_dir, f"{sample}.h5ad"))
        # Remove small objects (area <= 100). The explicit copy gives an
        # AnnData that owns its data, so the obs writes below do not go through
        # a view.
        adata_org = adata_org[adata_org.obs["area"] > 100].copy()
        df = pd.read_csv(Path(morpho_dir, f"{sample}.csv"))
        df = df[df["label"].isin(adata_org.obs["ID"])]

        # Features are copied positionally below, so both tables must list the
        # same cells in the same order. array_equal also handles a length
        # mismatch, which a plain `==` comparison does not.
        assert np.array_equal(
            np.array(df["label"]), np.array(adata_org.obs["ID"])
        ), f"morphometrics labels and AnnData IDs are misaligned for {sample}"

        for column in ("Day", "Condition", "Block"):
            adata_org.obs[column] = sample_meta[column]
        adata_org.obs["sample_num"] = sample_meta["Number"]
        adata_org.obs["sample"] = sample

        # Morphometric features, copied by position (see the assert above).
        for feature in df.columns:
            adata_org.obs[feature] = np.array(df[feature])

        anndatas.append(adata_org)

    adata = ad.concat(anndatas)
    stain_vector = [x in good_stains for x in adata.var_names]
    adata = adata[:, stain_vector].copy()

    # Unique id for vars
    adata.var_names = np.array(adata.var_names) + "_" + compartement

    # Unique id for obs: "<ID><sample>"
    adata.obs["unique_id"] = adata.obs["ID"].astype(str) + adata.obs["sample"].astype(
        str
    )
    adata.obs_names = adata.obs["unique_id"]

    anndatas_compartements.append(adata)
adata = ad.concat(anndatas_compartements, axis=1, merge="same")
# The stain count is deliberately not asserted here:
# assert len(ecm_genes) == len(adata.var_names)
adata.obs["sample_num_str"] = adata.obs["sample_num"].astype(str)

# Load the saved cytoplasm/nuclei AnnData and keep only cells whose unique_id
# also occurs there.
ann_path = Path(dir_output, "brain_ecm_4i_2_v2_mean.h5ad")
adata_cyto_nuc = ad.read_h5ad(ann_path)
adata = adata[adata.obs["unique_id"].isin(adata_cyto_nuc.obs["unique_id"])]

In [None]:
# Fig. 4k (extracellular part): for each ECM stain and each condition, run a
# one-way ANOVA across days 7, 15 and 21.
# Reads the `adata` whose var names carry the "_ecm" suffix.
proteins = ["COL4A1_ecm", "Col2A1_ecm", "Laminin_ecm"]

# Collect one record per test and build the frame once, instead of growing it
# with pd.concat on every iteration.
anova_labels = []
anova_rows = []
for protein in proteins:
    time_course = pd.DataFrame()
    time_course[f"{protein}"] = adata[:, protein].X[:, 0]
    time_course["Condition"] = np.array(adata.obs["Condition"])
    time_course["Day"] = np.array(adata.obs["Day"])

    # "COL4A1_ecm" -> "COL4A1"
    protein_name = protein.split("_")[0]
    for condition in tqdm(np.unique(time_course["Condition"])):
        time_course_condition = time_course[time_course["Condition"] == condition]

        # One group of values per day.
        data_by_day = [
            time_course_condition.loc[time_course_condition["Day"] == day, protein]
            for day in [7, 15, 21]
        ]

        # Perform one-way ANOVA
        F_stat, p_anova = f_oneway(*data_by_day)

        anova_labels.append(f"Anova extracellular {protein_name}")
        anova_rows.append(
            [
                p_anova,
                f"Day_7_15_21_{condition}",
                # n = number of cells over all three days
                np.hstack(data_by_day).shape[0],
            ]
        )

anova_p_vals_figure_4_k = pd.DataFrame(
    anova_rows,
    index=anova_labels,
    columns=["p-value", "comparison", "n"],
)

In [None]:
# Build the AnnData of ECM stains read from the per-sample tables in
# `ann_dir_cytoplasma` (var names suffixed "_cellular_ecm"), annotated with
# sample metadata and morphometrics, then restrict it to the cells that are
# also present in the saved cytoplasm/nuclei AnnData.
anndatas_compartements = []
for ann_dir, compartement, good_stains in zip(
    [ann_dir_cytoplasma], ["cellular_ecm"], [ecm_genes]
):
    anndatas = []
    for sample in samples:
        # "R066_0" -> 66: integer between the leading "R" and the first "_".
        number = int(sample.split("_")[0].split("R")[1])

        # Samples without a row in the overview table are skipped.
        sample_rows = overview[overview["Number"] == number]
        if sample_rows.empty:
            continue
        sample_meta = sample_rows.iloc[0]

        adata_org = ad.read_h5ad(Path(ann_dir, f"{sample}.h5ad"))
        # Remove small objects (area <= 100). The explicit copy gives an
        # AnnData that owns its data, so the obs writes below do not go through
        # a view.
        adata_org = adata_org[adata_org.obs["area"] > 100].copy()
        df = pd.read_csv(Path(morpho_dir, f"{sample}.csv"))
        df = df[df["label"].isin(adata_org.obs["ID"])]

        # Features are copied positionally below, so both tables must list the
        # same cells in the same order. array_equal also handles a length
        # mismatch, which a plain `==` comparison does not.
        assert np.array_equal(
            np.array(df["label"]), np.array(adata_org.obs["ID"])
        ), f"morphometrics labels and AnnData IDs are misaligned for {sample}"

        for column in ("Day", "Condition", "Block"):
            adata_org.obs[column] = sample_meta[column]
        adata_org.obs["sample_num"] = sample_meta["Number"]
        adata_org.obs["sample"] = sample

        # Morphometric features, copied by position (see the assert above).
        for feature in df.columns:
            adata_org.obs[feature] = np.array(df[feature])

        anndatas.append(adata_org)

    adata = ad.concat(anndatas)
    stain_vector = [x in good_stains for x in adata.var_names]
    adata = adata[:, stain_vector].copy()

    # Unique id for vars
    adata.var_names = np.array(adata.var_names) + "_" + compartement

    # Unique id for obs: "<ID><sample>"
    adata.obs["unique_id"] = adata.obs["ID"].astype(str) + adata.obs["sample"].astype(
        str
    )
    adata.obs_names = adata.obs["unique_id"]

    anndatas_compartements.append(adata)
adata = ad.concat(anndatas_compartements, axis=1, merge="same")
# The stain count is deliberately not asserted here:
# assert len(ecm_genes) == len(adata.var_names)
adata.obs["sample_num_str"] = adata.obs["sample_num"].astype(str)

# Load the saved cytoplasm/nuclei AnnData and keep only cells whose unique_id
# also occurs there.
ann_path = Path(dir_output, "brain_ecm_4i_2_v2_mean.h5ad")
adata_cyto_nuc = ad.read_h5ad(ann_path)
adata = adata[adata.obs["unique_id"].isin(adata_cyto_nuc.obs["unique_id"])]

In [None]:
# Fig. 4k (cellular part): same one-way ANOVA across days 7, 15 and 21 as for
# the extracellular stains, run on the "_cellular_ecm" var names. The rows are
# appended to the existing `anova_p_vals_figure_4_k`, and the
# Benjamini-Hochberg adjustment is computed over the combined table.
# Re-running this cell without rebuilding `anova_p_vals_figure_4_k` first
# appends the cellular rows again.
proteins = ["COL4A1_cellular_ecm", "Col2A1_cellular_ecm", "Laminin_cellular_ecm"]

# Collect one record per test and append them with a single concat.
cellular_labels = []
cellular_rows = []
for protein in proteins:
    time_course = pd.DataFrame()
    time_course[f"{protein}"] = adata[:, protein].X[:, 0]
    time_course["Condition"] = np.array(adata.obs["Condition"])
    time_course["Day"] = np.array(adata.obs["Day"])

    # "COL4A1_cellular_ecm" -> "COL4A1"
    protein_name = protein.split("_")[0]
    for condition in tqdm(np.unique(time_course["Condition"])):
        time_course_condition = time_course[time_course["Condition"] == condition]

        # One group of values per day.
        data_by_day = [
            time_course_condition.loc[time_course_condition["Day"] == day, protein]
            for day in [7, 15, 21]
        ]

        # Perform one-way ANOVA
        F_stat, p_anova = f_oneway(*data_by_day)

        # Labelled "cellular": these values come from the cellular_ecm
        # compartment and must stay distinguishable from the
        # "Anova extracellular ..." rows in the exported table.
        cellular_labels.append(f"Anova cellular {protein_name}")
        cellular_rows.append(
            [
                p_anova,
                f"Day_7_15_21_{condition}",
                # n = number of cells over all three days
                np.hstack(data_by_day).shape[0],
            ]
        )

anova_p_vals_figure_4_k = pd.concat(
    [
        anova_p_vals_figure_4_k,
        pd.DataFrame(
            cellular_rows,
            index=cellular_labels,
            columns=["p-value", "comparison", "n"],
        ),
    ]
)
anova_p_vals_figure_4_k["Figure"] = "Fig. 4k"
anova_p_vals_figure_4_k["padj"] = multipletests(
    anova_p_vals_figure_4_k["p-value"], method="fdr_bh"
)[1]

In [None]:
# Load the combined AnnData (path relative to the working directory) and count
# cells per annotated cluster and condition.
adata = ad.read_h5ad("brain_organoid_4i_ecm_cytoplasm_nuclei.h5ad")
var_names = np.array(adata.var_names)

# Define colors
color_volume_midnight_blue = matplotlib.colors.LinearSegmentedColormap.from_list(
    "", ["#c9c7c7", "#191970"]
)
color_pallete_perturbation = {"Matrigel": "#17ad97", "No Matrix": "#4d4d4d"}
# One hue per distinct value of obs["leiden"].
n_leiden_values = len(np.unique(adata.obs["leiden"]))
colors = sns.color_palette("hls", n_leiden_values)

# Row order of the table; labels missing from the data become all-NaN rows
# after reindexing.
cluster_order = [
    "Unknown",
    "N. Epi.",
    "Pros. Prog.",
    "NC neurons",
    "NCCs",
    "Non-Tel. Prog.",
    "Non-Tel. neurons",
    "Die. Prog.",
    "Tel. Prog.",
]

# Rows: cluster annotation, columns: condition.
condition_by_cluster = pd.crosstab(
    adata.obs["Condition"],
    adata.obs["Cluster_annotations_col"],
)
cross_tab = condition_by_cluster.T.reindex(cluster_order)

In [None]:
# Fisher's exact test per cluster (Fig. 4h): cells in this cluster vs. cells in
# all other clusters, split by Matrigel / No Matrix.
total_matrigel = cross_tab["Matrigel"].sum()
total_no_matrix = cross_tab["No Matrix"].sum()

results = []
for index, n_matrigel, n_no_matrix in zip(
    cross_tab.index, cross_tab["Matrigel"], cross_tab["No Matrix"]
):
    # 2x2 table: [this cluster, every other cluster] x [Matrigel, No Matrix]
    contingency_table = [
        [n_matrigel, n_no_matrix],
        [total_matrigel - n_matrigel, total_no_matrix - n_no_matrix],
    ]

    odds_ratio, p_value = fisher_exact(contingency_table)

    results.append({"Region": f"{index}", "Odds Ratio": odds_ratio, "p-value": p_value})

results_df = pd.DataFrame(results)
# Benjamini-Hochberg adjustment across all clusters.
results_df["padj"] = multipletests(results_df["p-value"], method="fdr_bh")[1]
results_df = results_df.assign(comparison="Mat_NoMat", Figure="Fig. 4h")

results_df.to_csv("fig_4_h.csv")

In [None]:
# Stack the p-value tables of Fig. 4k, 5e and 6b, rename stain names in the row
# labels, and export the selected columns.
p_vals_figures = pd.concat(
    [anova_p_vals_figure_4_k, p_vals_figure_5_e, p_vals_figure_6b]
)

# Name used in the row labels -> name written to the CSV. Applied in this
# order as regex substitutions on the index.
label_renames = {
    r"Laminin": "LAMC3",
    r"Col2A1": "COL2A1",
    r"Pax6": "PAX6",
    r"Yap1": "YAP1",
}
for pattern, replacement in label_renames.items():
    p_vals_figures.index = p_vals_figures.index.str.replace(
        pattern, replacement, regex=True
    )

# Only these columns are exported (the ANOVA-only "n" column is left out).
exported_columns = ["p-value", "padj", "comparison", "Figure"]
p_vals_figures = p_vals_figures[exported_columns]
p_vals_figures.to_csv("p_values_figures.csv")