In [1]:
import numpy as np
import scipy
import pandas as pd
import scanpy as sc

from pathlib import Path


from matplotlib import rcParams

# Plotting defaults used throughout the notebook.
DPI = 300  # passed to scanpy as the resolution for saved figures
FONTSIZE = 20  # 42

# On-screen figures use dpi=100; saved figures use DPI.
sc.settings.set_figure_params(
    scanpy=True,
    dpi=100,
    dpi_save=DPI,
    transparent=True,
    vector_friendly=True,
)

# Set after the scanpy defaults so this value is the one in effect.
# 42 is matplotlib's TrueType setting for fonts embedded in PDFs.
rcParams["pdf.fonttype"] = 42

In [2]:
SAMPLE_NAME = "concat_withWu2022"  # "Wu2022_ST_LM4"

# Root folders of the two Visium collections this notebook can read from.
CRC_VISIUM_ROOT = Path(
    "/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11"
)
WU2022_ROOT = Path("/data/BCI-CRC/nasrine/data/CRC/spatial/public/Visium_Wu_2022")

# Each rule: (substring looked for in SAMPLE_NAME,
#             parent folder of the cell2location results,
#             parent folder of the per-sample gene-expression folder).
# Rules are tried in order and the first hit wins. The order keeps the
# precedence of the previous chain of independent `if` statements, where the
# last matching branch decided the final values ("CRC" > "Wu2022_ST" > concat).
SAMPLE_LAYOUTS = [
    ("CRC", CRC_VISIUM_ROOT / "cell2location", CRC_VISIUM_ROOT / "qc"),
    ("Wu2022_ST", WU2022_ROOT / "cell2location", WU2022_ROOT / "qc"),
    # the concatenated object is stored directly under the root (no "qc" level)
    ("concat_withWu2022", CRC_VISIUM_ROOT / "cell2location", CRC_VISIUM_ROOT),
]

for name_pattern, cell2loc_parent, genexp_parent in SAMPLE_LAYOUTS:
    if name_pattern in SAMPLE_NAME:
        break
else:
    # Fail here instead of hitting a NameError on `adata_genexp` further down.
    raise ValueError(
        f"SAMPLE_NAME={SAMPLE_NAME!r} does not match any known sample layout: "
        f"{[pattern for pattern, _, _ in SAMPLE_LAYOUTS]}"
    )

# cell2location results (folders are created if they do not exist yet)
DIR2LOAD = cell2loc_parent / SAMPLE_NAME
DIR2LOAD.mkdir(parents=True, exist_ok=True)

run_name = DIR2LOAD / "cell2location_map-no_cycling_TME"
run_name.mkdir(parents=True, exist_ok=True)

# load gene expression
DIR2GENEXP = genexp_parent / SAMPLE_NAME / f"{SAMPLE_NAME}_raw.h5ad"
adata_genexp = sc.read_h5ad(DIR2GENEXP)

# load joint analysis of microenvironments
MICROENV_DIR = CRC_VISIUM_ROOT / "cell2loc_spatialde2" / "concat_withWu2022"
adata_microenv = sc.read_h5ad(MICROENV_DIR / "sp_segmentation_smoothness1.2.h5ad")


In [3]:
# Keep only the microenvironment spots whose obs index also appears in the
# gene-expression object, and work on an independent copy of that subset.
shared_spots = adata_microenv.obs.index.isin(adata_genexp.obs.index)
adata_microenv = adata_microenv[shared_spots].copy()

In [4]:
# Dimensions of the microenvironment object (observations, variables).
adata_microenv.shape

(16108, 55)

In [5]:
# Attach each spot's microenvironment label to the gene-expression object.
# The labels are aligned on the obs index, so spots that are missing from
# adata_microenv receive NaN (same outcome as a left join).
# A plain column assignment is used instead of DataFrame.merge so that
# re-running this cell overwrites the column rather than producing
# `segmentation_labels_x` / `segmentation_labels_y` duplicates.
adata_genexp.obs["segmentation_labels"] = adata_microenv.obs[
    "segmentation_labels"
].reindex(adata_genexp.obs.index)

In [6]:
# Number of observations (spots) assigned to each segmentation label.
adata_genexp.obs.segmentation_labels.value_counts()

9    2666
8    2459
1    2058
4    1972
5    1572
7    1565
6    1074
3    1036
2     873
0     833
Name: segmentation_labels, dtype: int64

In [7]:
### Log-normalise the expression matrix, keeping a copy of every stage in a layer
# NOTE(review): this cell rewrites adata_genexp.X in place. Running it a second
# time on the same object would normalise already log-transformed values and
# overwrite layers["raw"] with them, so run it once per freshly loaded object.

# Stage 0: copy of X as loaded, before any transformation
adata_genexp.layers["raw"] = adata_genexp.X.copy()  # preserve counts

# Stage 1: total-count normalisation with a target sum of 1e4, applied to X in place
sc.pp.normalize_total(adata_genexp, target_sum=1e4, inplace=True)
adata_genexp.layers["normalised"] = adata_genexp.X.copy()
# Stage 2: log1p transform of the normalised values, applied to X in place
sc.pp.log1p(adata_genexp)

adata_genexp.layers["log1p"] = adata_genexp.X.copy()

# Store the current (normalised + log1p) state in .raw; the DE cell below reads it via use_raw=True
adata_genexp.raw = adata_genexp  # keep normalised log1p

In [8]:
# Categories present in the segmentation labels (the printed output shows their dtype).
adata_genexp.obs.segmentation_labels.cat.categories

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

### DE regions

In [9]:
# The segmentation labels are integers, but the marker-gene step is run on
# string labels: convert to str first, then store the result as a categorical.
adata_genexp.obs["clustering"] = (
    adata_genexp.obs["segmentation_labels"].astype("str").astype("category")
)

In [10]:
# Categories of the string-typed `clustering` column.
adata_genexp.obs.clustering.cat.categories

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='object')

In [11]:
# Differential expression of every `clustering` group against all other spots.
# Results are written to adata_genexp.uns under the key "rank_genes_wilcoxon".
wilcoxon_de_kwargs = dict(
    groupby="clustering",
    reference="rest",  # each group vs. the union of the remaining groups
    method="wilcoxon",
    corr_method="benjamini-hochberg",
    use_raw=True,  # read expression from adata_genexp.raw
    layer=None,
    pts=True,  # also report the fraction of spots expressing each gene
    key_added="rank_genes_wilcoxon",
)
sc.tl.rank_genes_groups(adata_genexp, **wilcoxon_de_kwargs)

In [12]:
### Collect the top DE genes (at most `n_genes`) for each `clustering` group
# Filters applied to the Wilcoxon results
pval_thresh = 0.05  # passed to scanpy as pval_cutoff
log2fc_thresh = 0.25  # passed to scanpy as log2fc_min
pct_cutoff = 0.1  # minimum value of `pct_nz_group` (exclusive)
n_genes = 300  # maximum number of genes kept per group

cluster_de_genes = dict()  # group -> filtered DE table
top_n_genes = dict()  # group -> list of gene names

for cluster in adata_genexp.obs.clustering.cat.categories:
    de_table = sc.get.rank_genes_groups_df(
        adata_genexp,
        group=cluster,
        key="rank_genes_wilcoxon",
        pval_cutoff=pval_thresh,
        log2fc_min=log2fc_thresh,
        log2fc_max=None,
    )
    # strongest fold changes first
    de_table = de_table.sort_values("logfoldchanges", ascending=False)
    # drop genes whose `pct_nz_group` does not exceed the cutoff
    de_table = de_table[de_table["pct_nz_group"] > pct_cutoff]

    cluster_de_genes[cluster] = de_table
    # the first n_genes rows by position (fewer if the table is shorter)
    top_n_genes[cluster] = de_table["names"].iloc[:n_genes].tolist()

In [14]:
# How many genes were kept for each group
for cluster_id, gene_list in top_n_genes.items():
    print(cluster_id, len(gene_list))

0 300
1 300
2 300
3 300
4 300
5 300
6 300
7 300
8 300
9 300


### The gene lists may not all have the same length, which is a problem for pd.DataFrame (we can pad the shorter lists with NaN values)

In [15]:
def GetMax(mydict):
    """Return ``(key, length)`` for the longest value stored in ``mydict``.

    Parameters
    ----------
    mydict : dict
        Mapping whose values support ``len()`` (e.g. lists of gene names).

    Returns
    -------
    tuple
        ``(key, len(value))`` of the entry with the greatest length. When
        several entries share the greatest length, the one with the largest
        key is returned.

    Raises
    ------
    ValueError
        If ``mydict`` is empty.
    """
    # Rank by length first. Comparing the bare (key, length) tuples would rank
    # by key and return the length of whichever key sorts last, not the
    # maximum length.
    return max(
        ((key, len(value)) for key, value in mydict.items()),
        key=lambda pair: (pair[1], pair[0]),
    )

In [16]:
# Inspect the (key, list length) tuple that GetMax returns for the gene lists.
GetMax(top_n_genes)

('9', 300)

In [17]:
# Pad every gene list with NaN up to the length of the longest list so that
# all columns have equal length when the dict is turned into a DataFrame.
# The target length is taken directly from the list lengths, so it is always
# the true maximum (0 if there are no lists at all).
max_length = max((len(genes) for genes in top_n_genes.values()), default=0)
for k, genes in top_n_genes.items():
    n_missing = max_length - len(genes)
    if n_missing > 0:
        # extend in place: the list object stored in top_n_genes[k] is padded
        genes.extend([np.nan] * n_missing)

In [18]:
# Length of every gene list after padding
for cluster_id, gene_list in top_n_genes.items():
    print(cluster_id, len(gene_list))

0 300
1 300
2 300
3 300
4 300
5 300
6 300
7 300
8 300
9 300


## save file for DE analysis 

In [19]:
# Output folder for the DE gene tables, located inside the microenvironment
# results folder; created (with parents) if it does not exist yet.
DIR2SAVE = MICROENV_DIR / "microenvs_geneexp"
DIR2SAVE.mkdir(parents=True, exist_ok=True)

In [20]:
# Display the folder holding the microenvironment results.
MICROENV_DIR

PosixPath('/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/cell2loc_spatialde2/concat_withWu2022')

In [21]:
# Display the sample name; it is used as the prefix of the saved file names.
SAMPLE_NAME

'concat_withWu2022'

In [22]:
# Preview the gene lists as a table: one column per dict key, one row per rank.
pd.DataFrame(top_n_genes)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,SULF1,ALB,IGLC1,ECRG4,MCOLN3,KRT17,ALB,CLC,RIIAD1,ALB
1,RETN,APOA1,IGKC,FBLN1,B3GALT2,MMP7,SERPINA1,GABRA2,GAL,APOC3
2,COL11A1,SERPINA1,IGLC2,MUSTN1,ASB4,SPP1,APOA1,CEACAM7,SMIM32,FGB
3,COL1A2,FGA,CXCL13,F13A1,LRRN3,C2CD4A,HAMP,LINC01559,UBE2T,APOA1
4,COL1A1,APOC3,IGHG3,SNCG,AC007493.1,DMBT1,SAA1,MIR210HG,CENPW,FGA
...,...,...,...,...,...,...,...,...,...,...
295,CXCR4,AS3MT,KCNH6,TUBB6,AC132192.2,PCDH1,CALU,PTPRN2,BUB1B,CYP4F3
296,FZD2,PLA2G5,DKK1,GNG11,FRMD1,TAP1,EIF3H,FUT3,FANCD2,KDM8
297,LBH,ASS1,NLRC5,PLVAP,COLCA1,RHPN1,AC007952.4,HOXB-AS3,TRAIP,ABCB4
298,WIPF1,OSMR,COL28A1,FOS,AL050341.2,ASIC1,HSP90AA1,ITPKA,CEP72,ACOX2


In [23]:
# Display the full key -> gene-list mapping (large output).
top_n_genes

{'0': ['SULF1',
  'RETN',
  'COL11A1',
  'COL1A2',
  'COL1A1',
  'OSM',
  'INHBA',
  'OLR1',
  'POSTN',
  'COL15A1',
  'CDH11',
  'VCAN',
  'ADAM12',
  'TNFAIP6',
  'NTM',
  'COL8A1',
  'COL10A1',
  'HOPX',
  'THBS2',
  'COL5A2',
  'GREM1',
  'EDIL3',
  'ISLR',
  'CTHRC1',
  'FNDC1',
  'COL4A2',
  'THY1',
  'IGFBP5',
  'COL4A1',
  'SPARC',
  'PDPN',
  'FAP',
  'CCL8',
  'COL5A1',
  'ITGA11',
  'OLFML2B',
  'CD248',
  'ANTXR1',
  'KIAA1755',
  'MMP11',
  'OSCAR',
  'COL3A1',
  'MMP28',
  'MXRA5',
  'CSPG4',
  'COL6A3',
  'TREM1',
  'PRRX1',
  'LAMA4',
  'TIMP1',
  'LGALS1',
  'MYL9',
  'MGP',
  'GFPT2',
  'HTRA3',
  'GPNMB',
  'PLAT',
  'ACTA2',
  'SPP1',
  'C5orf46',
  'LZTS1',
  'ADAMTS12',
  'TYMP',
  'SPOCK1',
  'IL4I1',
  'LTBP2',
  'PLAU',
  'CCL18',
  'MXRA8',
  'PLXDC1',
  'COL12A1',
  'ANGPT2',
  'EMILIN1',
  'TREM2',
  'IFI30',
  'CERCAM',
  'LOXL2',
  'MMP14',
  'NOTCH3',
  'MSR1',
  'TAGLN',
  'AEBP1',
  'SSC5D',
  'LYZ',
  'MRC2',
  'FCGR1A',
  'RAB31',
  'RGS16',
  'TGFB3'

In [24]:
# One column per dict key of top_n_genes, one row per rank.
top_n_genes_df = pd.DataFrame(top_n_genes)

# The same table is written twice (CSV and Excel) into DIR2SAVE; both files
# are prefixed with the sample name, keep the column header and omit the index.
csv_path = DIR2SAVE / f"{SAMPLE_NAME}_DE_microenvs.csv"
xlsx_path = DIR2SAVE / f"{SAMPLE_NAME}_DE_microenvs.xlsx"

top_n_genes_df.to_csv(csv_path, sep=",", header=True, index=False)
top_n_genes_df.to_excel(xlsx_path, header=True, index=False)

In [25]:
# Column labels of the gene table.
top_n_genes_df.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='object')

In [26]:
# Display the gene table that was written to disk.
top_n_genes_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,SULF1,ALB,IGLC1,ECRG4,MCOLN3,KRT17,ALB,CLC,RIIAD1,ALB
1,RETN,APOA1,IGKC,FBLN1,B3GALT2,MMP7,SERPINA1,GABRA2,GAL,APOC3
2,COL11A1,SERPINA1,IGLC2,MUSTN1,ASB4,SPP1,APOA1,CEACAM7,SMIM32,FGB
3,COL1A2,FGA,CXCL13,F13A1,LRRN3,C2CD4A,HAMP,LINC01559,UBE2T,APOA1
4,COL1A1,APOC3,IGHG3,SNCG,AC007493.1,DMBT1,SAA1,MIR210HG,CENPW,FGA
...,...,...,...,...,...,...,...,...,...,...
295,CXCR4,AS3MT,KCNH6,TUBB6,AC132192.2,PCDH1,CALU,PTPRN2,BUB1B,CYP4F3
296,FZD2,PLA2G5,DKK1,GNG11,FRMD1,TAP1,EIF3H,FUT3,FANCD2,KDM8
297,LBH,ASS1,NLRC5,PLVAP,COLCA1,RHPN1,AC007952.4,HOXB-AS3,TRAIP,ABCB4
298,WIPF1,OSMR,COL28A1,FOS,AL050341.2,ASIC1,HSP90AA1,ITPKA,CEP72,ACOX2
