In [1]:
import anndata
import scanpy as sc
import squidpy as sq
import pandas as pd
from pathlib import Path
import scipy

# QC utils functions - a package
from vistools import utils

## Read in Ozato's data, which is formatted as CSV (together with the tissue positions, images, etc.), and save it in the format we need for analysis

**Workflow**

1. Create `anndata` from `.csv` file
    1. Transpose matrix
    2. Isolate barcodes, gene ids
    3. Get gene names for gene ids
    4. Get sparse matrix of counts
2. Format into a spatial (Visium-like) object

In [2]:
# Root folder of the dataset; each sample is read from its own sub-folder (DIR2LOAD / SAMPLE).
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
DIR2LOAD = Path("/data/BCI-CRC/nasrine/data/CRC/spatial/public/Visium_Ozato_2023/")

In [3]:
# Name of the sample sub-folder under DIR2LOAD to convert; it is also used to build the
# input/output file names and as the library id. Change it to process another sample.
SAMPLE = "A1_colorectal_cancer_1"

In [4]:
# Preview the full path of the count-matrix CSV for the selected sample
DIR2LOAD / SAMPLE / f"matrix_{SAMPLE}.csv"

PosixPath('/data/BCI-CRC/nasrine/data/CRC/spatial/public/Visium_Ozato_2023/A1_colorectal_cancer_1/matrix_A1_colorectal_cancer_1.csv')

In [5]:
# Output folder for the converted sample: <DIR2LOAD>/raw/<SAMPLE>
DIR2SAVE = DIR2LOAD / "raw" / SAMPLE
# Create it (and any missing parents); do nothing if it is already there
DIR2SAVE.mkdir(exist_ok=True, parents=True)

### Let's load some data!

In [6]:
# Count matrix as shipped: the first CSV column (gene IDs) becomes the row index,
# the remaining columns are spot barcodes.
matrix_csv = DIR2LOAD / SAMPLE / f"matrix_{SAMPLE}.csv"
matrix_df = pd.read_csv(matrix_csv, index_col=0)
matrix_df.head()

Unnamed: 0,AAACACCAATAACTGC-1,AAACAGGGTCTATATT-1,AAACATTTCCCGGATT-1,AAACCCGAACGAAATC-1,AAACCGGAAATGTTAA-1,AAACCGGGTAGGTACC-1,AAACCTAAGCAGCCGG-1,AAACCTCATGAAGTTG-1,AAACGAAGATGGAGTA-1,AAACGCTGGGCACGAC-1,...,TTGTACACCTCGAACA-1,TTGTCACCGCGGTATC-1,TTGTGAACCTAATCCG-1,TTGTGATCTGTTCAGT-1,TTGTGCAGCCACGTCA-1,TTGTGCGGAAGCGGAT-1,TTGTGGTGGTACTAAG-1,TTGTGTATGCCACCAA-1,TTGTGTTTCCCGAAAG-1,TTGTTTCACATCCAGG-1
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000186092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000239945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 1. Create `anndata` from `.csv` file 
---

### A. Here the rows are the genes and the columns are the spots. For our analysis we will need to transpose this! 

In [7]:
# Flip to spots (rows) x genes (columns)
matrix_df_t = matrix_df.T
matrix_df_t.head()

Unnamed: 0,ENSG00000243485,ENSG00000237613,ENSG00000186092,ENSG00000238009,ENSG00000239945,ENSG00000239906,ENSG00000241860,ENSG00000241599,ENSG00000286448,ENSG00000236601,...,ENSG00000274175,ENSG00000275869,ENSG00000273554,ENSG00000278782,ENSG00000277761,ENSG00000277836,ENSG00000278633,ENSG00000276017,ENSG00000278817,ENSG00000277196
AAACACCAATAACTGC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACAGGGTCTATATT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATTTCCCGGATT-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCCGAACGAAATC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACCGGAAATGTTAA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### B. Need to isolate genes and spot ids separately 

#### genes

In [8]:
# Gene IDs are the column labels of the transposed matrix; keep them, in matrix order,
# as a one-column table with a default 0..n-1 index.
myvar = matrix_df_t.columns
myvar_df = pd.DataFrame({"ensembl_gene_id": myvar})
myvar_df

Unnamed: 0,ensembl_gene_id
0,ENSG00000243485
1,ENSG00000237613
2,ENSG00000186092
3,ENSG00000238009
4,ENSG00000239945
...,...
36596,ENSG00000277836
36597,ENSG00000278633
36598,ENSG00000276017
36599,ENSG00000278817


In [9]:
# Check that LGR5 is in the data; ENSG00000139292 is its Ensembl gene ID
is_lgr5 = myvar_df["ensembl_gene_id"] == 'ENSG00000139292'
myvar_df.loc[is_lgr5]

Unnamed: 0,ensembl_gene_id
22122,ENSG00000139292


In [10]:
# Same check as a plain True/False answer
"ENSG00000139292" in set(myvar_df["ensembl_gene_id"])

True

#### barcodes 

In [11]:
# Spot barcodes are the row labels of the transposed matrix
mybarcodes = matrix_df_t.index
# One row per spot, in matrix order
mybarcodes_df = pd.DataFrame({"barcode": mybarcodes})
# Tag every spot with the sample it comes from (the scalar is broadcast to all rows)
mybarcodes_df["Sample"] = str(SAMPLE)
mybarcodes_df

Unnamed: 0,barcode,Sample
0,AAACACCAATAACTGC-1,A1_colorectal_cancer_1
1,AAACAGGGTCTATATT-1,A1_colorectal_cancer_1
2,AAACATTTCCCGGATT-1,A1_colorectal_cancer_1
3,AAACCCGAACGAAATC-1,A1_colorectal_cancer_1
4,AAACCGGAAATGTTAA-1,A1_colorectal_cancer_1
...,...,...
2267,TTGTGCGGAAGCGGAT-1,A1_colorectal_cancer_1
2268,TTGTGGTGGTACTAAG-1,A1_colorectal_cancer_1
2269,TTGTGTATGCCACCAA-1,A1_colorectal_cancer_1
2270,TTGTGTTTCCCGAAAG-1,A1_colorectal_cancer_1


In [12]:
# Use the barcode as the row index (the "barcode" column is consumed by this)
mybarcodes_df = mybarcodes_df.set_index("barcode")

### C. Get gene names using biomart from ensembl_gene_id attribute

In [15]:
import pybiomart  # not referenced by name below; presumably kept so a missing install fails right here — TODO confirm

# Download the Ensembl gene ID -> gene name table from BioMart.
# NOTE(review): the query goes to the GRCh37 archive host. Further down, about 4,200 of the
# ~36,600 gene IDs in this matrix end up without a name from this table and are dropped.
# If the count matrix was produced against a GRCh38 reference, querying www.ensembl.org
# (which the author noted as an alternative) would likely name more of them — confirm the
# genome build before changing the host, since gene names differ between builds.
biomart_attrs = ["ensembl_gene_id", "external_gene_name"]  # "hgnc_symbol" was noted as a possible extra attribute
genes_df = sc.queries.biomart_annotations(
    org="hsapiens",
    attrs=biomart_attrs,
    host="grch37.ensembl.org",
)
genes_df.head()

Unnamed: 0,ensembl_gene_id,external_gene_name
0,ENSG00000261657,SLC25A26
1,ENSG00000223116,AL157931.1
2,ENSG00000233440,HMGA1P6
3,ENSG00000207157,RNY3P4
4,ENSG00000229483,LINC00362


In [16]:
# Keep only BioMart rows that actually carry a gene name
genes_df = genes_df.dropna(subset=["external_gene_name"])

In [17]:
# Look at the table again after removing the rows without a name
genes_df.head()

Unnamed: 0,ensembl_gene_id,external_gene_name
0,ENSG00000261657,SLC25A26
1,ENSG00000223116,AL157931.1
2,ENSG00000233440,HMGA1P6
3,ENSG00000207157,RNY3P4
4,ENSG00000229483,LINC00362


#### need to merge `myvar_df` with `genes_df` to get gene names for the genes that are in this visium slide

In [18]:
# Attach gene names to the gene IDs of this slide.
# - how="left" keeps every gene ID of the count matrix, in matrix order; IDs missing from
#   the BioMart table get NaN as name and are removed later, once the counts are attached.
# - validate="many_to_one" makes pandas raise a MergeError if the BioMart table lists an ID
#   more than once. Without it such duplicates would silently add rows, and `var_adata`
#   would no longer line up one-to-one with the columns of the count matrix.
var_adata = myvar_df.merge(
    right=genes_df,
    how="left",
    on="ensembl_gene_id",
    validate="many_to_one",
)
var_adata

Unnamed: 0,ensembl_gene_id,external_gene_name
0,ENSG00000243485,MIR1302-10
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,RP11-34P13.7
4,ENSG00000239945,RP11-34P13.8
...,...,...
36596,ENSG00000277836,
36597,ENSG00000278633,
36598,ENSG00000276017,
36599,ENSG00000278817,


#### Problem: after the merge, some of our gene IDs do not have a gene name. Should we just remove the ones with no name? Yes, but later rather than now: the counts are not attached to the genes yet, so we would not know which count columns to remove

#### Check that we retain the marker genes

In [19]:
# Marker genes we expect to find among the named genes
marker_genes = [
    "LGR5", "ASCL2", "SMOC2", "KRT7", "KRT17", "PLAUR", "MMP7", "ANXA1",
    "LAMC2", "IFI6", "IFI44", "L1CAM", "EMP1",
    "TFF3", "MUC2", "ATOH1", "FABP1", "CA2", "SLC26A3",
]
# Show the rows whose gene name is one of the markers
is_marker = var_adata["external_gene_name"].isin(marker_genes)
var_adata[is_marker]

Unnamed: 0,ensembl_gene_id,external_gene_name
631,ENSG00000126709,IFI6
1354,ENSG00000137965,IFI44
2599,ENSG00000058085,LAMC2
4368,ENSG00000163586,FABP1
8643,ENSG00000172238,ATOH1
12970,ENSG00000112562,SMOC2
14094,ENSG00000091138,SLC26A3
15599,ENSG00000104267,CA2
16691,ENSG00000135046,ANXA1
18978,ENSG00000198788,MUC2


### D. Convert the counts to a sparse matrix 

In [20]:
# Store the transposed count table (spots as rows, genes as columns) as a SciPy CSR sparse
# matrix; row/column labels are not carried over, so the order must match the barcode and
# gene tables built from the same frame.
counts = scipy.sparse.csr_matrix(matrix_df_t)

### E. Create anndata from sparse count matrix, barcodes, genes 

In [22]:
# Assemble the AnnData object: X = sparse counts (spots x genes), obs = per-spot table
# indexed by barcode, var = per-gene table with one row per column of `counts`, same order.

# `var_adata` still carries pandas' default integer index. AnnData works with string
# indices and would otherwise convert it implicitly while emitting a warning, so do the
# conversion explicitly on a copy and leave `var_adata` itself untouched.
var_for_adata = var_adata.copy()
var_for_adata.index = var_for_adata.index.astype(str)

# Everything is matched by position, so fail early with a readable message on a mismatch.
assert counts.shape == (len(mybarcodes_df), len(var_for_adata)), (
    f"counts has shape {counts.shape}, but there are {len(mybarcodes_df)} barcodes "
    f"and {len(var_for_adata)} genes"
)

# `asview=False` only restated the constructor default, so it is left out.
adata = anndata.AnnData(X=counts, obs=mybarcodes_df, var=var_for_adata)

  adata = anndata.AnnData(X=counts, var=var_adata, obs=mybarcodes_df, asview=False)


#### Drop the counts that don't have a gene name

In [28]:
# Keep only the genes (columns) that received a gene name; .copy() turns the resulting
# view into an independent object.
has_gene_name = ~adata.var["external_gene_name"].isna()
adata = adata[:, has_gene_name].copy()

In [29]:
# (number of spots, number of genes) left after removing the genes without a name
adata.shape

(2272, 32374)

In [30]:
# Index .var by gene name; the "external_gene_name" column becomes the index and
# "ensembl_gene_id" stays as a regular column.
adata.var = adata.var.set_index("external_gene_name", drop=True)

In [34]:
# Make the gene names used as the .var index unique
adata.var_names_make_unique()
# NOTE(review): an earlier remark here said that duplicates remained afterwards; the two
# duplicate checks in the following cells come back empty, so that remark looks outdated —
# confirm and remove.
# Left disabled by the author (adata_st does not exist yet at this point):
# adata_st.var_names_make_unique()

In [35]:
# An empty table here means no gene name occurs more than once in .var
repeated_rows = adata.var.index.duplicated(keep=False)
adata.var[repeated_rows]

Unnamed: 0_level_0,ensembl_gene_id
external_gene_name,Unnamed: 1_level_1


In [36]:
# Same check on var_names directly; an empty Index means every name is unique
repeated_names = adata.var_names.duplicated(keep=False)
adata.var_names[repeated_names]

Index([], dtype='object', name='external_gene_name')

# 2. Start creating a visium-like object for scanpy, squidpy etc 
---

In [37]:
# Build the Visium-like object from the sample folder. The helper receives an independent
# copy, so `adata` itself is not modified.
# (Presumably the helper reads the tissue positions and images found under `path` — see
# vistools.utils.adata_2_visium to confirm.)
sample_dir = DIR2LOAD / SAMPLE
adata_st = utils.adata_2_visium(
    adata=adata.copy(),
    path=sample_dir,
    library_id=SAMPLE,
)

In [38]:
# Show a summary of the spatial object to check which obs / var / uns / obsm entries it has
adata_st

AnnData object with n_obs × n_vars = 2272 × 32374
    obs: 'Sample', 'in_tissue', 'array_row', 'array_col'
    var: 'ensembl_gene_id'
    uns: 'spatial'
    obsm: 'spatial'

# Save to file

In [45]:
# Write the formatted sample to <DIR2SAVE>/<SAMPLE>_raw.h5ad
output_file = DIR2SAVE / f"{SAMPLE}_raw.h5ad"
adata_st.write(output_file)