In [42]:
# Imports
import csv
import glob2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc
import squidpy as sq
import warnings
from anndata import AnnData

In [3]:
# Globals
PRJ_DIR = "/scratch/gpfs/KANG/sereno/spatialstem"
SRC_DIR = f"{PRJ_DIR}/sourcefiles"
RAW_DIR = f"{SRC_DIR}/raw"
HAD_DIR = f"{SRC_DIR}/h5ad"
INT_DIR = f"{PRJ_DIR}/intermediates"
FIG_DIR = f"{PRJ_DIR}/figs"

In [24]:
# Dann: Mapping the developing human immune system across organs
dann_dir = f"{RAW_DIR}/pub1_dann"
dann_paths = glob2.glob(f"{dann_dir}/*")
# Already formatted...

In [53]:
# Niec: Lymphatics act as a signaling hub to regulate intestinal stem cell activity
niec_dir = f"{RAW_DIR}/pub2_niec"
niec_paths = glob2.glob(f"{niec_dir}/*")
niec_paths.sort()
niec_labels = ["largeintestine1", "largeintestine2", "smallintestine1", "smallintestine2"]
for niec_path, niec_label in zip(niec_paths, niec_labels):
    # Will warn you that your variable names aren't unique, fixed below.
    with warnings.catch_warnings(action="ignore"):
        visium_in = sc.read_visium(niec_path)
    visium_in.var_names_make_unique()
    h5_out_path = f"{HAD_DIR}/p2_{niec_label}.h5ad"
    visium_in.write_h5ad(h5_out_path)
    print(f"Written: {h5_out_path}")

Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p2_largeintestine1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p2_largeintestine2.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p2_smallintestine1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p2_smallintestine2.h5ad


In [49]:
# Calvanese: Mapping human hematopoietic stem cells from hemogenic endothelium to birth
calv_dir = f"{RAW_DIR}/pub3_calvanese"
calv_paths = glob2.glob(f"{calv_dir}/*")
calv_paths.sort()
for idx, calv_path in enumerate(calv_paths):
    with warnings.catch_warnings(action="ignore"):
        visium_in = sc.read_visium(calv_path)
    visium_in.var_names_make_unique()
    h5_out_path = f"{HAD_DIR}/p3_hsc{idx+1}.h5ad"
    visium_in.write_h5ad(h5_out_path)
    print(f"Written: {h5_out_path}")

['/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-1', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-2', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-4', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-5', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-6', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-7', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub3_calvanese/HM-8']


In [90]:
# Larouche: Spatiotemporal mapping of immune and stem cell dysregulation after volumetric muscle loss
laro_dir = f"{RAW_DIR}/pub4_larouche"
laro_paths_raw = glob2.glob(f"{laro_dir}/*")
# Filters out raw RDS objects
laro_paths = [path for path in laro_paths_raw if 'GSE205707' not in path]
laro_paths.sort()
for laro_path in laro_paths:
    laro_lab = laro_path.split("/")[-1]
    visium_in = sc.read_10x_mtx(laro_path)
    coords_in = np.genfromtxt(f"{laro_path}/coords.csv", delimiter=",", dtype="int64")
    visium_in.obsm["spatial"] = coords_in
    h5_out_path = f"{HAD_DIR}/p4_{laro_lab}.h5ad"
    visium_in.write_h5ad(h5_out_path)
    print(f"Written: {h5_out_path}")

Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD0IR1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD0IR2.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD14E1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD14M1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD14M2.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD7E1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD7M1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_caD7M2.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_mmD141.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_mmD142.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_mmITD1.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/h5ad/p4_mmITD2.h5ad
Written: /scratch/gpfs/KANG/sereno/spatialstem/

In [5]:
# # Error in this dataset (Shani, P5), deprecated for now.
# # Shani: The spatio-temporal program of liver zonal regeneration
# shan_dir = f"{RAW_DIR}/pub5_shani"
# shan_paths = glob2.glob(f"{shan_dir}/*.h5")
# shan_paths.sort()
# print(shan_paths)
# # Read metadata to get visium coords
# visium_meta_dict = {}
# with open(f'{shan_dir}/Visium_Meta_data.txt', mode ='r')as meta_file:
#     shan_meta = csv.reader(meta_file)
#     next(shan_meta)
#     for line in shan_meta:
#         sample = line[1].replace("t_", "")
#         x_coord = np.int64(line[4])
#         y_coord = np.int64(line[5])
#         coords = [x_coord, y_coord]
#         if sample not in visium_meta_dict:
#             visium_meta_dict[sample] = []
#         visium_meta_dict[sample].append(coords)
# # Notice that this does not work due to the visium_in not lining up with metadata.
# for shan_path in shan_paths:
#     # Extract mouse id
#     shan_lab = shan_path.split("/")[-1].replace("Visium_", "").replace("_raw_feature_bc_matrix.h5", "")
#     # Will warn you that your variable names aren't unique, fixed below.
#     with warnings.catch_warnings(action="ignore"):
#         visium_in = sc.read_10x_h5(shan_path)
#     visium_in.var_names_make_unique()
#     coords = np.asarray(visium_meta_dict[shan_lab])
#     visium_in.obsm["spatial"] = coords
#     h5_out_path = f"{HAD_DIR}/p5_{shan_lab}.h5ad"
#     # visium_in.write_h5ad(h5_out_path)

['/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_24h_m1_raw_feature_bc_matrix.h5', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_24h_m2_raw_feature_bc_matrix.h5', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_48h_m4_raw_feature_bc_matrix.h5', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_48h_m5_raw_feature_bc_matrix.h5', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_72h_m1_raw_feature_bc_matrix.h5', '/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_72h_m2_raw_feature_bc_matrix.h5']


In [40]:
# Some example Shani objects: obj 1
shan_path = shan_paths[1]
shan_lab = shan_path.split("/")[-1].replace("Visium_", "").replace("_raw_feature_bc_matrix.h5", "")
# Will warn you that your variable names aren't unique, fixed below.
with warnings.catch_warnings(action="ignore"):
    visium_in = sc.read_10x_h5(shan_path)
visium_in.var_names_make_unique()
print(shan_path)
visium_in

/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_24h_m2_raw_feature_bc_matrix.h5


AnnData object with n_obs × n_vars = 4992 × 32285
    var: 'gene_ids', 'feature_types', 'genome'

In [41]:
# Some example Shani objects: obj 2
shan_path = shan_paths[2]
shan_lab = shan_path.split("/")[-1].replace("Visium_", "").replace("_raw_feature_bc_matrix.h5", "")
# Will warn you that your variable names aren't unique, fixed below.
with warnings.catch_warnings(action="ignore"):
    visium_in = sc.read_10x_h5(shan_path)
visium_in.var_names_make_unique()
print(shan_path)
visium_in

/scratch/gpfs/KANG/sereno/spatialstem/sourcefiles/raw/pub5_shani/Visium_48h_m4_raw_feature_bc_matrix.h5


AnnData object with n_obs × n_vars = 4991 × 32285
    var: 'gene_ids', 'feature_types', 'genome'

In [51]:
# Index of count files and spatial info should be equivalent.
bier_dir = f"{RAW_DIR}/pub6_biermann"
bier_count_paths = glob2.glob(f"{bier_dir}/*raw_counts*")
bier_count_paths.sort()
bier_coord_paths = glob2.glob(f"{bier_dir}/*spatial_info*")
bier_coord_paths.sort()
for bier_count_path, bier_coord_path in zip(bier_count_paths, bier_coord_paths):

In [76]:
zip_idx = 0
bier_count_path = bier_count_paths[zip_idx]
bier_coord_path = bier_coord_paths[zip_idx]
counts = pd.read_csv(bier_count_path, compression='gzip', index_col=0).T
coords = pd.read_csv(bier_coord_path, compression='gzip', usecols=['xcoord', 'ycoord'])

In [77]:
# GSM6025935_MBM05_rep1_slide_raw_counts.csv.gz
sample_id = bier_count_path.replace(f"{bier_dir}/GSM6025935_", "").replace("_slide_raw_counts.csv.gz", "")
sample_id

'MBM05_rep1'

In [78]:
coords

Unnamed: 0,xcoord,ycoord
0,918.0,3452.4
1,829.8,2519.6
2,1319.3,3672.5
3,1256.4,3816.7
4,673.9,2118.0
...,...,...
29531,4657.4,2585.6
29532,3364.4,3928.6
29533,2599.5,851.6
29534,3127.7,3007.6


In [79]:
counts

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,A3GALT2,A4GALT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,snoZ196
ACCACTCATTTCTC-1,0,0,0,6,0,0,0,0,0,3,...,3,0,0,2,0,2,0,0,0,0
GTTCANTCCACGTA-1,0,0,0,6,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
CCGTAGCCCGCACT-1,0,0,0,7,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
CTTAACCGGGTCTC-1,0,0,0,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
TGGCATCTTCACAC-1,0,0,0,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTGTGAACATCTGG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AGCGGACAGTGTGC-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TTGGAGGCCGAGGT-1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
ACGCGCAATCGTAG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
np.asarray(coords).shape

(29536, 2)

In [100]:
del bier_ad.obs
bier_ad

AnnData object with n_obs × n_vars = 29536 × 27854
    var: 'gene_ids', 'feature_types'
    obsm: 'spatial'

In [90]:
bier_ad = sc.AnnData(counts, counts.index.to_frame(), counts.columns.to_frame())
# Manual coordinate frame build
coords_arr = np.asarray(coords)
bier_ad.obsm["spatial"] = coords_arr
# Manual feature info frame build.
gene_names = counts.columns
feature_type_rep = ["Gene Expression"] * len(gene_names)
bier_gene_frame = pd.DataFrame(index=gene_names, data={"gene_ids": gene_names, "feature_types": feature_type_rep})
bier_ad.var = bier_gene_frame
h5_out_path = f"{HAD_DIR}/p6_{sample_id}.h5ad"
bier_ad.write_h5ad(h5_out_path)
print(f"Written: {h5_out_path}")

TypeError: 0 of type <class 'int'> is an invalid key. Should be str.

In [None]:
# Biermann: Dissecting the treatment-naive ecosystem of human melanoma brain metastasis
data = pd.read_csv('google-us-data.csv.gz', nrows=100, compression='gzip', error_bad_lines=False)