In [1]:
# Reload imported modules before each cell executes, so edits to local source
# files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# One-time setup (uncomment to run). %pip installs into the kernel's own environment.
# %pip install scanpy==1.10.2

Collecting scanpy
  Downloading scanpy-1.10.2-py3-none-any.whl.metadata (9.3 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.10.8-py3-none-any.whl.metadata (6.6 kB)
Collecting h5py>=3.1 (from scanpy)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4-py3-none-any.whl.metadata (1.8 kB)
Collecting natsort (from scanpy)
  Downloading natsort-8.4.0-py3-none-any.whl.metadata (21 kB)
Collecting numba>=0.56 (from scanpy)
  Downloading numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting patsy (from scanpy)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting pynndescent>=0.5 (from scanpy)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting seaborn>=0.13 (from scanpy)
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting session-

# Blood cell data

In [1]:
import scanpy

# One sub-directory of data/blood_cell_scrna per sorted blood cell population.
cell_types = [
    "cd8_cytotoxic_t",
    "cd8_cd45ra_naive_cytotoxic_t",
    "cd14_monocytes",
    "cd19_b",
    "cd34",
    "cd4_cd25_regulatory_t",
    "cd4_cd45ra_cd25_naive_t",
    "cd4_cd45ro_memory_t",
    "cd4_helper_t",
    "cd56_natural_killer",
]

# Read every 10x matrix directory into its own AnnData, keyed by cell type.
# Insertion order follows `cell_types`, which later fixes the merge order.
data = {
    cell_type: scanpy.read_10x_mtx(f"/teamspace/studios/this_studio/embedders/data/blood_cell_scrna/{cell_type}")
    for cell_type in cell_types
}

In [2]:
# Load the landmark gene list (the restriction itself is applied in a later cell)
import pandas as pd

# Tab-separated gene-info table; only the gene symbol column is used downstream.
landmark_genes_table = pd.read_csv(
    "/teamspace/studios/this_studio/embedders/data/landmark_genes/GSE92742_Broad_LINCS_gene_info_delta_landmark.txt",
    sep="\t",
)
landmark_genes = landmark_genes_table.loc[:, "pr_gene_symbol"]

In [3]:
# Sanity check: how many landmark gene symbols appear in each dataset's var index.
# This seems about right, based on what the Tabaghi paper claimed - we get 967, they get 965

landmark_gene_set = set(landmark_genes)  # built once; identical for every dataset
for cell_type_name, cell_adata in data.items():
    n_shared = len(landmark_gene_set.intersection(cell_adata.var.index))
    print(f"{cell_type_name}\t{n_shared}")

cd8_cytotoxic_t	967
cd8_cd45ra_naive_cytotoxic_t	967
cd14_monocytes	967
cd19_b	967
cd34	967
cd4_cd25_regulatory_t	967
cd4_cd45ra_cd25_naive_t	967
cd4_cd45ro_memory_t	967
cd4_helper_t	967
cd56_natural_killer	967


In [4]:
# Restrict to landmark genes and annotate cell type

# Column-slicing an AnnData yields a view; .copy() materializes it so that the
# .obs assignment below operates on a real object instead of making anndata
# warn while it converts the view implicitly.
data_filtered = {k: v[:, v.var.index.isin(landmark_genes)].copy() for k, v in data.items()}
for k, v in data_filtered.items():
    v.obs["cell_type"] = k

# Finally, merge.
# `batch_key="cell_type"` makes concatenate write obs["cell_type"] itself. Without
# `batch_categories` it fills that column with "0", "1", ... and silently discards
# the names assigned above, so pass the names explicitly (dict order == merge order).
# Side effect: the suffix appended to obs names becomes the cell type name
# instead of the positional index.
# NOTE(review): AnnData.concatenate is deprecated upstream in favour of
# anndata.concat; migrate once the merged var/obs layout has been re-verified.
adata = scanpy.AnnData.concatenate(
    *data_filtered.values(),
    batch_key="cell_type",
    batch_categories=list(data_filtered),
)
adata

  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  v.obs["cell_type"] = k
  adata = scanpy.AnnData.concatenate(*data_filtered.values(), batch_key="cell_type")


AnnData object with n_obs × n_vars = 94655 × 967
    obs: 'cell_type'
    var: 'gene_ids'

In [7]:
# Great, that's the correct shape! Persist the merged object so later sessions
# can load it directly and skip all of the preprocessing above.

adata.write_h5ad("/teamspace/studios/this_studio/embedders/data/blood_cell_scrna/adata.h5ad")

# Lymphoma

In [2]:
import scanpy

cell_types = ["lymphoma_cells", "healthy_cells"]

# These are gzipped inside
# NOTE(review): `cache_compression` appears to control only the compression of
# scanpy's own read cache (used when cache=True), not how the inputs are
# decoded - confirm whether it is needed here.
data = {
    cell_type: scanpy.read_10x_mtx(
        f"/teamspace/studios/this_studio/embedders/data/blood_cell_scrna/{cell_type}",
        cache_compression="gzip",
    )
    for cell_type in cell_types
}

In [3]:
# Tag every cell with the name of the dataset it came from.
for k, v in data.items():
    v.obs["cell_type"] = k

# Finally, merge.
# `batch_key="cell_type"` makes concatenate write obs["cell_type"] itself. Without
# `batch_categories` it fills that column with "0"/"1" and silently discards the
# names assigned above, so pass the names explicitly (dict order == merge order).
# Side effect: the suffix appended to obs names becomes the dataset name
# instead of the positional index.
# NOTE(review): AnnData.concatenate is deprecated upstream in favour of
# anndata.concat; migrate once the merged var/obs layout has been re-verified.
adata = scanpy.AnnData.concatenate(
    *data.values(),
    batch_key="cell_type",
    batch_categories=list(data),
)
adata

# This is the correct number of points for the paper

  adata = scanpy.AnnData.concatenate(*data.values(), batch_key="cell_type")


AnnData object with n_obs × n_vars = 13410 × 1056
    obs: 'cell_type'
    var: 'gene_ids', 'feature_types'

In [4]:
# Persist the merged lymphoma/healthy object next to the blood cell data.
adata.write_h5ad("/teamspace/studios/this_studio/embedders/data/blood_cell_scrna/adata_lymphoma.h5ad")