In [72]:

# usage
# python scripts/process/regression_model.py --output cellbender
# python scripts/process/regression_model.py --output cellranger

import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
from numpy.random import default_rng
import os
import matplotlib.pyplot as plt
from pathlib import Path
import re

# this line forces theano to use the GPU and should go before importing cell2location
os.environ["THEANO_FLAGS"] = 'device=cuda0,floatX=float32,force_device=True'

import argparse

from easydict import EasyDict as edict

In [73]:
# Notebook stand-in for the `--output` CLI flag of scripts/process/regression_model.py.
# Valid values are "cellbender" and "cellranger"; change the value here to switch pipeline.
args = edict({"output": "cellranger"})
# name of the obs column that carries the sample identifier
sample_id = "sample_id"
args

{'output': 'cellranger'}

In [74]:
# set up relative paths within the project
# `_dh` is IPython's directory history; entry 0 is used as the notebook folder.
# Wrapping it in Path keeps the `/` joins below working even when IPython
# hands back a plain string instead of a Path object.
current_folder = Path(globals()['_dh'][0])
sc_dir = current_folder / ".." / ".." / "data" / "prc" / "sc"

# Per-pipeline inputs: (folder holding one sub-folder per sample, matrix file name).
raw_inputs = {
    "cellbender": (sc_dir / "cellbender", "cell_bender_matrix_filtered.h5"),
    "cellranger": (current_folder / ".." / ".." / "data" / "raw" / "sc", "filtered_feature_bc_matrix.h5"),
}
if args.output not in raw_inputs:
    raise ValueError("output must be either 'cellbender' or 'cellranger'")
raw_input_dir, matrix_file = raw_inputs[args.output]

# annotated object produced upstream for the selected pipeline
adata_annotated = sc.read_h5ad(sc_dir / f"annotated_{args.output}_mod.h5ad")

# every non-hidden entry of the input folder is treated as one sample
samples = [sample for sample in os.listdir(raw_input_dir) if not sample.startswith(".")]
adata_objects = {}
for sample in samples:
    adata = sc.read_10x_h5(raw_input_dir / sample / matrix_file)
    adata.var_names_make_unique()
    # prefix barcodes with the sample name so they stay unique after concatenation
    adata.obs_names = [f"{sample}_{cell}" for cell in adata.obs_names]
    adata_objects[sample] = adata
# the sample name of each cell is stored in the obs column named by `sample_id`
adata_raw = sc.concat(list(adata_objects.values()), join="outer", label=sample_id, keys=list(adata_objects.keys()))
del adata_objects

output_dir = sc_dir / "c2l_model" / args.output
output_dir.mkdir(parents=True, exist_ok=True)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [75]:
# Rebuild the annotated obs names as "<sample>_<barcode>", dropping the trailing
# "-<digits>" suffix of each existing name first (same prefix scheme as adata_raw).
suffix_pattern = re.compile('-[0-9]+$')
adata_annotated.obs_names = [
    f"{sid}_{suffix_pattern.sub('', barcode)}"
    for sid, barcode in zip(adata_annotated.obs[sample_id], adata_annotated.obs_names)
]
# show both ends of the index as a quick sanity check
print(adata_annotated.obs_names[:6])
print(adata_annotated.obs_names[-6:])

Index(['CO37_AAACCCAAGTCTGCGC-1', 'CO37_AAACCCACACCACATA-1',
       'CO37_AAACCCACATTCCTAT-1', 'CO37_AAACCCACATTGTCGA-1',
       'CO37_AAACCCATCTCCGATC-1', 'CO37_AAACCCATCTCTAGGA-1'],
      dtype='object')
Index(['MS586_TTTGGTTTCATTTCCA-1', 'MS586_TTTGTTGAGGTAAGGA-1',
       'MS586_TTTGTTGAGTGACCTT-1', 'MS586_TTTGTTGGTCCACTCT-1',
       'MS586_TTTGTTGGTGTCCTAA-1', 'MS586_TTTGTTGTCTAGAGCT-1'],
      dtype='object')


In [76]:
# Same head/tail preview for the raw object's obs names.
raw_names = adata_raw.obs_names
print(raw_names[:6])
print(raw_names[-6:])

Index(['MS377I_AAACCCAAGTGCAGCA-1', 'MS377I_AAACCCACATGAAGCG-1',
       'MS377I_AAACCCAGTCACCACG-1', 'MS377I_AAACCCATCAGACCGC-1',
       'MS377I_AAACCCATCAGTCATG-1', 'MS377I_AAACGAAAGCACTAAA-1'],
      dtype='object')
Index(['MS497T_TTTGGTTTCAAAGAAC-1', 'MS497T_TTTGGTTTCCGTGTAA-1',
       'MS497T_TTTGTTGCATCGCTGG-1', 'MS497T_TTTGTTGGTCTTTATC-1',
       'MS497T_TTTGTTGGTGGGTCAA-1', 'MS497T_TTTGTTGGTGTTTGCA-1'],
      dtype='object')


In [77]:
# every annotated cell must also be present in the raw data
missing_from_raw = set(adata_annotated.obs_names) - set(adata_raw.obs_names)
assert not missing_from_raw, "The annotated adata is not a subset of the raw adata"

In [78]:
# sample-level metadata from the "snRNA-seq" sheet of the project workbook
sample_meta = pd.read_excel(current_folder / ".." / ".." / "data" / "Metadata_all.xlsx", sheet_name="snRNA-seq")

# (group label, metadata column the label is matched against), in output order
group_specs = [
    ("MS", "Condition"),
    ("Control", "Condition"),
    ("CA", "lesion_type"),
    ("CI", "lesion_type"),
    ("A", "lesion_type"),
]
# map each group label to the Series of sample ids whose column equals that label
cond_dict = {
    label: sample_meta.loc[sample_meta[column] == label, "sample_id"]
    for label, column in group_specs
}
cond_dict

{'MS': 0      MS197
 1      MS229
 2     MS371N
 3     MS377N
 4     MS377I
 5     MS377T
 6      MS411
 7      MS466
 8     MS497I
 9     MS497T
 10    MS549H
 11    MS549T
 12     MS586
 Name: sample_id, dtype: object,
 'Control': 13    CO37
 14    CO40
 15    CO45
 16    CO41
 17    CO74
 18    CO85
 Name: sample_id, dtype: object,
 'CA': 0     MS197
 1     MS229
 3    MS377N
 4    MS377I
 5    MS377T
 6     MS411
 Name: sample_id, dtype: object,
 'CI': 7      MS466
 8     MS497I
 9     MS497T
 10    MS549H
 11    MS549T
 Name: sample_id, dtype: object,
 'A': 2     MS371N
 12     MS586
 Name: sample_id, dtype: object}

In [79]:
# the metadata sheet and both AnnData objects must cover exactly the same samples
expected_samples = set(sample_meta.sample_id)
assert expected_samples == set(adata_raw.obs[sample_id]), "Samples are missing from the raw adata"
assert expected_samples == set(adata_annotated.obs[sample_id]), "Samples are missing from the annotated adata"

In [80]:
# transfer the annotation
adata_raw = adata_raw[adata_annotated.obs_names, :]
adata_raw.obs = adata_annotated.obs
adata_raw

AnnData object with n_obs × n_vars = 131464 × 36601
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'predicted_doublet', 'diss_score', 'patient_id', 'sample_id', 'Condition', 'lesion_type', 'Age', 'Sex', 'RIN', 'Batch', 'visium', 'snRNA-seq', 'batch', 'leiden', 'cell_types'

In [87]:
# Build the per-condition subset and output folder for each regression model.
# NOTE: this cell only prepares and reports the inputs; no model is fitted here.
for condition, samples in cond_dict.items():

    print(condition)
    # keep only the cells whose sample belongs to this condition, as an independent copy
    adata = adata_raw[adata_raw.obs[sample_id].isin(samples), :].copy()
    print(adata)
    # use the configured column name rather than a hard-coded attribute
    print(adata.obs[sample_id].unique())
    tmp_out = output_dir / f"{condition}_reg_model"
    print(f"Running regression model for {condition}, saving in {tmp_out}")

MS
AnnData object with n_obs × n_vars = 100849 × 36601
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'predicted_doublet', 'diss_score', 'patient_id', 'sample_id', 'Condition', 'lesion_type', 'Age', 'Sex', 'RIN', 'Batch', 'visium', 'snRNA-seq', 'batch', 'leiden', 'cell_types'
['MS197', 'MS229', 'MS371N', 'MS377I', 'MS377N', ..., 'MS497I', 'MS497T', 'MS549H', 'MS549T', 'MS586']
Length: 13
Categories (13, object): ['MS197', 'MS229', 'MS371N', 'MS377I', ..., 'MS497T', 'MS549H', 'MS549T', 'MS586']
Running regression model for MS, saving in /Users/pschafer/Projects/VisiumMS/scripts/notebooks/../../data/prc/sc/c2l_model/cellranger/MS_reg_model
Control
AnnData object with n_obs × n_vars = 30615 × 36601
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'predicted_doublet', 'diss_score', 'patient_id', 'sample_id', 'Condition', 'lesion_type', 'Age', 'Sex', 'RIN', 'Batch', 'v