# install packages

## install Spectra from pypi

In [1]:
%pip install scSpectra

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# import relevant packages

In [1]:
#import packages
import numpy as np
import json
import scanpy as sc
from collections import OrderedDict
import scipy
import pandas as pd
import matplotlib.pyplot as plt

#spectra imports
import Spectra as spc
from Spectra import Spectra_util as spc_tl
from Spectra import K_est as kst
from Spectra import default_gene_sets


# Load gene_set_dictionary

**load a nested dictionary containing global and cell type specific gene sets in the following format:**

Gene set annotation dictionary with the keys being the celltypes (str) and values being dictionaries with gene set names as keys (str) and gene sets as values (lists of gene names/IDs which matches the gene names/IDs in adata.var_names).

For example:

```
gene_set_dictionary = {'celltype_1':{'gene_set_1':['gene_a', 'gene_b', 'gene_c'], 'gene_set_2':['gene_c','gene_a','gene_e','gene_f']},

'celltype_2':{'gene_set_1':['gene_a', 'gene_b', 'gene_c'], 'gene_set_3':['gene_a', 'gene_e','gene_f','gene_d']},

'celltype_3':{},

'global':{'gene_set_4':['gene_m','gene_n']}} #the global key must be supplied

```

**Note that one key in the dictionary must be 'global' with the corresponding value being a dictionary of gene sets which apply to all cells**

Spectra will use this dictionary to align factors to the input gene sets. Gene sets which apply to only one cell type in the data should be included in the dictionary of that cell type. If a gene set applies to all cell types in the data, the gene set should be included in the dictionary for 'global'. If a gene set applies to more than one cell type but not all cell types in the data there are two options 1) Include this gene set in each cell type dictionary which will likely result in a separate factor for this gene set in each cell type. OR 2) include this gene set in the 'global' dictionary which will likely result in one factor for this gene set in all cell types. We give additional guidance on the advantages and disadvantages of either approach in the Supplementary Methods of the Spectra paper: https://doi.org/10.1101/2022.12.20.521311

**Load the default dictionary**
We used this dictionary to generate the results in the paper: https://doi.org/10.1101/2022.12.20.521311

In [2]:
# load the default gene set dictionary from the Spectra paper:
annotations = spc.default_gene_sets.load()
set(annotations.keys())

{'B_GC',
 'B_memory',
 'B_naive',
 'CD4_T',
 'CD8_T',
 'DC',
 'ILC3',
 'MDC',
 'NK',
 'Treg',
 'gdT',
 'global',
 'mast',
 'pDC',
 'plasma'}

In [3]:
annotations['gdT']

{}

In [4]:
#cell types (including 'global') that come with at least one gene set
[cell_type for cell_type, gene_sets in annotations.items() if isinstance(gene_sets, dict) and len(gene_sets) > 0]

['B_memory', 'CD4_T', 'CD8_T', 'DC', 'MDC', 'Treg', 'mast', 'pDC', 'global']

# Custom gene_set_dictionary

**Alternatively you can use Spectra with your custom gene set annotation dictionaries.**

**Cytopus Knowledge Base**

We supply the Cytopus knowledge base to construct custom input gene set dictionaries for Spectra. For a tutorial visit the github repository: https://github.com/wallet-maker/cytopus

**External Databases**

To obtain optimal results, we recommend using dedicated gene sets, like from the Cytopus database which are tailored to single cell RNA sequencing data (see Supplementary Methods for details https://doi.org/10.1101/2022.12.20.521311).

However, Spectra can also use gene set annotation dictionaries from external databases if they can be provided in the format described above.

# Load adata

In the Spectra paper we use a subsetted, anonymized, log1p-transformed (f(x) = ln(x+1), with x being the raw gene expression counts), and scran-normalized leukocyte single cell RNA sequencing data (https://doi.org/10.1101/2022.12.20.521311). While we recommend scran for leukocyte data, the method also works with log1p-transformed, median library size normalized RNA sequencing data.

In [5]:
# define the cell type annotation key
obs_key = 'cell_type_annotations' #indicates the column name of the dataframe in adata.obs where to find the cell type labels

In [6]:
# load adata
adata = spc.sample_data.load()
adata

Loaded sample data: (1000, 6397)


AnnData object with n_obs × n_vars = 1000 × 6397
    obs: 'cell_type_annotations'
    var: 'n_cells_by_counts', 'highly_variable'
    uns: 'annotation_SPADE_1_colors', 'cell_type_annotations_colors', 'diffmap_evals', 'draw_graph', 'hvg', 'neighbors', 'pca'
    obsm: 'X_diffmap', 'X_draw_graph_fa', 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [10]:
sum(adata.var['highly_variable'])

3000

In [19]:
# alternatively indicate where to find the AnnData object (and uncomment below)
adata_path = '/content/spectra/data/sample_data.h5ad'
# adata = sc.read_h5ad(adata_path)

**Important: The cell type labels have to match with the cell type labels in the gene set dictionary**

In [20]:
# cell type labels in adata
list(set(adata.obs[obs_key]))

['B_GC',
 'gdT',
 'CD8_T',
 'NK',
 'DC',
 'B_naive',
 'Treg',
 'ILC3',
 'mast',
 'pDC',
 'B_memory',
 'CD4_T',
 'MDC',
 'plasma']

In [21]:
# cell type in gene set annotation dictionary
list(set(annotations.keys()))

['B_GC',
 'gdT',
 'CD8_T',
 'NK',
 'DC',
 'B_naive',
 'Treg',
 'global',
 'ILC3',
 'mast',
 'pDC',
 'B_memory',
 'CD4_T',
 'MDC',
 'plasma']

**if labels do not match adjust the labels either in adata or in annotations**

For example define and map a dictionary mapping the annotations in the adata object to the keys in the gene set annotations dictionary or vice versa. **Note that if you have a cell type in your adata for which you do not have any gene sets in your gene set annotation dictionary you must include an empty dictionary under that cell type key.** Afterwards confirm that the matching has worked:

In [22]:
#check the gene set annotation dict against adata and
#filter it for genes contained in adata
annotations = spc_tl.check_gene_set_dictionary(
    adata,
    annotations,
    obs_key=obs_key, #adata.obs column with the cell type labels (defined once above, not repeated as a literal)
    global_key='global')

Cell type labels in gene set annotation dictionary and AnnData object are identical
Your gene set annotation dictionary is now correctly formatted.


# Fit Spectra model

**General comments**

We start by importing spectra. The easiest way to run spectra is to use the `est_spectra` function in the `spectra` module, as shown below. The default behavior is to set the number of factors equal to the number of gene sets plus one. However, this can be modified by passing an integer e.g. `L = 20` as an argument to the function or a dictionary that maps cell type to an integer per cell type. We provide a method for estimating the number of factors directly from the data by bulk eigenvalue matching analysis, which is detailed further below.

**Computational comments**

Spectra is memory and cpu intensive.

The here-provided dataset can be run on most computers. For bigger datasets you want to run this in non-interactive mode, as runtime can be 1-2 days for a dataset of ca 200,000 cells with 250 factors and 10 cell types. Run time scales linearly with the number of cell types in your data.

An example hardware you could run a 200,000 cell, 10 cell type, 250 factor dataset on would be:

16 cpus Intel Xeon Gold 256 GB RAM

**Parameters**

`adata` : AnnData object containing cell_type_key with log count data stored in .X

`gene_set_dictionary`:dict or OrderedDict() maps cell types to gene set names to gene sets ; if use_cell_types == False then maps gene set names to gene sets ; must contain "global" key in addition to every unique cell type under .obs.<cell_type_key>

`L`: dict, OrderedDict(), int , NoneType number of factors per cell type ; if use_cell_types == False then int. Else dictionary. If None then match factors to number of gene sets (recommended)

`use_highly_variable` : bool if True, then uses highly_variable_genes

`cell_type_key`: str cell type key, must be under adata.obs.<cell_type_key> . If use_cell_types == False, this is ignored

`use_weights`: bool if True, edge weights are estimated based on graph structure and used throughout training

`lam`: float lambda parameter of the model. weighs relative contribution of graph and expression loss functions

`delta`: float delta parameter of the model. lower bounds possible gene scaling factors so that maximum ratio of gene scalings cannot be too large

`kappa`: float or None if None, estimate background rate of 1s in the graph from data

`rho`: float or None if None, estimate background rate of 0s in the graph from data

`use_cell_types` : bool if True then cell type label is used to fit cell type specific factors. If false then cell types are ignored

`n_top_vals` : int number of top markers to return in markers dataframe

`determinant_penalty` : float determinant penalty of the attention mechanism. If set higher than 0 then sparse solutions of the attention weights and diverse attention weights are encouraged. However, tuning is crucial as setting too high reduces the selection accuracy because convergence to a hard selection occurs early during training [todo: annealing strategy]

`filter_sets` : bool whether to filter the gene sets based on coherence

`label_factors` : bool whether to label the factors by their cell type specificity and their Szymkiewicz–Simpson overlap coefficient with the input marker genes

`overlap_threshold`: float minimum overlap coefficient to assign an input gene set label to a factor

``**kwargs`` : (num_epochs = 10000, lr_schedule = [...], verbose = False) arguments to .train(), maximum number of training epochs, learning rate schedule and whether to print changes in learning rate

**Returns**: SPECTRA_Model object [after training]

**In place**: adds 1. factors, 2. cell scores, 3. vocabulary, and 4. markers as attributes in .obsm, .var, .uns

**default parameters:**

```
est_spectra(
    adata,
    gene_set_dictionary,
    L = None,
    use_highly_variable = True,
    cell_type_key = None,
    use_weights = True,
    lam = 0.01,
    delta=0.001,
    kappa = None,
    rho = 0.001,
    use_cell_types = True,
    n_top_vals = 50,
    filter_sets = True,
    label_factors=True,
    overlap_threshold= 0.2,
    **kwargs)
```

**fit model with cell type annotations**

Running Spectra with the cell type annotations will give you both global and cell type specific factors.

In [23]:
import psutil
import os

process = psutil.Process(os.getpid())
print(f"Memory usage: {process.memory_info().rss / 1024**3:.2f} GB")

Memory usage: 10.47 GB


In [25]:
# fit the model with cell type annotations
# (we run only 2 epochs to decrease runtime in this tutorial; we recommend 10,000 epochs for most datasets)
model = spc.est_spectra(
    adata=adata,
    gene_set_dictionary=annotations,
    use_highly_variable=True,
    cell_type_key=obs_key, # adata.obs column with the cell type labels ('cell_type_annotations')
    use_weights=True,
    lam=0.1, # varies depending on data and gene sets, try between 0.5 and 0.001
    delta=0.001,
    kappa=None,
    rho=0.001,
    use_cell_types=True,
    n_top_vals=50,
    label_factors=True,
    overlap_threshold=0.2,
    clean_gs=True,
    min_gs_num=3,
    num_epochs=2, # demonstration only, see note above
)


Cell type labels in gene set annotation dictionary and AnnData object are identical
Your gene set annotation dictionary is now correctly formatted.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:32<00:00, 16.28s/it]


In [27]:
process = psutil.Process(os.getpid())
print(f"Memory usage: {process.memory_info().rss / 1024**3:.2f} GB")

Memory usage: 14.66 GB


In [10]:
#explore eta parameter to detect new factors
model.return_eta_diag()

array([5.08050025e-01, 7.21948802e-01, 4.90091026e-01, 8.47066820e-01,
       7.60907412e-01, 5.67253172e-01, 5.43871880e-01, 6.72297478e-01,
       7.43253589e-01, 8.55967522e-01, 7.22362638e-01, 8.63387465e-01,
       8.02784383e-01, 1.57174468e-01, 3.11313123e-01, 4.56129581e-01,
       7.62333333e-01, 7.07589090e-01, 7.57226288e-01, 8.15425932e-01,
       8.94356191e-01, 6.35477066e-01, 3.12536806e-01, 9.36962366e-01,
       6.84281170e-01, 9.00267303e-01, 9.52929020e-01, 8.66900027e-01,
       7.52293825e-01, 5.24942696e-01, 7.44166613e-01, 9.58937883e-01,
       9.18640077e-01, 8.27996492e-01, 5.31382799e-01, 9.18086648e-01,
       5.77133596e-01, 5.91037571e-01, 4.61083472e-01, 8.75263512e-01,
       1.41178221e-01, 4.15463328e-01, 4.97399598e-01, 9.39325750e-01,
       7.18635798e-01, 6.96919501e-01, 7.29688883e-01, 6.39705956e-01,
       7.57886350e-01, 4.31670398e-01, 7.66722441e-01, 7.66559660e-01,
       7.80520380e-01, 8.11941683e-01, 9.31419015e-01, 7.97451854e-01,
      

## Accessing quantities stored in AnnData

This function stores four important quantities in the AnnData, in addition to returning a fitted model object. *Factors* are the scores that tell you how much each gene contributes to each factor:

In [1]:
#find the factors under adata.uns['SPECTRA_factors']
adata.uns['SPECTRA_factors']

NameError: name 'adata' is not defined

In [12]:
#build a dataframe of the factor gene weights (one row per factor, one column per gene)

#row labels: the index of the overlap table, which carries the cell type
#specificity of every factor as part of the label
index_labels = adata.uns['SPECTRA_overlap'].index
#column labels: the genes flagged in the 'spectra_vocab' column of adata.var
vocab_mask = adata.var['spectra_vocab']
gene_weights = pd.DataFrame(
    data=adata.uns['SPECTRA_factors'],
    index=index_labels,
    columns=adata.var.loc[vocab_mask].index,
)
gene_weights

Unnamed: 0,KCNC3,NDUFS3,HACD1,FAM214A,TBCD,AC124312.1,PLAGL2,CD40LG,AC022182.1,USP53,...,CCL3L1,BAG3,CDC42EP3,TRAM1,RANBP1,FDFT1,HACD4,FCGR3A,AZGP1,SLC35A1
0-X-global-X-all_biotin_metabolism,1.829848e-13,1.354706e-12,1.850284e-12,8.417505e-14,6.022228e-13,1.341601e-12,1.521034e-13,7.123538e-13,9.071535e-14,1.483899e-13,...,9.063715e-13,1.680384e-13,3.072371e-14,1.281843e-13,2.738279e-14,4.227562e-14,2.155471e-13,9.137648e-13,9.058629e-14,4.012736e-13
1-X-global-X-all_purine_synthesis,7.044687e-14,3.620205e-13,1.043178e-12,4.966001e-14,6.087257e-13,3.878806e-13,1.980513e-13,3.196440e-13,2.039438e-13,1.532621e-13,...,4.719931e-13,4.562609e-14,2.217797e-14,6.816611e-14,3.407273e-13,1.652549e-14,3.832881e-13,1.021240e-13,5.866443e-14,7.688962e-13
2-X-global-X-all_ethanol_metabolism,1.387129e-13,7.518807e-13,4.468089e-13,1.925834e-14,2.588696e-13,3.919322e-13,1.385020e-13,2.338276e-13,1.108466e-13,2.098785e-13,...,2.422819e-13,3.054713e-14,2.979821e-14,1.097012e-13,8.717986e-13,1.794301e-14,4.643829e-14,1.043277e-13,4.354468e-14,6.349619e-13
3-X-global-X-all_amino-sugar-nucleotide-sugar_metabolism,5.166444e-14,6.396495e-13,8.771149e-13,3.831221e-13,5.777056e-13,8.774198e-13,6.498570e-14,2.875504e-12,1.641813e-13,6.634824e-14,...,5.292872e-13,1.656257e-14,1.860625e-14,2.519246e-13,5.749609e-14,1.807799e-13,8.044766e-14,3.016330e-13,4.203303e-14,1.828774e-13
4-X-global-X-all_steroid_metabolism,2.016960e-13,1.213367e-12,1.984178e-13,2.897571e-14,3.696923e-12,2.947184e-13,1.589094e-13,5.646609e-13,8.689617e-14,1.765750e-13,...,1.685796e-13,8.110229e-14,3.821513e-14,2.414952e-13,9.924358e-14,3.644782e-03,3.243228e-13,5.970900e-14,6.498356e-14,6.884895e-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191-X-mast-X-mast_granule-exocytosis,8.629393e-13,2.993837e-13,2.042822e-13,1.531111e-12,2.311560e-13,1.377747e-13,1.088774e-12,8.808455e-14,2.475420e-12,6.280637e-13,...,2.106143e-12,7.374496e-13,6.372828e-13,4.556392e-13,9.854206e-12,2.667835e-13,1.865654e-12,8.283998e-13,1.879116e-13,1.127246e-13
192-X-mast-X-192,2.978293e-02,4.587977e-02,5.075551e-02,1.032303e-01,4.875359e-02,3.428379e-02,5.405523e-02,3.881575e-02,6.792689e-02,2.279638e-02,...,3.129243e-02,3.298770e-02,1.995946e-02,4.044693e-02,1.092616e-01,1.754598e-02,9.980147e-02,7.929787e-02,4.552150e-02,3.851634e-02
193-X-pDC-X-p-DC_CpG-TLR9_response,3.706794e-13,6.026678e-13,1.673973e-12,5.105730e-13,1.921424e-12,3.427358e-13,5.680241e-14,2.397605e-13,7.636605e-13,1.033331e-12,...,3.937993e-13,5.433372e-13,2.354694e-13,3.693978e-13,3.641152e-13,2.529311e-13,5.443620e-12,6.472829e-13,4.379445e-13,1.575053e-12
194-X-pDC-X-194,1.096226e-01,3.588844e-02,4.719894e-02,2.793491e-02,8.793464e-02,5.192929e-02,2.236182e-02,4.528513e-02,5.561669e-02,2.949215e-02,...,3.503251e-02,4.375657e-02,9.300604e-02,4.302303e-02,5.694228e-02,1.010350e-01,1.276329e-01,2.293796e-02,6.519804e-02,3.447067e-02


*Markers* is an array of genes with top scores for every factor:

In [13]:
#find an array of the marker genes per factor here:
adata.uns['SPECTRA_markers']

array([['HLCS', 'BTD', 'SLC5A6', ..., 'RTN1', 'LDHAL6A', 'MARK2'],
       ['AMPD2', 'GMPR', 'APRT', ..., 'GAMT', 'CYTH2', 'ANGPTL4'],
       ['AOX1', 'ALDH1B1', 'ALDH3B2', ..., 'EXOSC1', 'PPP4R2', 'OSTC'],
       ...,
       ['TCF7', 'OTULINL', 'RNF145', ..., 'ABCA2', 'DOCK7', 'ZDBF2'],
       ['CISH', 'AP002387.2', 'RBP7', ..., 'STT3B', 'TGFB2', 'TIAM2'],
       ['SLC43A2', 'ETFBKMT', 'FOXA1', ..., 'HSPA6', 'IL4R', 'TLR7']],
      dtype=object)

We also provide an approach to label the factors by their Szymkiewicz–Simpson overlap coefficient with the input gene sets. Each factor receives the label of the input gene set with the highest overlap coefficient, given that the overlap coefficient is greater than the threshold defined in 'overlap_threshold'. Ties in the overlap coefficient are broken by gene set size, selecting the label of the bigger gene set (because smaller gene sets might get bigger overlap coefficients by chance).

We provide a pandas.DataFrame indicating the overlap coefficients for each input gene set with each factor's marker genes. The index of this dataframe contains the *index* of each factor, *assigned label* as well as the *cell type specificity* for each factor in the format:

`['index' + '-X-' + 'cell type specificity' + '-X-' + 'assigned label', ...]`

We use `'-X-'` as a unique separator to make string splitting and retrieval of the different components of the index easier.

In [14]:
adata.uns['SPECTRA_overlap']

Unnamed: 0,B_Breg_UP,B_effector-2_UP,B_effector-1_UP,B_IgM-ligation_response,CD4-T_IL12_response,TNK_IL2_response,CD4-T_TH22_UP,CD4-T_TH17_UP,TNK_IL2-STAT5-signaling,T_IL21_response,...,all_thiamin_metabolism,all_NOTCH_signaling,all_galactose_metabolism,all_MYC_targets,all_GLU_metabolism,all_fatty-acid_synthesis,all_glycolysis,all_glutathione_metabolism,all_IL6-JAK-STAT3_signaling,all_autophagy-chaperone-mediated
0-X-global-X-all_biotin_metabolism,0.0,0.0,0.000,0.04,0.000000,0.055556,0.000000,0.0,0.02,0.0,...,0.0,0.058824,0.000000,0.06,0.000000,0.000000,0.083333,0.000000,0.02,0.0
1-X-global-X-all_purine_synthesis,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.06,0.0,...,0.0,0.000000,0.000000,0.02,0.000000,0.000000,0.000000,0.000000,0.00,0.0
2-X-global-X-all_ethanol_metabolism,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.00,0.0,...,0.0,0.029412,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.02,0.0
3-X-global-X-all_amino-sugar-nucleotide-sugar_metabolism,0.0,0.0,0.000,0.02,0.000000,0.000000,0.000000,0.0,0.06,0.0,...,0.0,0.000000,0.055556,0.00,0.000000,0.041667,0.041667,0.000000,0.04,0.0
4-X-global-X-all_steroid_metabolism,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.00,0.0,...,0.0,0.000000,0.000000,0.00,0.000000,0.083333,0.000000,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191-X-mast-X-mast_granule-exocytosis,0.0,0.0,0.000,0.00,0.076923,0.000000,0.000000,0.0,0.00,0.0,...,0.0,0.000000,0.000000,0.00,0.027027,0.000000,0.000000,0.047619,0.00,0.0
192-X-mast-X-192,0.0,0.0,0.000,0.00,0.000000,0.000000,0.166667,0.0,0.06,0.0,...,0.0,0.000000,0.000000,0.00,0.027027,0.000000,0.000000,0.000000,0.02,0.0
193-X-pDC-X-p-DC_CpG-TLR9_response,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.04,0.0,...,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.02,0.0
194-X-pDC-X-194,0.2,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.06,0.0,...,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.02,0.0


*Cell scores* are similarly the score of each factor for every cell:

In [15]:
#find an array of cell scores per factor and cell here:
adata.obsm['SPECTRA_cell_scores']

array([[5.92012661e-05, 6.09113287e-06, 4.29270923e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.07278118e-05, 7.59219680e-06, 3.56601400e-06, ...,
        0.00000000e+00, 0.00000000e+00, 1.06560870e-01],
       [3.44234737e-05, 1.29496671e-05, 2.16738783e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.38164069e-04, 2.07450387e-05, 1.47272816e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.04828960e-06, 6.67606181e-06, 3.03662996e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.05452227e-05, 8.67253067e-06, 1.45520775e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

Vocab is a boolean array that is `True` for genes that were used while fitting the model - note that this quantity is only added to the AnnData when `use_highly_variable` is set to `True`:

In [16]:
#find the vocab here:
adata.var['spectra_vocab']

KCNC3      True
NDUFS3     True
HACD1      True
FAM214A    True
TBCD       True
           ... 
FDFT1      True
HACD4      True
FCGR3A     True
AZGP1      True
SLC35A1    True
Name: spectra_vocab, Length: 6397, dtype: bool

It also includes a dictionary of the factor numbers per cell type:

In [17]:
#find the factor number dict (number of factors per cell type) here:
adata.uns['SPECTRA_L']

{'B_GC': 1,
 'B_memory': 5,
 'B_naive': 1,
 'CD4_T': 12,
 'CD8_T': 7,
 'DC': 3,
 'ILC3': 1,
 'MDC': 6,
 'NK': 1,
 'Treg': 2,
 'gdT': 1,
 'mast': 2,
 'pDC': 2,
 'plasma': 1,
 'global': 151}

## Accessing model parameters

To access finer grained information about the model fit, we can look at the attributes of the model object directly. Model parameters can be accessed with functions associated with the model object

In [18]:
#this is the model file
dir(model)

['B_diag',
 'L',
 '_SPECTRA_Model__B_diag',
 '_SPECTRA_Model__eta_matrices',
 '_SPECTRA_Model__store_parameters',
 '_SPECTRA_Model__store_parameters_no_celltypes',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'cell_scores',
 'delta',
 'eta_matrices',
 'factors',
 'gene_scalings',
 'initialize',
 'internal_model',
 'kappa',
 'lam',
 'load',
 'matching',
 'return_cell_scores',
 'return_eta',
 'return_eta_diag',
 'return_factors',
 'return_gene_scalings',
 'return_graph',
 'return_kappa',
 'return_rho',
 'rho',
 'save',
 'train',
 'use_cell_types']

In [19]:
#e.g. find the lambda parameter the model was trained with:
model.lam

0.1

Apart from cell scores and factors, we can also retrieve a number of other parameters this way that are not by default added to the AnnData. Eta diag is the diagonal of the fitted factor-factor interaction matrix; however, its interpretation is that it measures the extent to which each factor is influenced by the prior information. In practice many of these values are zero, indicating that they are estimated without bias introduced by the annotation set. Eta is the full set of factor-factor interaction matrices, whose off diagonals measure the extent to which factors share the same genes. Rho and kappa are parameters that control the background rate of non-edges and edges respectively. These can be fixed throughout training (default) or estimated from the data by providing rho = None or kappa = None to the est_spectra() function or to model.train(). Finally gene scalings are correction factors that normalize each gene based on its mean expression value.

In [20]:
model.return_eta_diag()

array([5.08050025e-01, 7.21948802e-01, 4.90091026e-01, 8.47066820e-01,
       7.60907412e-01, 5.67253172e-01, 5.43871880e-01, 6.72297478e-01,
       7.43253589e-01, 8.55967522e-01, 7.22362638e-01, 8.63387465e-01,
       8.02784383e-01, 1.57174468e-01, 3.11313123e-01, 4.56129581e-01,
       7.62333333e-01, 7.07589090e-01, 7.57226288e-01, 8.15425932e-01,
       8.94356191e-01, 6.35477066e-01, 3.12536806e-01, 9.36962366e-01,
       6.84281170e-01, 9.00267303e-01, 9.52929020e-01, 8.66900027e-01,
       7.52293825e-01, 5.24942696e-01, 7.44166613e-01, 9.58937883e-01,
       9.18640077e-01, 8.27996492e-01, 5.31382799e-01, 9.18086648e-01,
       5.77133596e-01, 5.91037571e-01, 4.61083472e-01, 8.75263512e-01,
       1.41178221e-01, 4.15463328e-01, 4.97399598e-01, 9.39325750e-01,
       7.18635798e-01, 6.96919501e-01, 7.29688883e-01, 6.39705956e-01,
       7.57886350e-01, 4.31670398e-01, 7.66722441e-01, 7.66559660e-01,
       7.80520380e-01, 8.11941683e-01, 9.31419015e-01, 7.97451854e-01,
      

In [21]:
adata.uns['SPECTRA_overlap']

Unnamed: 0,B_Breg_UP,B_effector-2_UP,B_effector-1_UP,B_IgM-ligation_response,CD4-T_IL12_response,TNK_IL2_response,CD4-T_TH22_UP,CD4-T_TH17_UP,TNK_IL2-STAT5-signaling,T_IL21_response,...,all_thiamin_metabolism,all_NOTCH_signaling,all_galactose_metabolism,all_MYC_targets,all_GLU_metabolism,all_fatty-acid_synthesis,all_glycolysis,all_glutathione_metabolism,all_IL6-JAK-STAT3_signaling,all_autophagy-chaperone-mediated
0-X-global-X-all_biotin_metabolism,0.0,0.0,0.000,0.04,0.000000,0.055556,0.000000,0.0,0.02,0.0,...,0.0,0.058824,0.000000,0.06,0.000000,0.000000,0.083333,0.000000,0.02,0.0
1-X-global-X-all_purine_synthesis,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.06,0.0,...,0.0,0.000000,0.000000,0.02,0.000000,0.000000,0.000000,0.000000,0.00,0.0
2-X-global-X-all_ethanol_metabolism,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.00,0.0,...,0.0,0.029412,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.02,0.0
3-X-global-X-all_amino-sugar-nucleotide-sugar_metabolism,0.0,0.0,0.000,0.02,0.000000,0.000000,0.000000,0.0,0.06,0.0,...,0.0,0.000000,0.055556,0.00,0.000000,0.041667,0.041667,0.000000,0.04,0.0
4-X-global-X-all_steroid_metabolism,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.00,0.0,...,0.0,0.000000,0.000000,0.00,0.000000,0.083333,0.000000,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191-X-mast-X-mast_granule-exocytosis,0.0,0.0,0.000,0.00,0.076923,0.000000,0.000000,0.0,0.00,0.0,...,0.0,0.000000,0.000000,0.00,0.027027,0.000000,0.000000,0.047619,0.00,0.0
192-X-mast-X-192,0.0,0.0,0.000,0.00,0.000000,0.000000,0.166667,0.0,0.06,0.0,...,0.0,0.000000,0.000000,0.00,0.027027,0.000000,0.000000,0.000000,0.02,0.0
193-X-pDC-X-p-DC_CpG-TLR9_response,0.0,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.04,0.0,...,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.02,0.0
194-X-pDC-X-194,0.2,0.0,0.000,0.00,0.000000,0.000000,0.000000,0.0,0.06,0.0,...,0.0,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.02,0.0


Examine the parameters of the underlying internal model:

In [22]:
model.internal_model

SPECTRA(
  (theta): ParameterDict(
      (global): Parameter containing: [torch.FloatTensor of size 6392x151]
      (B_GC): Parameter containing: [torch.FloatTensor of size 6392x1]
      (B_memory): Parameter containing: [torch.FloatTensor of size 6392x5]
      (B_naive): Parameter containing: [torch.FloatTensor of size 6392x1]
      (CD4_T): Parameter containing: [torch.FloatTensor of size 6392x12]
      (CD8_T): Parameter containing: [torch.FloatTensor of size 6392x7]
      (DC): Parameter containing: [torch.FloatTensor of size 6392x3]
      (ILC3): Parameter containing: [torch.FloatTensor of size 6392x1]
      (MDC): Parameter containing: [torch.FloatTensor of size 6392x6]
      (NK): Parameter containing: [torch.FloatTensor of size 6392x1]
      (Treg): Parameter containing: [torch.FloatTensor of size 6392x2]
      (gdT): Parameter containing: [torch.FloatTensor of size 6392x1]
      (mast): Parameter containing: [torch.FloatTensor of size 6392x2]
      (pDC): Parameter containing:

In [23]:
#Option 1: pickle the entire model object
#(this way consumes a lot of storage but does not require the model parameters to load)
import pickle

with open('spectra_model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

KeyboardInterrupt: 

In [None]:
#and load it like this
#(only unpickle files you created yourself: pickle.load can execute arbitrary code)
with open('spectra_model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
#this way needs less storage but requires the original adata, annotations and cell type annotations to load the model again
model.save('spectra_model_compact')

In [None]:
#load the compact model again (you will need all the parameters indicated here)
model = spc.load_from_pickle(
    'spectra_model_compact',
    adata,
    gs_dict=annotations,
    cell_type_key='cell_type_annotations',
)

**fit model without cell type annotations**

You can also fit the model without using the cell type annotations. In this case, instead of a nested gene set annotation dictionary, supply a regular dictionary with gene set names as keys (str) and the gene sets as values (list of gene names/IDs which match the gene names/IDs in the adata.var_names)

In [None]:
{'global':annotations['global']}

In [None]:
#fit the model without cell type annotations
#(for demonstration purposes we will only run 2 epochs, we recommend 10,000 epochs)
model_global = spc.est_spectra(
    adata=adata,
    #because we do not use the cell types we supply a regular dict
    #instead of the nested dict above
    gene_set_dictionary=annotations['global'],
    use_highly_variable=True,
    cell_type_key=None, #would be "cell_type_annotations" when using cell types
    use_weights=True,
    lam=0.1,
    delta=0.001,
    kappa=0.00001,
    rho=0.00001,
    use_cell_types=False, #set to False to not use the cell type annotations
    n_top_vals=25,
    clean_gs=True,
    label_factors=True,
    num_epochs=2,
)



In [None]:
#show the labeled factors in the data:
adata.uns['SPECTRA_overlap'].index

In [None]:
#visualize factor cell scores (this is poorly fitted bc we only ran 2 epochs)

#position of the factor to plot: the same position must be used for the label
#(row of SPECTRA_overlap) and for the scores (column of SPECTRA_cell_scores),
#otherwise the plot shows one factor under the name of another
factor_idx = 80
factor_of_interest = adata.uns['SPECTRA_overlap'].index[factor_idx]
print('plotting factor:',factor_of_interest)

#add cell scores to obs
cell_scores = adata.obsm['SPECTRA_cell_scores'][:,factor_idx].astype(float)
adata.obs[factor_of_interest] = cell_scores
#cap the color scale at the 98th percentile of the scores
sc.pl.umap(adata,color=factor_of_interest,s=30,vmax=np.quantile(cell_scores,0.98))

In [None]:
#save the adata
adata_save_path = 'adata_spectra.h5ad'#where to save the adata to
adata.write(adata_save_path)
print('Saved adata to:',adata_save_path)

# Accessing the fitted gene-gene graph

One outcome of fitting Spectra is a fitted gene-gene graph where edges represent similarities between latent variables associated with each gene (a smoothed version of transcriptional similarity). To access this for, say, TNK cells, pass the corresponding key as `ct`; the example below retrieves the `global` graph:

In [None]:
soft_graph = model_global.return_graph(ct = "global")
soft_graph

For large numbers of genes it is clumsy to visualize the whole graph. To visualize a subgraph formed around a particular list of genes, use:

In [None]:
# Use entry 0 of adata.uns['SPECTRA_markers'] as the gene list to build the subgraph around
# (presumably the marker genes of the first factor - confirm against the est_spectra output description).
gene_set = list(adata.uns['SPECTRA_markers'][0])
out = spc.graph_network(adata, soft_graph, gene_set)

#this will not show in Google Colaboratory but you can open the file manually outside Google Colaboratory
#out.show("test_graph.html")

In [None]:
# shape of entry 0 of adata.uns['SPECTRA_markers'], i.e. the entry that was turned into `gene_set` above
adata.uns['SPECTRA_markers'][0].shape

This will take the N closest genes to your gene set and only visualize this subgraph. The interactive graph file gets saved as an HTML file. To visualize multiple gene sets at the same time, we have a different version of the function that assigns a random color to each gene set:

In [None]:
#gene sets is a list of lists
# second gene list: entry 1 of adata.uns['SPECTRA_markers']
gene_set_2 = list(adata.uns['SPECTRA_markers'][1])
# `gene_set` is the list built in the single-gene-set graph cell above
gene_sets = [gene_set,gene_set_2]
out = spc.graph_network_multiple(adata,soft_graph, gene_sets)
# uncomment to write/open the interactive graph as an HTML file
#out.show("test_graph.html")

# Fitting the model without AnnData

Instead of passing an AnnData object to `est_spectra`, one can pass `np.ndarray` objects directly. The `**kwargs` contain arguments to the training function: `lr_schedule = [1.0,.5,.1,.01,.001,.0001]`, `num_epochs = 10000`, `verbose = False`. To do this, initialize a model:

In [None]:
# show the AnnData object that the following cells read from (var_names, X and obs are used below)
adata

In [None]:
#define number of factors (only est_spectra has the option to automatically set the factor numbers per celltype)
# One entry per key of the gene set dictionary: number of gene sets under that key, plus one.
L = {
    group: len(gene_sets_for_group) + 1
    for group, gene_sets_for_group in annotations.items()
}
L

In [None]:
#get the genes you want to use (for simplicity we will copy from the adata above)
# vocab is the full list of gene names/IDs in adata.var_names, in their existing order
vocab = list(adata.var_names)

In [None]:
# Shape of the expression matrix. Read it from adata.X directly: densifying the whole
# matrix only to inspect its shape allocates a full dense copy for no benefit, and
# `.shape` returns the same tuple for both sparse and dense storage.
adata.X.shape

In [None]:
# number of genes in vocab; it was built from adata.var_names, so it should equal the number of columns of adata.X
len(vocab)

In [None]:
# Densify the expression matrix once and build the label array once, then reuse both for
# constructing and training the model. Previously each was rebuilt for every call, which
# allocated two dense copies of the matrix. The adjacency-matrix example further below
# already passes a single `X` object to both calls in the same way.
X_dense = adata.X.todense()
cell_type_labels = np.array(adata.obs['cell_type_annotations'])

# L: number of factors per key (built above); gs_dict: the gene set dictionary;
# vocab: gene names matching the columns of X_dense.
model = spc.SPECTRA_Model(X = X_dense, L=L, labels = cell_type_labels,
                          gs_dict = annotations,lam = 0.01, vocab=vocab, use_weights=True,
                          delta=0.001,kappa = None, rho = 0.05,use_cell_types=True)
model.train(X = X_dense, labels = cell_type_labels,
            num_epochs=2) #we will only do 2 epochs here for demonstration purposes. We recommend 10,000 epochs for most datasets.

Running the model this way is also required if you want to input arbitrary adjacency matrices instead of a dictionary of gene sets. When the gene set dictionary is not None, it is used to create the adjacency matrix.

In [None]:
# The adjacency-matrix example needs a model fitted WITH cell types (and the matching vocab),
# so for this tutorial we simply retrain with est_spectra.
model = spc.est_spectra(
    adata=adata,
    gene_set_dictionary=annotations,
    use_highly_variable=True,
    cell_type_key="cell_type_annotations",
    use_weights=True,
    lam=0.1,
    delta=0.001,
    kappa=0.00001,
    rho=0.00001,
    use_cell_types=True,
    n_top_vals=25,
    label_factors=True,     # label the factors by their overlap coefficient with the input gene sets
    overlap_threshold=0.2,  # minimum overlap coefficient that must be surpassed to assign a label
    num_epochs=2,           # demonstration only; we recommend 10,000 epochs
)

In [None]:
# Build an example adjacency matrix for every cell type plus 'global' by binarizing the
# fitted graph at `threshold` (the threshold should be properly determined based on the question).
threshold = 0.15
cell_types = [*set(adata.obs['cell_type_annotations']), 'global']
adj_matrix = {
    ct_name: (model.return_graph(ct=ct_name) > threshold).astype(float)
    for ct_name in cell_types
}
adj_matrix

In [None]:
# list the attribute and method names available on the retrained model object (exploratory inspection)
dir(model)

In [None]:
# display the graph retrieved earlier with model_global.return_graph(ct = "global")
soft_graph

In [None]:
# Shape of the full expression matrix before subsetting to the Spectra vocabulary.
# Read it from adata.X directly: densifying the whole matrix only to inspect its shape
# allocates a full dense copy for no benefit.
adata.X.shape

In [None]:
# count the values in adata.var['spectra_vocab']; the next cell uses this column as a boolean mask to select genes
adata.var['spectra_vocab'].value_counts()

In [None]:
#train model with adjacency matrix
# Boolean per-gene column, used to subset both the gene names and the expression matrix.
in_vocab = adata.var['spectra_vocab']
vocab = np.array(in_vocab[in_vocab].index)
# transpose so genes are rows, keep the flagged genes, transpose back
X = adata.X.todense().T[in_vocab].T
cell_type_labels = np.array(adata.obs['cell_type_annotations'])

# adj_matrix replaces the gene set dictionary here; L is the per-key factor count defined earlier.
model = spc.SPECTRA_Model(
    X=X,
    labels=cell_type_labels,
    L=L,
    adj_matrix=adj_matrix,
    weights=None,
    lam=0.01,
    delta=0.001,
    kappa=None,
    rho=0.05,
    use_cell_types=True,
)
# only 2 epochs here for demonstration purposes; we recommend 10,000 epochs for most datasets
model.train(X=X, labels=cell_type_labels, num_epochs=2)

