In [None]:
import anndata as ad
import scanpy as sc

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import torch

from propose import PROPOSE, ExpressionDataset
import tqdm

**Load and process dataset**

In [None]:
# The .h5ad file is produced by 00_data_proc.ipynb (see that notebook for details)
h5ad_path = './VISp_dataset/VISp_filtered.h5ad'
adata = ad.read_h5ad(h5ad_path)

# Integer-encode the categorical cell type labels
cell_type_categories = pd.Categorical(adata.obs['cell_types_25'])
adata.obs['cell_types_25_codes'] = cell_type_categories.codes

# Keep a binarized copy of the data (1 where X > 0, else 0) in its own layer
is_expressed = adata.X > 0
adata.layers['bin'] = is_expressed.astype(np.float32)

print(adata)

**Create training and validation splits**

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
seed = 0  # fixed so that the shuffled folds are reproducible
fold = 0  # index of the fold used for the train/validation split

# Build 10 shuffled folds, stratified on the integer cell type codes
cell_type_codes = adata.obs['cell_types_25_codes'].values
sample_positions = np.arange(adata.shape[0])
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
inds = list(skf.split(X=sample_positions, y=cell_type_codes))

# Each entry of `inds` is a (train indices, validation indices) pair
train_ind, val_ind = inds[fold]


In [None]:
# Report the size of the full dataset and of each split
n_total = adata.shape[0]
n_train = np.size(train_ind)
n_val = np.size(val_ind)
print(f'{n_total} total samples')
print(f'{n_train} in training set')
print(f'{n_val} in validation set')

# Row-subsetting gives views of `adata`, so they do not take up memory
adata_train, adata_val = adata[train_ind, :], adata[val_ind, :]

**Run PROPOSE**
 - Here the task is to select gene sets that can be used for cell type classification.
 - The reference is a scRNA-seq dataset, and the application might be selecting genes for an mFISH experiment where we want to recover cell type identity.
 - We use the binarized gene expression values as input to make the results robust to domain shift (scRNA-seq vs. mFISH expression).
 - We use PROPOSE with a cross entropy loss function, which is standard practice to train models for a classification problem. 

In [None]:
# Initialize the datasets for PROPOSE
# Note: Here, adata_train.layers['bin'] is a sparse array;
# .toarray() converts it to a dense array. (.toarray() is used instead of the
# `.A` shorthand, which newer SciPy releases no longer provide.)
train_dataset = ExpressionDataset(adata_train.layers['bin'].toarray(), adata_train.obs['cell_types_25_codes'])
val_dataset = ExpressionDataset(adata_val.layers['bin'].toarray(), adata_val.obs['cell_types_25_codes'])


# Use GPU device if available -- we highly recommend using a GPU!
device = torch.device(torch.cuda.current_device() if torch.cuda.is_available() else 'cpu')

# Number of genes to select within the current selection process.
# A tuple, so that several set sizes can be requested in one run.
num_genes = (250,)
# Maps each requested set size to the gene indices returned by selector.select
propose_results = {}

# Set up the PROPOSE selector
selector = PROPOSE(train_dataset,
                   val_dataset,
                   loss_fn=torch.nn.CrossEntropyLoss(),
                   device=device,
                   hidden=[128, 128])

# Expose the full set of label codes (taken from all cells, not just the training split)
selector.labels = np.unique(adata.obs['cell_types_25_codes'].values)

# Coarse removal of genes
print('Starting initial elimination...')
candidates, model = selector.eliminate(target=500, mbsize=128, max_nepochs=50, verbose=False)
print('Completed initial elimination.')



In [None]:
print('Selecting specific number of genes...')
for num in num_genes:
    # Store the result under its own name: reusing `inds` here would overwrite the
    # list of cross-validation folds that train_ind / val_ind are drawn from, so
    # re-running the split cell afterwards would break.
    selected_inds, model = selector.select(num_genes=num, mbsize=128, max_nepochs=50)
    propose_results[num] = selected_inds
print('Done')

In [None]:
def _frac_correct_to_numpy(values):
    """Return a new dict with every value of `values` as a numpy array.

    Torch tensors are moved to the CPU first; values that are already
    non-tensors are passed through np.asarray, so the conversion can be
    applied repeatedly (e.g. when this cell is re-run).
    """
    return {k: (v.detach().to('cpu').numpy() if torch.is_tensor(v) else np.asarray(v))
            for k, v in values.items()}


# Convert the stored values to numpy; idempotent, so re-running this cell is safe
model.frac_correct['train'] = _frac_correct_to_numpy(model.frac_correct['train'])
model.frac_correct['val'] = _frac_correct_to_numpy(model.frac_correct['val'])

# One row per key, with a single column named after the split
train_df = pd.DataFrame.from_dict(model.frac_correct['train'], orient='index').rename(columns={0: 'train'})
val_df = pd.DataFrame.from_dict(model.frac_correct['val'], orient='index').rename(columns={0: 'val'})

# Align the validation values to the training keys and display side by side
df = train_df.merge(val_df, how='left', left_index=True, right_index=True)
df

In [None]:
# Work on a copy of the feature table from the anndata object.
# Note: without the .copy(), the columns below would be added to adata.var itself,
# which may be desirable in some use cases.
df = adata.var.copy()

# Add one boolean column per requested set size: True for genes selected in that round
for n_selected in num_genes:
    flag_col = f'propose_set_{n_selected}'
    chosen_genes = df.iloc[propose_results[n_selected]].index
    df[flag_col] = False
    df.loc[chosen_genes, flag_col] = True

In [None]:
# Keep only the features (genes) that were selected in at least one set by propose
set_columns = [f'propose_set_{num}' for num in num_genes]
selected_in_any = df[set_columns].any(axis=1)
df = df[selected_in_any]

df.head(2)

In [None]:
# Plot average binarized expression pattern for selection across cell types.
# The column name is derived from num_genes so that it always refers to a set that
# was actually created; a hard-coded set size raises a KeyError whenever that size
# is not in num_genes.
plot_set = f'propose_set_{num_genes[0]}'
sc.pl.dotplot(adata,
              var_names=df[df[plot_set]].index.values,
              groupby='cell_types_25',
              layer='bin')
plt.show()

**Performance (TBD)**
- The user may be interested in the average expression patterns for the selected genes
- Train an independent classifier to assess cell type classification with the selected genes

In [None]:
import seaborn as sns

# Bar chart of the number of cells per cell type, each bar labelled with its count
fig, count_ax = plt.subplots(1, 1, figsize=(15, 4))
sns.countplot(data=adata.obs, x='cell_types_25', ax=count_ax, linewidth=.5)

# Rotate the count labels so they stay readable on narrow bars
bar_texts = count_ax.bar_label(count_ax.containers[0], padding=5)
for bar_text in bar_texts:
    bar_text.set_rotation(90)

count_ax.tick_params(axis='x', labelrotation=90)
count_ax.set(xlabel='Cell types', ylabel='Count')
sns.despine(ax=count_ax)
plt.show()