In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from math import ceil

# First part of this notebook:
# Goal of this dataset is to merge the cleaned/parsed matrix data (generated in convert_mat and concat_mat) with the stimulation data
# from fig6 into one anndata object

# Second part of this notebook:
# Extracts active cells

## First part of notebook
Labeling stimulation status

In [2]:
adata = ad.read_h5ad('/u/scratch/s/schhina/all_t_cell_data.h5ad')

  utils.warn_names_duplicates("obs")


In [3]:
adata

AnnData object with n_obs × n_vars = 63861 × 60725

In [4]:
act_data_path = '/u/scratch/s/schhina/41467_2019_12464_MOESM9_ESM.xlsx'

def get_stimulation():
    """
    Parse the T Cell figure data for a mapping between cell barcodes and their stimulation status and cd status, also remove duplicate barcodes

    Returns:
    - barcode_acts: Dictionary mapping barcodes to stimulation_status
    - barcode_cds:  Dictionary mapping barcodes to cd4 or cd8 status
    - dups:         Set containing all barcodes found more than once in dataset
    """
    barcode_acts = {}
    dups = set()
    barcode_cds = {}
    
    df = pd.read_excel(act_data_path, sheet_name="Fig6")
    i = 0
    for barcode, act, cd in zip(df['barcode'], df['stimulation_status'], df['cd4cd8_status']):
        if cd == 'unassigned':
            continue
            
        if barcode in barcode_acts:
            dups.add(barcode)
            del barcode_acts[barcode]
            del barcode_cds[barcode]
        elif barcode not in dups:
            barcode_acts[barcode] = act
            barcode_cds[barcode] = cd

        i += 1
    return barcode_acts, barcode_cds, dups

In [5]:
acts, cds, dups = get_stimulation()

In [6]:
len(acts)

47726

In [7]:
seen = set()
def add(bc, i):
    seen.add(bc)
    return i

In [8]:
# Trim duplicates
valid_inds = np.array([add(adata.obs.iloc[x].name, x) for x in np.arange(adata.n_obs) if (adata.obs.iloc[x].name not in dups and adata.obs.iloc[x].name in acts and adata.obs.iloc[x].name not in seen)])
adata = ad.AnnData(adata.to_df().iloc[valid_inds, :])

In [9]:
# Label stimulation value
adata.obs['stimulation'] = [acts[bc] for bc, v in adata.obs.iterrows()]

In [10]:
# Label cd status
adata.obs['cd_status'] = [cds[bc] for bc, v in adata.obs.iterrows()]

In [12]:
adata.write("/u/scratch/s/schhina/labeled_t_cell_data.h5ad")

In [13]:
adata

AnnData object with n_obs × n_vars = 47726 × 58828
    obs: 'stimulation', 'cd_status'

## Second part of notebook
Extract only active cells

In [11]:
# Get only active cells
valid_inds = np.array([i for i in np.arange(adata.n_obs) if acts[adata.obs.iloc[i].name] == 'act'])
active_adata = ad.AnnData(adata.to_df().iloc[valid_inds, :])

In [12]:
active_adata

AnnData object with n_obs × n_vars = 24761 × 58828

In [13]:
active_adata.write("/u/scratch/s/schhina/labeled_active_t_cell_data.h5ad")

NameError: name 'active_adata' is not defined