In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from math import ceil

In [2]:
# Load the full T-cell dataset from scratch storage.
raw_h5ad_path = '/u/scratch/s/schhina/all_t_cell_data.h5ad'
adata = ad.read_h5ad(raw_h5ad_path)

  utils.warn_names_duplicates("obs")


In [3]:
adata

AnnData object with n_obs × n_vars = 63861 × 60725

In [4]:
act_data_path = '/u/scratch/s/schhina/41467_2019_12464_MOESM9_ESM.xlsx'

def get_stimulation(path=None, sheet_name="Fig6"):
    """Read per-barcode stimulation and CD4/CD8 labels from the annotation workbook.

    Rows whose ``cd4cd8_status`` equals ``'unassigned'`` are ignored. A barcode
    that appears on more than one of the remaining rows is treated as
    ambiguous: it is removed from both label maps and reported in the
    duplicate set instead.

    Parameters
    ----------
    path : str or path-like, optional
        Excel workbook to read. Defaults to the module-level ``act_data_path``.
    sheet_name : str, optional
        Worksheet containing the ``barcode``, ``stimulation_status`` and
        ``cd4cd8_status`` columns. Defaults to ``"Fig6"``.

    Returns
    -------
    tuple of (dict, dict, set)
        ``barcode -> stimulation_status``, ``barcode -> cd4cd8_status``, and
        the set of barcodes dropped because they occurred more than once.
    """
    if path is None:
        path = act_data_path

    barcode_acts = {}
    barcode_cds = {}
    dups = set()

    df = pd.read_excel(path, sheet_name=sheet_name)
    for barcode, act, cd in zip(df['barcode'], df['stimulation_status'], df['cd4cd8_status']):
        if cd == 'unassigned':
            continue

        if barcode in dups:
            # Already known to be ambiguous; never re-admit it.
            continue

        if barcode in barcode_acts:
            # Second sighting: retract the labels recorded on the first one.
            dups.add(barcode)
            del barcode_acts[barcode]
            del barcode_cds[barcode]
        else:
            barcode_acts[barcode] = act
            barcode_cds[barcode] = cd

    return barcode_acts, barcode_cds, dups

In [5]:
acts, cds, dups = get_stimulation()

In [6]:
len(acts)

47726

In [7]:
# Barcodes recorded so far; filled in as a side effect of add().
seen = set()

def add(bc, i):
    """Record barcode ``bc`` in the module-level ``seen`` set and return ``i`` unchanged."""
    seen.update((bc,))
    return i

In [8]:
# Trim duplicates: keep the first occurrence of every barcode that has a label
# in `acts` and is not listed in `dups`.
# The mask is derived from the obs index alone (no shared `seen` state), so
# re-running this cell no longer filters out every row.
obs_index = adata.obs.index
keep_mask = (
    obs_index.isin(list(acts))
    & ~obs_index.isin(list(dups))
    & ~obs_index.duplicated(keep='first')
)
# flatnonzero always yields an integer array, even when nothing is selected.
valid_inds = np.flatnonzero(keep_mask)
adata = ad.AnnData(adata.to_df().iloc[valid_inds, :])

In [9]:
# Label obs: look up each cell's stimulation status by its barcode (the obs index).
# Iterating the index directly avoids building a row Series per cell via iterrows().
adata.obs['stimulation'] = [acts[barcode] for barcode in adata.obs.index]

In [10]:
# Look up each cell's CD4/CD8 status by its barcode (the obs index).
# Iterating the index directly avoids building a row Series per cell via iterrows().
adata.obs['cd_status'] = [cds[barcode] for barcode in adata.obs.index]

In [11]:
adata.obs['cd_status']

ACCCACTTCTCGATGA    CD8
GATCAGTTCAAACCGT    CD4
TGACTTTCAATGGATA    CD8
CAAGATCTCTTGGGTA    CD8
CATCCACGTCACCTAA    CD8
                   ... 
GCGCCAACACCGCTAG    CD8
CCTACACGTCTCCCTA    CD4
TCGCGAGCAGGAATCG    CD4
GGGAGATTCACCCGAG    CD8
GTCAAGTAGTCATGCT    CD4
Name: cd_status, Length: 47726, dtype: object

In [12]:
# Persist the labelled dataset to scratch storage.
labeled_h5ad_path = "/u/scratch/s/schhina/labeled_t_cell_data.h5ad"
adata.write(labeled_h5ad_path)

In [13]:
adata

AnnData object with n_obs × n_vars = 47726 × 58828
    obs: 'stimulation', 'cd_status'

In [11]:
# Get only active cells: positions whose barcode is labelled 'act' in `acts`.
# One pass over the obs index replaces a per-row `obs.iloc[i]` lookup, and the
# explicit integer dtype keeps `.iloc` working when no cell is selected.
valid_inds = np.array(
    [pos for pos, barcode in enumerate(adata.obs.index) if acts[barcode] == 'act'],
    dtype=int,
)
# NOTE(review): the recorded repr of `active_adata` lists no obs columns, so the
# to_df() round trip appears to drop 'stimulation'/'cd_status' — confirm intended.
active_adata = ad.AnnData(adata.to_df().iloc[valid_inds, :])

In [12]:
active_adata

AnnData object with n_obs × n_vars = 24761 × 58828

In [13]:
# Persist the active-cell subset. `active_adata` must already exist in the
# running kernel; the recorded output of this cell is a NameError from a run
# where it did not.
active_h5ad_path = "/u/scratch/s/schhina/labeled_active_t_cell_data.h5ad"
active_adata.write(active_h5ad_path)

NameError: name 'active_adata' is not defined

In [2]:
# Reload the labelled dataset written earlier.
labeled_h5ad_path = "/u/scratch/s/schhina/labeled_t_cell_data.h5ad"
adata = ad.read_h5ad(labeled_h5ad_path)

  utils.warn_names_duplicates("obs")


In [23]:
adata.to_df().head()

Accession,ENSG00000180346.3,ENSG00000185800.11,ENSG00000255389.1,ENSG00000147059.8,ENSG00000238045.9,ENSG00000056972.18,ENSG00000198920.9,ENSG00000213937.3,ENSG00000244113.1,ENSG00000265720.1,...,ENSG00000120992.17,ENSG00000167283.7,ENSG00000253676.1,ENSG00000206527.9,ENSG00000224698.1,ENSG00000236930.1,ENSG00000258227.6,ENSG00000122958.14,ENSG00000232431.3,ENSG00000259031.1
ACCCACTTCTCGATGA,0,0,0,0,1,0,0,0,0,0,...,0,9,0,1,0,0,0,1,0,0
GATCAGTTCAAACCGT,0,0,0,0,0,0,0,0,0,0,...,0,11,0,0,0,0,0,0,0,0
TGACTTTCAATGGATA,0,0,0,0,0,0,0,0,0,0,...,3,6,0,0,0,0,0,0,0,0
CAAGATCTCTTGGGTA,0,0,0,0,0,0,0,0,0,0,...,1,11,0,0,0,0,0,0,0,0
CATCCACGTCACCTAA,0,0,0,0,0,0,0,0,0,0,...,1,5,0,0,0,0,0,0,0,0


In [7]:
# Accession labels are the var index; read them in one call instead of
# materialising a row Series per variable with `var.iloc[i]`.
accessions = adata.var.index.tolist()

In [9]:
len(accessions)

60725

In [10]:
def remove_char(s):
    """Return ``s`` starting at its first numeric character.

    When ``s`` contains no numeric character, the first four characters are
    dropped instead.
    """
    first_numeric = next(
        (pos for pos, ch in enumerate(s) if ch.isnumeric()),
        None,
    )
    if first_numeric is None:
        return s[4:]
    return s[first_numeric:]

In [11]:
# Pass the part of each accession before its first '.' through remove_char.
accs = [remove_char(accession.partition('.')[0]) for accession in accessions]

In [16]:
# Shuffle with a fixed seed so the split derived from `inds` is identical on
# every run; the unseeded global shuffle produced a different split each time.
inds = np.arange(len(accs))
np.random.default_rng(0).shuffle(inds)

In [17]:
# The first 80% of the shuffled indices (rounded up) form the training set,
# the remainder the test set.
cutoff = ceil(0.8 * len(inds))
train_inds, test_inds = inds[:cutoff], inds[cutoff:]

In [18]:
len(train_inds)

48580

In [19]:
len(test_inds)

12145

In [22]:
# Write both index arrays to .npy files in the working directory.
for split_file, split_inds in (("train_inds.npy", train_inds), ("test_inds.npy", test_inds)):
    np.save(split_file, split_inds)