In [1]:
import scanpy as sc

# Open the plate-9 AnnData file in backed mode ('r' = read-only): the object is
# tied to the .h5ad file on disk rather than being read fully into memory.
h5ad_path = "../data/tahoe_data/plate9_annData.h5ad"
adata = sc.read_h5ad(h5ad_path, backed="r")


In [2]:
# Bare last expression: the notebook displays the AnnData summary repr
# (dimensions, backing file and the available obs columns).
adata

AnnData object with n_obs × n_vars = 5866669 × 62710 backed at '/home/nilabjab/cancer_dependency_project_nilabja/cancer_dependency_project_sreeram/tahoe_100M/automatic_analysis/plate9_annData.h5ad'
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'

In [3]:
# Quick sanity check of the loaded object, printed in this order:
#   1. matrix dimensions as (cells, genes)
#   2. the first five gene names
#   3. the first rows of the per-cell metadata table (adata.obs)
for preview in (adata.shape, adata.var_names[:5], adata.obs.head()):
    print(preview)

(5866669, 62710)
Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112'], dtype='object', name='gene_name')
                       sample  gene_count  tscp_count  mread_count  \
BARCODE_SUB_LIB_ID                                                   
01_001_019-lib_1585  smp_2263        1079        1464         1725   
01_001_031-lib_1585  smp_2263        1122        1476         1771   
01_001_132-lib_1585  smp_2263         977        1335         1577   
01_001_133-lib_1585  smp_2263        2022        3251         3869   
01_001_159-lib_1585  smp_2263        1178        1560         1823   

                                                     drugname_drugconc  \
BARCODE_SUB_LIB_ID                                                       
01_001_019-lib_1585  [('Sivelestat (sodium tetrahydrate)', 5.0, 'uM')]   
01_001_031-lib_1585  [('Sivelestat (sodium tetrahydrate)', 5.0, 'uM')]   
01_001_132-lib_1585  [('Sivelestat (sodium tetrahydrate)', 5.0, 'uM')]   
01_001_133-lib_1585  [('Sivelesta

In [4]:
# Boolean mask over all cells: True where the "drug" column equals "Erlotinib"
is_erlotinib = (adata.obs["drug"] == "Erlotinib").to_numpy()

# Barcodes are the obs index labels; keep only the masked ones, as a plain list
erlotinib_barcodes = adata.obs.index[is_erlotinib].tolist()

# Optional sanity check: number of matches and a small sample of barcodes
print(f"Found {len(erlotinib_barcodes)} Erlotinib-treated cells")
print(erlotinib_barcodes[:5])

Found 47066 Erlotinib-treated cells
['28_001_020-lib_1585', '28_001_022-lib_1585', '28_001_104-lib_1585', '28_001_162-lib_1585', '28_001_176-lib_1585']


In [5]:
# Select the cells whose "drug" column is "DMSO_TF". Indexing the AnnData
# object with the boolean mask gives the subset used for the pseudobulk below.
dmso_mask = adata.obs["drug"] == "DMSO_TF"
adata_DMSO = adata[dmso_mask]

In [6]:
# Bare last expression: display the summary repr of the DMSO_TF subset
# to confirm how many cells were selected.
adata_DMSO

View of AnnData object with n_obs × n_vars = 132211 × 62710 backed at '/home/nilabjab/cancer_dependency_project_nilabja/cancer_dependency_project_sreeram/tahoe_100M/automatic_analysis/plate9_annData.h5ad'
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'

In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Pull the DMSO subset into memory, but keep .X in whatever form it is stored
# (typically a scipy sparse matrix). Densifying 132,211 cells x 62,710 genes
# would need roughly 33 GB at 4 bytes per value, and calling .toarray() raises
# AttributeError when .X is already a dense ndarray. Column sums can be taken
# on the stored matrix directly.
# NOTE: the name `adata_dense` is kept so later cells that use it still work.
adata_dense = adata_DMSO.to_memory()
X = adata_dense.X  # not densified; call X.toarray() later only if truly needed
cell_names = adata_dense.obs["cell_name"].values
gene_names = adata_dense.var_names

# Map to store aggregated counts: cell line -> flat vector of per-gene sums
pseudobulk_dict = {}

# Build list of unique cell lines (np.unique returns them sorted)
unique_cells = np.unique(cell_names)

# Loop with progress bar and aggregate
for cell in tqdm(unique_cells, desc="Aggregating pseudobulk"):
    # Row positions of all cells belonging to this cell line
    idx = np.where(cell_names == cell)[0]
    # A sparse matrix returns a 1 x n_genes np.matrix from .sum(axis=0), while
    # a dense array returns a 1-D vector; asarray + ravel flattens both cases.
    pseudobulk_expr = np.asarray(X[idx].sum(axis=0)).ravel()
    pseudobulk_dict[cell] = pseudobulk_expr

# Convert to DataFrame: one row per cell line, one column per gene
pseudobulk_df = pd.DataFrame.from_dict(pseudobulk_dict, orient="index", columns=gene_names)

# Every cell line must appear exactly once and every gene must be present
assert pseudobulk_df.shape == (len(unique_cells), len(gene_names))

# Final check
print("✅ Pseudobulk complete!")
print("Shape:", pseudobulk_df.shape)
print(pseudobulk_df.head())


Aggregating pseudobulk: 100%|██████████| 50/50 [00:06<00:00,  7.23it/s]


✅ Pseudobulk complete!
Shape: (50, 62710)
gene_name  TSPAN6  TNMD    DPM1  SCYL3  C1orf112  FGR    CFH  FUCA2    GCLC  \
A-172         5.0   0.0   305.0   47.0      98.0  2.0   13.0   95.0   126.0   
A-427        51.0   0.0   600.0   71.0     173.0  0.0    5.0  244.0   243.0   
A498         88.0   0.0  1799.0   90.0     206.0  0.0  705.0  433.0  1068.0   
A549         43.0   0.0   477.0   27.0      98.0  1.0   27.0  125.0   482.0   
AN3 CA       21.0   0.0    52.0    3.0      13.0  0.0    3.0   36.0    74.0   

gene_name   NFYA  ...  POLGARF  ENSG00000291308  LY6S  ENSG00000291310  \
A-172      144.0  ...      2.0              0.0   0.0              0.0   
A-427      311.0  ...      2.0              0.0   0.0              0.0   
A498       457.0  ...      2.0              0.0   1.0              0.0   
A549       159.0  ...      2.0              0.0   0.0              0.0   
AN3 CA      71.0  ...      1.0              0.0   0.0              0.0   

gene_name  ENSG00000291312  ENSG000002

In [8]:
# Write the pseudobulk table to CSV, keeping the row index (index=True).
# The output directory is created first if it is missing, so the write cannot
# fail with a missing-directory error after the expensive aggregation has run.
from pathlib import Path

pseudobulk_csv = Path('../results/tahoe_dmso_pb.csv')
pseudobulk_csv.parent.mkdir(parents=True, exist_ok=True)
pseudobulk_df.to_csv(pseudobulk_csv, index=True)