# Create an AnnData Object from Tahoe-100M Dataset for the A549 cell line


In [5]:
from datasets import load_dataset
from scipy.sparse import csr_matrix
import anndata
import pandas as pd
import pubchempy as pcp

## Mapping records to anndata

This function takes in a generator that emits records from the Tahoe-100M huggingface dataset and returns an anndata object. Use the sample_size argument to specify the number of records you need. You can also create a new generator using the dataset.filter function to only emit records that match a certain filter (eg: for a specific drug/plate/sample).

If you'd like to create a DataLoader for an ML training application, it's likely best to use the data in it's native format without interfacing with anndata.

In [39]:
import pandas as pd
import anndata
from scipy.sparse import csr_matrix
from datasets import Dataset, IterableDataset
from typing import List, Dict, Any, Union

def create_anndata_with_metadata_lookup(
    hf_dataset: Union[Dataset, IterableDataset], 
    gene_metadata_ds: Dataset,
    cell_line_metadata_ds: Dataset,
    sample_metadata_ds: Dataset,
    drug_metadata_ds: Dataset, # Included but not used in A549 filter
    filter_criteria: Dict[str, Any] = None,
    sample_size: int = None
) -> anndata.AnnData:
    
    # --- 1. Prepare Gene Vocabulary and Metadata Lookups ---
    
    # Gene Vocabulary (.var) - Assumed correct
    gene_metadata_df = gene_metadata_ds.to_pandas().set_index('token_id')
    sorted_vocab_items = sorted(gene_metadata_df.index.tolist())
    token_ids = sorted_vocab_items
    gene_names = gene_metadata_df.loc[token_ids, 'gene_symbol'].fillna(
        gene_metadata_df.loc[token_ids, 'ensembl_id']).tolist()
    token_id_to_col_idx = {token_id: idx for idx, token_id in enumerate(token_ids)}

    # Convert linked metadata to DataFrames for easy lookup
    
    # Use 'Cell_ID_DepMap' as the index for cell_line_df
    cell_line_df = cell_line_metadata_ds.to_pandas().set_index('Cell_ID_DepMap') 
    cell_line_df.index.name = 'cell_line_id' # Standardize the index name for clarity
    
    # Use 'sample' for sample_metadata index
    sample_df = sample_metadata_ds.to_pandas().set_index('sample') 
    
    # --- 2. Determine Required Sample IDs from Filter Criteria ---
    
    required_sample_ids = None
    
    if filter_criteria:
        # Check Cell Line Filter (e.g., {"cell_name": "A549"})
        cell_line_filter = {k: v for k, v in filter_criteria.items() if k in cell_line_df.columns}
        if cell_line_filter:
            # 1. Find the DepMap IDs (index of cell_line_df) that match the filter (e.g., cell_name == 'A549')
            matching_cell_line_ids = cell_line_df.query(
                " & ".join([f"{k} == '{v}'" for k, v in cell_line_filter.items()])
            ).index.tolist()
            
            # 2. Find all sample IDs (index of sample_df) linked to those DepMap IDs
            # ðŸ”‘ FIX 3: Query the sample_df using the 'cell_line_id' column, which holds the DepMap ID
            required_sample_ids = set(
                sample_df[sample_df['Cell_ID_DepMap'].isin(matching_cell_line_ids)].index.tolist()
            )

        # Check Sample Filter (e.g., {"perturbation_name": "AKT_inhibitor"})
        sample_filter = {k: v for k, v in filter_criteria.items() if k in sample_df.columns}
        if sample_filter:
            matching_sample_ids = set(
                sample_df.query(
                    " & ".join([f"{k} == '{v}'" for k, v in sample_filter.items()])
                ).index.tolist()
            )
            
            # Combine filters
            if required_sample_ids is not None:
                required_sample_ids.intersection_update(matching_sample_ids)
            else:
                required_sample_ids = matching_sample_ids
    
    # --- 3. Iterate and Build Sparse Matrix ---
    
    data, indices, indptr = [], [], [0]
    cell_barcodes = []
    generator = hf_dataset.to_iterable_dataset()
    
    cell_line_meta_list: List[Dict[str, Any]] = [] 
    sample_meta_list: List[Dict[str, Any]] = [] 
    num_cells = 0

    for cell in generator:
        if sample_size is not None and num_cells >= sample_size:
            break
            
        sample_id = cell['sample_id']
        
        # Filtering Step
        if required_sample_ids is None or sample_id in required_sample_ids:
            
            genes = cell['indices']
            expressions = cell['expressions']
            
            # Collect sparse matrix data (logic retained)
            col_indices = [token_id_to_col_idx.get(gene) for gene in genes]
            valid_col_indices = [idx for idx in col_indices if idx is not None]
            valid_expressions = [expr for expr, idx in zip(expressions, col_indices) if idx is not None]
            
            data.extend(valid_expressions)
            indices.extend(valid_col_indices)
            indptr.append(len(data))
            
            cell_barcodes.append(cell['cell_barcode'])

            # ðŸ”‘ Store the linked metadata based on IDs
            # Get the DepMap ID (the index key for cell_line_df) from the sample_df
            cell_line_depmap_id = sample_df.loc[sample_id, 'cell_line_id']
            
            cell_line_meta_list.append(cell_line_df.loc[cell_line_depmap_id].to_dict())
            sample_meta_list.append(sample_df.loc[sample_id].to_dict())
            
            num_cells += 1

    # --- 4. Assemble and Store AnnData Components ---
    num_cells_final = len(indptr) - 1
    expr_matrix = csr_matrix((data, indices, indptr), shape=(num_cells_final, len(gene_names)))
    
    obs_df = pd.DataFrame(index=pd.Index(cell_barcodes, name='cell_barcode'))

    adata = anndata.AnnData(X=expr_matrix, obs=obs_df)
    
    # --- 5. Store linked DataFrames in .uns ---
    cell_line_uns_df = pd.DataFrame(cell_line_meta_list, index=obs_df.index)
    adata.uns['cell_line_metadata_df'] = cell_line_uns_df

    sample_uns_df = pd.DataFrame(sample_meta_list, index=obs_df.index)
    adata.uns['sample_metadata_df'] = sample_uns_df
    
    adata.var = gene_metadata_df.set_index('gene_symbol').loc[gene_names]
    adata.var.index.name = 'gene_symbol' 

    return adata

## Load Tahoe-100M Dataset


In [40]:
# 1. Load the metadata tables (as defined in your previous prompt)
# Note: These lines assume successful loading and are for context.
sample_metadata = load_dataset("tahoebio/Tahoe-100M", "sample_metadata", split="train")
gene_metadata = load_dataset("tahoebio/Tahoe-100M", "gene_metadata", split="train")
drug_metadata = load_dataset("tahoebio/Tahoe-100M", "drug_metadata", split="train")
cell_line_metadata = load_dataset("tahoebio/Tahoe-100M", "cell_line_metadata", split="train")
tahoe_100m_ds = load_dataset("tahoebio/Tahoe-100M", streaming=True, split="train")

# --- 2. Define the Filter Criteria ---
# The filter key must match the column name in the cell_line_metadata table.
# The value is the specific cell line name you want.

filter_a549 = {
    "cell_name": "A549" 
}

# --- 3. Call the Function ---
# Pass the filter criteria to the function. The function will look up the 
# 'A549' DepMap ID from the cell_line_metadata and use it to filter the stream.

a549_adata = create_anndata_with_metadata_lookup(
    hf_dataset=tahoe_100m_ds, 
    gene_metadata_ds=gene_metadata,
    cell_line_metadata_ds=cell_line_metadata,
    sample_metadata_ds=sample_metadata,
    drug_metadata_ds=drug_metadata,
    filter_criteria=filter_a549, 
    sample_size=1000 # Set this to a small number (e.g., 5000) for testing, or None for all
)

print(f"AnnData object created with {a549_adata.n_obs} cells filtered by cell_name='A549'.")

KeyError: 'Cell_ID_DepMap'

## Load Gene Metadata

The gene metadata contains the mapping between the integer token IDs used in the dataset and standard identifiers for genes (ensembl IDs and HGNC gene symbols)

In [8]:
gene_metadata = load_dataset("vevotx/Tahoe-100M", name="gene_metadata", split="train")
gene_vocab = {entry["token_id"]: entry["ensembl_id"] for entry in gene_metadata}

In [16]:
gene_vocab

{3: 'ENSG00000000003',
 4: 'ENSG00000000005',
 5: 'ENSG00000000419',
 6: 'ENSG00000000457',
 7: 'ENSG00000000460',
 8: 'ENSG00000000938',
 9: 'ENSG00000000971',
 10: 'ENSG00000001036',
 11: 'ENSG00000001084',
 12: 'ENSG00000001167',
 13: 'ENSG00000001460',
 14: 'ENSG00000001461',
 15: 'ENSG00000001497',
 16: 'ENSG00000001561',
 17: 'ENSG00000001617',
 18: 'ENSG00000001626',
 19: 'ENSG00000001629',
 20: 'ENSG00000001630',
 21: 'ENSG00000001631',
 22: 'ENSG00000002016',
 23: 'ENSG00000002079',
 24: 'ENSG00000002330',
 25: 'ENSG00000002549',
 26: 'ENSG00000002586',
 27: 'ENSG00000002587',
 28: 'ENSG00000002726',
 29: 'ENSG00000002745',
 30: 'ENSG00000002746',
 31: 'ENSG00000002822',
 32: 'ENSG00000002834',
 33: 'ENSG00000002919',
 34: 'ENSG00000002933',
 35: 'ENSG00000003056',
 36: 'ENSG00000003096',
 37: 'ENSG00000003137',
 38: 'ENSG00000003147',
 39: 'ENSG00000003249',
 40: 'ENSG00000003393',
 41: 'ENSG00000003400',
 42: 'ENSG00000003402',
 43: 'ENSG00000003436',
 44: 'ENSG00000003509',

In [10]:
adata = create_anndata_from_generator(tahoe_100m_ds, gene_vocab, sample_size=1000)
adata



AnnData object with n_obs Ã— n_vars = 1000 Ã— 62710
    obs: 'drug', 'sample', 'BARCODE_SUB_LIB_ID', 'cell_line_id', 'moa-fine', 'canonical_smiles', 'pubchem_cid', 'plate'

## Inspect Metadata (adata.obs)

In [11]:
adata.obs.head()

Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4


## Enrich with Sample Metadata

Although the main data contains several metadata fields, there are some additional columns (such as drug concentration) which are omitted to reduce the size of the data. If they are needed, they may be fetched using the sample_metadata.

In [12]:
sample_metadata = load_dataset("vevotx/Tahoe-100M","sample_metadata", split="train").to_pandas()
adata.obs = pd.merge(adata.obs, sample_metadata.drop(columns=["drug","plate"]), on="sample")
adata.obs.head()

Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate,mean_gene_count,mean_tscp_count,mean_mread_count,mean_pcnt_mito,drugname_drugconc
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]"


## Add Drug Metadata

The drug metadata contains additional information for the compounds used in Tahoe-100M. See the dataset card and our paper for more information about how this information was generated.

In [13]:
drug_metadata = load_dataset("vevotx/Tahoe-100M","drug_metadata", split="train").to_pandas()
adata.obs = pd.merge(adata.obs, drug_metadata.drop(columns=["canonical_smiles","pubchem_cid","moa-fine"]), on="drug")
adata.obs.head()

Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate,mean_gene_count,mean_tscp_count,mean_mread_count,mean_pcnt_mito,drugname_drugconc,targets,moa-broad,human-approved,clinical-trials,gpt-notes-approval
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",,unclear,no,yes,"Used in some clinical trial formulations, not ..."


## Drug Info from PubChem

We also provide the pubchem IDs for the compounds in Tahoe, this can be used to querry additional information as needed.

In [14]:
drug_name = adata.obs["drug"].values[0]
cid = int(float(adata.obs["pubchem_cid"].values[0]))
compound = pcp.Compound.from_cid(cid)

print(f"Name: {drug_name}")
print(f"Synonyms: {compound.synonyms[:10]}")
print(f"Formula: {compound.molecular_formula}")
print(f"SMILES: {compound.isomeric_smiles}")
print(f"Mass: {compound.exact_mass}")

Name: 8-Hydroxyquinoline
Synonyms: ['8-HYDROXYQUINOLINE', 'quinolin-8-ol', '148-24-3', '8-quinolinol', 'Oxyquinoline', 'Oxine', 'Quinophenol', 'Oxychinolin', 'Phenopyridine', '8-Quinol']
Formula: C9H7NO
SMILES: None
Mass: 145.052763847
Synonyms: ['8-HYDROXYQUINOLINE', 'quinolin-8-ol', '148-24-3', '8-quinolinol', 'Oxyquinoline', 'Oxine', 'Quinophenol', 'Oxychinolin', 'Phenopyridine', '8-Quinol']
Formula: C9H7NO
SMILES: None
Mass: 145.052763847


# Load Cell Line Metadata

The cell-line metadata contains additional identifiers for the cell-lines used in Tahoe (eg: Depmap-IDs) as well as a curated list of driver mutations for each cell line. This information can be used for instance to train genotype aware models on the Tahoe data.

In [None]:
cell_line_metadata = load_dataset("vevotx/Tahoe-100M","cell_line_metadata", split="train").to_pandas()
cell_line_metadata.head()