In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
from joblib import Parallel, delayed

# Function to score genes for a subset of data
def score_genes_for_subset(sample_id, cell_type, adata, gene_list, sample_col, celltype_col, disease_col):
    """Return the mean gene-set score for one (sample, cell type) group.

    The observations of ``adata`` whose ``sample_col`` equals ``sample_id`` and
    whose ``celltype_col`` equals ``cell_type`` are copied out, scored with
    ``sc.tl.score_genes`` (score stored in ``.obs['score']``), and the
    per-observation scores are averaged.

    Parameters
    ----------
    sample_id, cell_type
        Values compared against ``adata.obs[sample_col]`` and
        ``adata.obs[celltype_col]`` respectively.
    adata
        Object exposing ``.obs`` and boolean-mask indexing
        (presumably an AnnData -- confirm against the caller).
    gene_list
        Forwarded unchanged to ``sc.tl.score_genes``.
    sample_col, celltype_col, disease_col
        Names of columns in ``adata.obs``.

    Returns
    -------
    dict or None
        ``None`` when no observation matches. Otherwise a dict with the keys
        ``'sample_id'``, ``'cell_type'``, ``'mean_score'`` and ``'disease'``.
    """
    obs = adata.obs
    keep = (obs[sample_col] == sample_id) & (obs[celltype_col] == cell_type)

    # Copy instead of keeping a view, to avoid view warnings when the score is written
    group = adata[keep].copy()

    # Nothing to score for this sample / cell-type combination
    if group.n_obs == 0:
        return None

    sc.tl.score_genes(group, gene_list=gene_list, score_name='score')

    group_obs = group.obs
    return {
        'sample_id': sample_id,
        'cell_type': cell_type,
        'mean_score': group_obs['score'].mean(),
        # Only the first distinct value is reported; the group is assumed to
        # carry a single value in this column
        'disease': group_obs[disease_col].unique()[0],
    }

In [None]:
# Placeholder: binds `adata` to the Ellipsis literal. Later cells read `adata.obs`
# and index `adata` with a boolean mask, so this must be replaced with the real
# dataset (presumably an AnnData object -- confirm) before those cells can run.
adata = ...

In [32]:
# Distinct sample IDs and distinct cell-type labels found in adata.obs.
# Each column's unique values are taken separately; these are not the
# observed (sample, cell type) pairs.
sample_ids = adata.obs['sample_id'].unique()
cell_types = adata.obs['manual_celltype_annotation'].unique()

In [None]:
# Use parallel processing to score the gene set for each cell type in each sample
def score_genes_for_all_samples(adata, gene_list, sample_col='sample_id', celltype_col='manual_celltype_annotation', disease_col='disease', n_jobs=-1):
    """Score a gene set for every sample / cell-type combination.

    Calls ``score_genes_for_subset`` once for each pairing of a distinct value
    of ``adata.obs[sample_col]`` with a distinct value of
    ``adata.obs[celltype_col]``, dispatching the calls through joblib.

    Parameters
    ----------
    adata
        Object exposing ``.obs``; forwarded to ``score_genes_for_subset``.
    gene_list
        Gene set forwarded to ``score_genes_for_subset``.
    sample_col, celltype_col, disease_col
        Names of columns in ``adata.obs``.
    n_jobs
        Passed to ``joblib.Parallel``. The default of ``-1`` (all available
        cores) matches the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        One row per combination that produced a result, with the columns
        ``sample_id``, ``cell_type``, ``mean_score`` and ``disease``. The
        columns are present even when there are no rows.
    """
    # Distinct values of each column; the full cross product is attempted below
    sample_ids = adata.obs[sample_col].unique()
    cell_types = adata.obs[celltype_col].unique()

    # Use joblib to parallelize the scoring process
    results = Parallel(n_jobs=n_jobs)(
        delayed(score_genes_for_subset)(sample_id, cell_type, adata, gene_list, sample_col, celltype_col, disease_col)
        for sample_id in sample_ids
        for cell_type in cell_types
    )

    # Filter out None results (for cases where there were no cells in the subset)
    results = [res for res in results if res is not None]

    # Name the columns explicitly so an empty result still has the expected
    # schema instead of being a column-less frame
    results_df = pd.DataFrame(results, columns=['sample_id', 'cell_type', 'mean_score', 'disease'])

    return results_df

In [36]:
# NOTE(review): `gene_list` is not defined in any cell visible here; it must
# already exist in the kernel -- confirm it is defined before this cell on a fresh run.
# NOTE(review): 'Actc1' looks like a gene symbol rather than a metadata column.
# It is passed as `disease_col`, i.e. read from `adata.obs` and reported under
# the 'disease' key of each result -- confirm this is intended.
results_df = score_genes_for_all_samples(adata, 
                                         gene_list, 
                                         sample_col='sample_id', 
                                         celltype_col='manual_celltype_annotation',
                                         disease_col='Actc1')

       'Gm14276', 'Gm18706', 'Gm30476', 'Gm30539', 'Gm31616', 'Gm35546',
       'Gm36582', 'Gm38456', 'Gm39529', 'Gm6013', 'Gm6545', 'Kics2',
       'Lamtor3-ps', 'Lhfpl6', 'Marchf3', 'Myef2l', 'Nherf4', 'Niban1',
       'Nlrp5-ps', 'Nopchap1', 'Or2v2', 'Or51r1', 'Or9a7', 'Ptprv',
       'Rpl26-ps6', 'Rpl9-ps8', 'Rps27a-ps1', 'Septin11', 'Septin5', 'Septin6',
       'Septin8', 'Septin9', 'Slc66a3', 'Snord96a', 'Styxl2', 'Tasl'],
      dtype='object')
       'Gm14276', 'Gm18706', 'Gm30476', 'Gm30539', 'Gm31616', 'Gm35546',
       'Gm36582', 'Gm38456', 'Gm39529', 'Gm6013', 'Gm6545', 'Kics2',
       'Lamtor3-ps', 'Lhfpl6', 'Marchf3', 'Myef2l', 'Nherf4', 'Niban1',
       'Nlrp5-ps', 'Nopchap1', 'Or2v2', 'Or51r1', 'Or9a7', 'Ptprv',
       'Rpl26-ps6', 'Rpl9-ps8', 'Rps27a-ps1', 'Septin11', 'Septin5', 'Septin6',
       'Septin8', 'Septin9', 'Slc66a3', 'Snord96a', 'Styxl2', 'Tasl'],
      dtype='object')
       'Gm14276', 'Gm18706', 'Gm30476', 'Gm30539', 'Gm31616', 'Gm35546',
       'Gm36582',