In [1]:
import os
import subprocess as sp

import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [2]:
def load_or_download_norman19(data_cache_dir="./data/norman19"):
    data_url = "https://zenodo.org/records/7041849/files/NormanWeissman2019_filtered.h5ad?download=1"

    if not os.path.exists(data_cache_dir):
        os.makedirs(data_cache_dir)

    tmp_data_dir = f"{data_cache_dir}/norman19_downloaded.h5ad"

    # Download if raw data doesn't exist
    if not os.path.exists(tmp_data_dir):
        print(f"Downloading data from {data_url}")
        sp.call(f"wget -q {data_url} -O {tmp_data_dir}", shell=True)

    print(f"Loading data from {tmp_data_dir}")
    adata = sc.read_h5ad(tmp_data_dir)
    return adata

In [3]:
adata = load_or_download_norman19()

Loading data from ./data/norman19/norman19_downloaded.h5ad


# PREPROCESS

In [4]:
adata

AnnData object with n_obs × n_vars = 111445 × 33694
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo'
    var: 'ensemble_id', 'ncounts', 'ncells'

In [5]:
adata.obs.columns

Index(['guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup',
       'good_coverage', 'number_of_cells', 'tissue_type', 'cell_line',
       'cancer', 'disease', 'perturbation_type', 'celltype', 'organism',
       'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito',
       'percent_ribo'],
      dtype='object')

In [6]:
adata.obs.rename(
    columns={
        "nCount_RNA": "ncounts",
        "nFeature_RNA": "ngenes",
        "percent.mt": "percent_mito",
        "cell_line": "cell_type",
    },
    inplace=True,
)
adata.obs["perturbation"] = adata.obs["perturbation"].str.replace("_", "+")
adata.obs["perturbation"] = adata.obs["perturbation"].astype("category")
adata.obs["condition"] = adata.obs.perturbation.copy()
adata.X = csr_matrix(adata.X)

In [7]:
adata

AnnData object with n_obs × n_vars = 111445 × 33694
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'condition'
    var: 'ensemble_id', 'ncounts', 'ncells'

In [8]:
adata.obs.columns

Index(['guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup',
       'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type',
       'cancer', 'disease', 'perturbation_type', 'celltype', 'organism',
       'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito',
       'percent_ribo', 'condition'],
      dtype='object')

In [9]:
# FILTER CELLS

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

In [10]:
adata

AnnData object with n_obs × n_vars = 111445 × 22608
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'condition', 'n_genes'
    var: 'ensemble_id', 'ncounts', 'ncells', 'n_cells'

In [11]:
adata.layers["counts"] = adata.X.copy()

In [12]:
# For every perturbation, for every gene, calculate the mean and variance of the counts
mean_df = pd.DataFrame(index=adata.var_names, columns=adata.obs["condition"].unique())
disp_df = pd.DataFrame(index=adata.var_names, columns=adata.obs["condition"].unique())
for pert in tqdm(adata.obs["condition"].unique()):
    pert_cells = adata.obs[adata.obs["condition"] == pert].index.tolist()
    pert_counts = adata[pert_cells].X.toarray()
    mean_df.loc[:, pert] = np.mean(pert_counts, axis=0)
    disp_df.loc[:, pert] = np.var(pert_counts, axis=0)

# Save to the uns dictionary
mean_df_dict = mean_df.to_dict(orient="list")
disp_df_dict = disp_df.to_dict(orient="list")
adata.uns["mean_dict"] = mean_df_dict
adata.uns["disp_dict"] = disp_df_dict
adata.uns["mean_disp_dict_genes"] = disp_df.index.tolist()

100%|██████████| 237/237 [00:05<00:00, 40.71it/s]


In [13]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [14]:
np.random.seed(0)
MAX_CELLS = 256
MAX_CELLS_CONTROL = 8192
pert_counts = adata.obs["condition"].value_counts()
pert_counts = pert_counts[pert_counts > MAX_CELLS]
cells_to_keep = []
for pert in pert_counts.index:
    pert_cells = adata.obs[adata.obs["condition"] == pert].index.tolist()
    if pert == "control":
        pert_cells = np.random.choice(pert_cells, size=MAX_CELLS_CONTROL, replace=False)
    else:
        pert_cells = np.random.choice(pert_cells, size=MAX_CELLS, replace=False)
    cells_to_keep.extend(pert_cells)

# Subset the adata object
adata = adata[cells_to_keep]

In [15]:
adata

View of AnnData object with n_obs × n_vars = 52992 × 22608
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'condition', 'n_genes'
    var: 'ensemble_id', 'ncounts', 'ncells', 'n_cells'
    uns: 'mean_dict', 'disp_dict', 'mean_disp_dict_genes', 'log1p'
    layers: 'counts'

In [16]:
# Get 8192 HVGs -- subset the adata object to only include the HVGs
sc.pp.highly_variable_genes(adata, n_top_genes=8192, subset=True)

  adata.uns["hvg"] = {"flavor": flavor}


# Calculate DEGs as in the paper

In [17]:
print(adata.X)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 47198615 stored elements and shape (52992, 8192)>
  Coords	Values
  (0, 49)	1.6189128160476685
  (0, 57)	0.962807834148407
  (0, 62)	0.962807834148407
  (0, 95)	0.593061625957489
  (0, 115)	0.593061625957489
  (0, 119)	0.962807834148407
  (0, 127)	2.3715736865997314
  (0, 132)	0.593061625957489
  (0, 136)	2.512301206588745
  (0, 139)	0.593061625957489
  (0, 161)	0.593061625957489
  (0, 170)	1.232140302658081
  (0, 173)	1.767657995223999
  (0, 186)	1.232140302658081
  (0, 190)	0.962807834148407
  (0, 195)	0.593061625957489
  (0, 196)	1.232140302658081
  (0, 206)	0.962807834148407
  (0, 208)	0.962807834148407
  (0, 217)	0.593061625957489
  (0, 225)	3.800288438796997
  (0, 227)	0.593061625957489
  (0, 231)	0.962807834148407
  (0, 241)	1.232140302658081
  (0, 251)	1.767657995223999
  :	:
  (52991, 8090)	0.3413776755332947
  (52991, 8094)	0.9660472869873047
  (52991, 8098)	0.3413776755332947
  (52991, 8115)	1.960554838180542
  (5

In [18]:
adata

AnnData object with n_obs × n_vars = 52992 × 8192
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'condition', 'n_genes'
    var: 'ensemble_id', 'ncounts', 'ncells', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'mean_dict', 'disp_dict', 'mean_disp_dict_genes', 'log1p', 'hvg'
    layers: 'counts'

In [19]:
def compute_degs(adata, mode="vsrest", pval_threshold=0.05):
    """
    Compute differentially expressed genes (DEGs) for each perturbation.

    Args:
        adata: AnnData object with processed data
        mode: 'vsrest' or 'vscontrol'
            - 'vsrest': Compare each perturbation vs all other perturbations (excluding control)
            - 'vscontrol': Compare each perturbation vs control only
        pval_threshold: P-value threshold for significance (default: 0.05)

    Returns:
        dict: rank_genes_groups results dictionary

    Adds to adata.uns:
        - deg_dict_{mode}: Dictionary with perturbation as key and dict with 'up'/'down' DEGs as values
        - rank_genes_groups_{mode}: Full rank_genes_groups results
    """
    if mode == "vsrest":
        # Remove control cells for vsrest analysis
        adata_subset = adata[adata.obs["condition"] != "control"].copy()
        reference = "rest"
    elif mode == "vscontrol":
        # Use full dataset for vscontrol analysis
        adata_subset = adata.copy()
        reference = "control"
    else:
        raise ValueError("mode must be 'vsrest' or 'vscontrol'")

    # Compute DEGs
    sc.tl.rank_genes_groups(adata_subset, "condition", method="t-test_overestim_var", reference=reference)

    # Extract results
    names_df = pd.DataFrame(adata_subset.uns["rank_genes_groups"]["names"])
    pvals_adj_df = pd.DataFrame(adata_subset.uns["rank_genes_groups"]["pvals_adj"])
    logfc_df = pd.DataFrame(adata_subset.uns["rank_genes_groups"]["logfoldchanges"])

    # For each perturbation, get the significant DEGs up and down regulated
    deg_dict = {}
    for pert in tqdm(adata_subset.obs["condition"].unique(), desc=f"Computing DEGs {mode}"):
        if mode == "vscontrol" and pert == "control":
            continue  # Skip control when comparing vs control

        pert_degs = names_df[pert]
        pert_pvals = pvals_adj_df[pert]
        pert_logfc = logfc_df[pert]

        # Get significant DEGs
        significant_mask = pert_pvals < pval_threshold
        pert_degs_sig = pert_degs[significant_mask]
        pert_logfc_sig = pert_logfc[significant_mask]

        # Split into up and down regulated
        pert_degs_sig_up = pert_degs_sig[pert_logfc_sig > 0].tolist()
        pert_degs_sig_down = pert_degs_sig[pert_logfc_sig < 0].tolist()

        deg_dict[pert] = {"up": pert_degs_sig_up, "down": pert_degs_sig_down}

    # Save results to adata.uns
    adata.uns[f"deg_dict_{mode}"] = deg_dict
    adata.uns[f"rank_genes_groups_{mode}"] = adata_subset.uns["rank_genes_groups"].copy()

    return adata_subset.uns["rank_genes_groups"]

In [20]:
def calculate_weights(adata, mode="vsrest"):
    # 1. Compute DEGs
    compute_degs(adata, mode=mode, pval_threshold=0.05)

    # 2. Retrieve the results dynamically based on mode
    key = f"rank_genes_groups_{mode}"

    # scanpy returns 'names' and 'scores' as structured arrays
    # columns are groups (perturbations), rows are ranks (0 to n_genes)
    scores_df = pd.DataFrame(adata.uns[key]["scores"])
    names_df = pd.DataFrame(adata.uns[key]["names"])

    # 3. Apply transformations (Abs -> MinMax -> Square)
    abs_scores = scores_df.abs()

    # Min-max normalization per perturbation (axis=0 is default for min/max)
    # This scales everything to [0, 1]
    denom = abs_scores.max() - abs_scores.min()
    # Handle cases where max == min (e.g., all zeros) to avoid divide-by-zero
    denom[denom == 0] = 1.0

    norm_scores = (abs_scores - abs_scores.min()) / denom

    # Square the weights to accentuate differences [cite: 163]
    norm_scores = norm_scores**2

    # 4. Normalize to sum to 1 per perturbation [cite: 163]
    # This turns them into a probability distribution
    norm_scores = norm_scores / norm_scores.sum(axis=0)

    # 5. REINDEXING (Crucial Step)
    # We need to map these scores back to the actual gene order in adata.var_names
    # We iterate over columns because each column has a different gene order in 'names_df'
    aligned_weights = pd.DataFrame(index=adata.var_names, columns=norm_scores.columns)

    print("Aligning weights to gene list...")
    for pert in tqdm(norm_scores.columns):
        # Create a series: index=GeneName, value=Weight
        pert_weights = pd.Series(norm_scores[pert].values, index=names_df[pert].values)
        # Reindex to match the exact order of adata.var_names
        aligned_weights[pert] = pert_weights.reindex(adata.var_names).fillna(0)

    # 6. Return the aligned weights dataframe
    # Rows = Genes (aligned with adata.X), Columns = Perturbations
    return aligned_weights

In [21]:
a = calculate_weights(adata, mode="vsrest")

  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group

Aligning weights to gene list...


100%|██████████| 175/175 [00:00<00:00, 2546.90it/s]


In [22]:
b = calculate_weights(adata, mode="vscontrol")

  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group

Aligning weights to gene list...


100%|██████████| 175/175 [00:00<00:00, 2695.97it/s]


In [23]:
a

Unnamed: 0,AHR,AHR+FEV,AHR+KLF1,ARRDC3,ATL1,BAK1,BCL2L11,BCL2L11+TGFBR2,BCORL1,BPGM,...,UBASH3B+PTPN12,UBASH3B+PTPN9,UBASH3B+UBASH3A,ZBTB1,ZBTB10+PTPN12,ZBTB25,ZC3HAV1,ZC3HAV1+CEBPE,ZC3HAV1+HOXC13,ZNF318
RP11-34P13.3,8.526933e-06,2.298661e-06,4.129850e-06,0.000012,6.881904e-06,8.379162e-06,9.284104e-06,8.835398e-06,9.241544e-06,7.896461e-06,...,5.717726e-06,4.987768e-06,1.012097e-05,7.679796e-06,4.484755e-06,9.877135e-06,7.234437e-05,5.992180e-06,7.783971e-06,9.577107e-06
RP11-34P13.7,6.111642e-05,4.998302e-05,2.960052e-05,0.000089,4.932575e-05,6.005728e-05,1.898704e-05,1.292435e-04,6.623836e-05,2.086583e-04,...,4.098155e-05,3.574961e-05,9.240668e-05,6.130959e-06,3.214428e-05,1.536447e-05,9.581151e-06,1.288613e-07,5.579128e-05,6.864350e-05
RP11-34P13.8,2.148207e-06,5.791062e-07,1.040441e-06,0.000003,1.733772e-06,2.110979e-06,1.445466e-04,2.225919e-06,2.328240e-06,1.989371e-06,...,1.440478e-06,1.256578e-06,2.549796e-06,1.934786e-06,1.129853e-06,2.488366e-06,1.405935e-06,1.509621e-06,1.961031e-06,2.412780e-06
FO538757.2,5.020519e-06,6.951217e-05,4.434844e-05,0.000905,1.466553e-04,1.138203e-03,1.300191e-04,7.831908e-06,7.113353e-05,1.275870e-05,...,4.934236e-07,2.484918e-04,3.227707e-06,6.699137e-05,7.913893e-05,1.838055e-04,2.563463e-05,9.765943e-05,5.188992e-05,1.368634e-05
RP5-857K21.2,8.307711e-07,2.239564e-07,4.023674e-07,0.000001,6.704974e-07,8.163739e-07,9.045415e-07,8.608245e-07,9.003950e-07,7.693448e-07,...,5.570727e-07,4.859536e-07,9.860768e-07,7.482353e-07,4.369455e-07,9.623201e-07,5.437137e-07,5.838125e-07,7.583850e-07,9.330887e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,4.077774e-03,5.856526e-05,1.042083e-03,0.001902,7.461477e-04,4.919371e-04,2.631025e-04,2.778034e-04,1.268950e-04,5.682532e-05,...,2.644988e-04,1.351829e-03,1.698693e-04,2.173608e-03,8.289453e-04,1.210864e-04,5.034637e-04,8.192301e-06,2.688338e-05,1.852905e-05
MT-CYB,1.508314e-02,9.160457e-03,2.848004e-03,0.001512,4.724842e-04,1.196746e-05,1.737148e-04,3.184816e-05,1.998841e-07,5.741882e-03,...,3.384431e-04,6.148921e-05,1.492114e-03,2.720124e-03,2.044553e-04,1.852367e-03,4.992365e-04,4.878652e-04,4.677882e-04,1.519239e-03
AC136616.1,5.438801e-05,2.344530e-05,1.055520e-05,0.000079,2.241566e-05,5.344547e-05,5.921754e-05,5.635551e-05,2.542103e-05,5.036663e-05,...,3.646982e-05,3.181388e-05,6.455540e-05,4.122843e-05,1.310607e-05,6.300011e-05,3.559525e-05,3.822040e-05,2.343963e-05,1.723325e-05
AL354822.1,3.697992e-04,8.744058e-06,9.131129e-05,0.000043,1.202086e-04,2.076476e-05,1.162419e-05,7.537440e-05,1.595778e-04,2.616322e-05,...,1.144762e-04,1.289792e-05,1.961913e-04,4.870363e-04,2.160832e-05,1.568050e-05,3.141780e-05,1.468785e-05,4.468145e-05,1.972371e-06


In [24]:
b

Unnamed: 0,AHR,AHR+FEV,AHR+KLF1,ARRDC3,ATL1,BAK1,BCL2L11,BCL2L11+TGFBR2,BCORL1,BPGM,...,UBASH3B+PTPN12,UBASH3B+PTPN9,UBASH3B+UBASH3A,ZBTB1,ZBTB10+PTPN12,ZBTB25,ZC3HAV1,ZC3HAV1+CEBPE,ZC3HAV1+HOXC13,ZNF318
RP11-34P13.3,0.000005,1.529191e-06,0.000004,0.000013,0.000004,1.593935e-05,1.766864e-05,0.000014,0.000011,0.000006,...,3.213817e-06,0.000003,0.000006,4.613692e-06,2.556275e-06,0.000008,9.379919e-05,0.000006,0.000005,0.000011
RP11-34P13.7,0.000065,3.002594e-05,0.000052,0.000162,0.000047,2.022525e-04,1.300669e-05,0.000161,0.000140,0.000144,...,4.077974e-05,0.000035,0.000049,2.728198e-07,3.243626e-05,0.000003,5.187097e-06,0.000002,0.000061,0.000138
RP11-34P13.8,0.000003,8.526382e-07,0.000002,0.000007,0.000002,8.887379e-06,2.914075e-04,0.000008,0.000006,0.000003,...,1.791943e-06,0.000002,0.000004,2.572477e-06,1.425314e-06,0.000004,5.584428e-06,0.000003,0.000003,0.000006
FO538757.2,0.000057,1.770656e-05,0.000003,0.000534,0.000021,1.503579e-03,3.275422e-05,0.000181,0.000001,0.000094,...,2.581753e-05,0.000067,0.000066,1.366849e-06,1.172773e-04,0.000031,1.084957e-06,0.000017,0.000136,0.000020
RP5-857K21.2,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND6,0.001242,5.329586e-06,0.000332,0.000373,0.001243,1.304819e-10,1.058290e-04,0.000057,0.000151,0.000159,...,6.964248e-04,0.001730,0.000078,4.666366e-04,1.217498e-03,0.000147,4.209610e-05,0.000544,0.000429,0.000988
MT-CYB,0.012985,7.615157e-03,0.004957,0.000602,0.000823,2.811150e-04,7.643163e-07,0.000163,0.000421,0.007373,...,2.993697e-05,0.000007,0.000433,1.192505e-03,1.423087e-05,0.000788,2.571579e-04,0.000123,0.000896,0.000851
AC136616.1,0.000032,1.740934e-05,0.000015,0.000080,0.000016,1.004018e-04,1.112946e-04,0.000085,0.000043,0.000038,...,2.024380e-05,0.000018,0.000040,3.071042e-05,1.032205e-05,0.000050,6.308797e-05,0.000039,0.000020,0.000030
AL354822.1,0.000105,2.613938e-05,0.000025,0.000004,0.000159,1.418627e-05,3.110095e-05,0.000003,0.000041,0.000001,...,1.567342e-04,0.000048,0.000306,5.107800e-04,2.095842e-09,0.000011,4.055090e-14,0.000111,0.000109,0.000045


In [34]:
a["AHR"].sort_values(ascending=False)[:50]

TESC          0.020916
MT-ND4        0.016959
MT-CYB        0.015083
AHR           0.013504
RGS10         0.013057
RSAD2         0.013046
CTTNBP2       0.012621
KIT           0.010042
MT-CO3        0.009717
MT-ATP6       0.009062
RNF144A       0.008944
PTRF          0.008637
FAM89A        0.008107
MT-ND3        0.007249
MT-ND5        0.006460
CMPK2         0.006035
TIPARP        0.005887
LGALS1        0.005679
LMNA          0.005200
MT-CO2        0.004868
DMTN          0.004549
TUBB2B        0.004404
BHLHE40       0.004267
MT-ND6        0.004078
SH3BGRL3      0.004074
RGS6          0.003886
APOE          0.003876
MYBL2         0.003854
F2RL3         0.003563
ARPC1B        0.003537
MT1F          0.003181
MT-CO1        0.003040
CHI3L2        0.002986
SEPP1         0.002859
AC006262.5    0.002842
ATP6V0A1      0.002836
GMPPA         0.002750
LAPTM5        0.002541
HEMGN         0.002362
NLRP2         0.002345
S100A13       0.002268
ABCC4         0.002255
TNFSF12       0.002106
BTG2       

In [33]:
b["AHR"].sort_values(ascending=False)

TESC             0.016024
MT-ND4           0.014277
MT-CYB           0.012985
RSAD2            0.011355
AHR              0.009069
                   ...   
CTC-384G19.1     0.000000
RP11-168E17.1    0.000000
CPEB2-AS1        0.000000
CD38             0.000000
C10orf107        0.000000
Name: AHR, Length: 8192, dtype: float32

In [38]:
q = a["AHR"].sort_values(ascending=False)[:50]
qq = b["AHR"].sort_values(ascending=False)[:50]

In [44]:
len(set(q.index).intersection(set(qq.index)))

30