In [1]:
import h5py
import pandas as pd
from scipy.sparse import csc_matrix

In [2]:
def load_hdf5_as_dataframe(h5_file_path):
    """
    Loads an HDF5 file containing a sparse gene expression matrix and converts it to a dense pandas DataFrame.

    Parameters:
        h5_file_path (str): Path to the HDF5 file.

    Returns:
        pd.DataFrame: Gene expression matrix with rows as cell barcodes and columns as gene names.
    """
    with h5py.File(h5_file_path, "r") as f:
        # Extract matrix components
        matrix_group = f["matrix"]
        data = matrix_group["data"][:]
        indices = matrix_group["indices"][:]
        indptr = matrix_group["indptr"][:]
        shape = tuple(matrix_group["shape"][:])

        # Build sparse matrix
        sparse_matrix = csc_matrix((data, indices, indptr), shape=shape)

        # Extract barcodes (row identifiers) and gene names (column identifiers)
        cell_barcodes = matrix_group["barcodes"][:].astype(str)
        gene_names = matrix_group["features"]["name"][:].astype(str)

        # Convert sparse matrix to a dense DataFrame
        expression_df = pd.DataFrame(sparse_matrix.T.toarray(), index=cell_barcodes, columns=gene_names)

    return expression_df

In [3]:
xenium_v1_df = load_hdf5_as_dataframe("cell_feature_matrix_v1.h5")
xenium_v2_df = load_hdf5_as_dataframe("cell_feature_matrix_v2.h5")

In [4]:
print("Xenium v2 Shape:", xenium_v2_df.shape)  # Should be (num_cells, 5000)
print("Xenium v1 Shape:", xenium_v1_df.shape)  # Should be (num_cells, 300)

Xenium v2 Shape: (278328, 10029)
Xenium v1 Shape: (278659, 541)


In [5]:
print("Xenium v1 Preview:\n", xenium_v1_df.head())

Xenium v1 Preview:
             ACE  ACE2  ACKR1  ADAM17  ADAM28  ADAMTS1  ADGRL4  AGER  AGR3  \
aaaadcac-1    0     0      0       0       0        0       0     0     2   
aaaaieak-1    0     0      0       0       0        0       0     0    11   
aaaanbhm-1    2     0      0       5       0        0       0     0     1   
aaaankfe-1    0     0      0       1       0        0       0     0    27   
aaaankfi-1    0     0      0       0       0        0       0     0     0   

            AIF1  ...  UnassignedCodeword_0493  UnassignedCodeword_0494  \
aaaadcac-1     0  ...                        0                        0   
aaaaieak-1     0  ...                        0                        0   
aaaanbhm-1     2  ...                        0                        0   
aaaankfe-1     0  ...                        0                        0   
aaaankfi-1     0  ...                        0                        0   

            UnassignedCodeword_0495  UnassignedCodeword_0496  \
aa

In [6]:
print("Xenium v2 Preview:\n", xenium_v2_df.head())

Xenium v2 Preview:
             A2ML1  AAMP  AAR2  AARSD1  ABAT  ABCA1  ABCA3  ABCA4  ABCA7  \
aaaaadnb-1      0     0     0       0     0      0      0      0      0   
aaaabalp-1      0     0     0       0     0      0      0      0      0   
aaaadfei-1      0     0     0       0     0      0      0      0      0   
aaaadjia-1      0     0     0       0     0      0      0      0      0   
aaaafglb-1      0     0     0       2     0      0      0      0      1   

            ABCB1  ...  DeprecatedCodeword_18589  DeprecatedCodeword_18601  \
aaaaadnb-1      0  ...                         0                         0   
aaaabalp-1      0  ...                         0                         0   
aaaadfei-1      0  ...                         0                         0   
aaaadjia-1      0  ...                         0                         0   
aaaafglb-1      1  ...                         0                         0   

            DeprecatedCodeword_18609  DeprecatedCodeword_186

In [7]:
# Check for missing values
print("Missing values in Xenium v2:", xenium_v2_df.isnull().sum().sum())
print("Missing values in Xenium v1:", xenium_v1_df.isnull().sum().sum())

Missing values in Xenium v2: 0
Missing values in Xenium v1: 0


In [8]:
# Find common cells
common_cells = set(xenium_v1_df.index) & set(xenium_v2_df.index)
print(f"Number of common cells: {len(common_cells)}")

Number of common cells: 16


In [12]:
print(common_cells)

['fppfflga-1', 'gdphoaij-1', 'nodkllil-1', 'ekkeebih-1', 'jbckkggo-1', 'cclepell-1', 'dkhjafpj-1', 'fcilpdfj-1', 'gkblabnd-1', 'fdapelpa-1', 'kmingbmg-1', 'mkilfjpa-1', 'nfjajmgc-1', 'kjcnelmc-1', 'bembdein-1', 'hfdbkegl-1']


In [9]:
# Find common genes
common_genes = set(xenium_v1_df.columns) & set(xenium_v2_df.columns)
print(f"Number of common genes: {len(common_genes)}")

Number of common genes: 232


In [13]:
print(common_genes)

['CD1A', 'MS4A2', 'TSPAN8', 'FGFBP2', 'KLF5', 'KLRC1', 'SLC7A11', 'CDK1', 'TNFRSF17', 'KDR', 'TOP2A', 'NegControlProbe_00002', 'UnassignedCodeword_0120', 'SOX2', 'NegControlProbe_00019', 'TP63', 'UnassignedCodeword_0241', 'CFTR', 'UnassignedCodeword_0428', 'CD19', 'CCNA1', 'TREM2', 'FCGR3A', 'SELE', 'FCMR', 'CD40', 'NFKB1', 'SLC1A3', 'FGFR4', 'LILRA5', 'BANK1', 'GLIPR2', 'TNFRSF18', 'FSCN1', 'POU2AF1', 'CTSL', 'CD3E', 'NegControlProbe_00035', 'FOXP3', 'PLN', 'WNT2', 'ADGRL4', 'MMP9', 'CD300E', 'ANPEP', 'FOXI1', 'SLC2A1', 'CD68', 'NegControlProbe_00042', 'NegControlProbe_00031', 'SYK', 'UnassignedCodeword_0437', 'MUC1', 'NegControlProbe_00034', 'GZMA', 'UnassignedCodeword_0221', 'TRPC6', 'PIM2', 'RETN', 'PDGFRB', 'FCN1', 'IQGAP2', 'TCL1A', 'AGER', 'FCGR1A', 'SOX9', 'GJA5', 'STEAP4', 'EHF', 'ACE2', 'RUNX3', 'CXCL5', 'SPIB', 'FASN', 'LGR5', 'FAS', 'PDPN', 'SERPINA3', 'PCNA', 'CD34', 'LTBP2', 'MUC5B', 'ACE', 'TC2N', 'UPK3B', 'GZMK', 'MAP7', 'MET', 'KLRD1', 'LILRB4', 'MMP12', 'HIF1A', 'STAT

In [18]:
max_length = max(len(xenium_v1_df.index), len(xenium_v2_df.index))

v1_barcodes = pd.Series(xenium_v1_df.index, name="Xenium_v1_Cells").reindex(range(max_length))
v2_barcodes = pd.Series(xenium_v2_df.index, name="Xenium_v2_Cells").reindex(range(max_length))

cell_mapping_df = pd.DataFrame({"Xenium_v2_Cells": v2_barcodes, "Xenium_v1_Cells": v1_barcodes})
cell_mapping_df.to_csv("cell_barcodes_mapping.csv", index=False)

print("CSV file 'cell_barcodes_mapping.csv' has been saved.")

CSV file 'cell_barcodes_mapping.csv' has been saved.
