In [19]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
from datasketch import MinHash, MinHashLSH

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/by_chromosome/singlecell_mESC_1000000_chr11.h5ad"

start_time = time.time()  # Record the start time
adata = sc.read_h5ad(fpath)
end_time = time.time()  # Record the end time
print(f"Time taken to read the file: {end_time - start_time:.2f} seconds")
sc.logging.print_memory_usage()
adata

Time taken to read the file: 16.98 seconds
Memory usage: current 1.33 GB, difference +1.33 GB


AnnData object with n_obs × n_vars = 119 × 612987
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom', 'chrom_bin', 'degree', 'genes', 'n_genes', 'ATACSeq_1', 'ATACSeq_2', 'ATACSeq_3', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_1', 'RNA_2', 'RNA_3', 'RNA_4', 'RNA_5', 'RNA_6', 'chrom_degree'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes', 'chrom_order'
    uns: 'base_resolution', 'chrom_sizes', 'gene_map'

In [42]:
def find_similar_entries_minhash(arr, threshold=0.5, num_perm=128):
    """Finds similar entries in an array using MinHash for Jaccard similarity approximation.

    Args:
        arr (numpy.ndarray): The input array of strings representing sets.
        threshold (float): The minimum Jaccard similarity for two sets to be considered similar.
        num_perm (int): The number of permutations used for MinHash.

    Returns:
        list: A list of lists, where each inner list contains similar entries.
    """
    
    minhashes = {}
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)

    for i, entry in enumerate(arr):
        m = MinHash(num_perm=num_perm)
        for val in map(int, entry.split(';')):
            m.update(str(val).encode('utf8'))  # Correct encoding for integers
        lsh.insert(str(i), m)
        minhashes[i] = m

    similar_groups = []
    processed = set()
    for i in range(len(arr)):
        if i in processed:
            continue
        similar = [arr[i]]
        result = lsh.query(minhashes[i])  
        for key in result:
            j = int(key)
            if j != i and j not in processed:  # Avoid self and duplicates
                similar.append(arr[j])
                processed.add(j)  # Mark as processed
        if len(similar) > 1:
            similar_groups.append(similar)

    # we only need the unique bin pairs to flag duplicates
    similar_groups = [list(set(x)) for x in similar_groups] 
    similar_groups = [item for sublist in similar_groups for item in sublist]
    return similar_groups

In [59]:
threshold = 0.5
result = []

def get_edgelist(hyperedge):
    nodes_in_hyperedge = sorted(np.nonzero(hyperedge)[0])
    return ";".join(map(str, nodes_in_hyperedge))

def annotate_column(df, reference_list):
  def get_index(row):
    try:
      return reference_list.index(row)
    except ValueError:
      return -1
  return df.apply(get_index)

for cell_id, group in adata.var.groupby('basename'):

    print(cell_id, len(group))

    # data structure for the hyperedges
    scdata =  adata[:, group.index].copy()
    df = pd.DataFrame.sparse.from_spmatrix(
        scdata.X,
        index=scdata.obs_names,
        columns=scdata.var_names,
    ).T 

    # data structure for the results
    annot = pd.DataFrame({
        'read_id' : df.index,
        'mapping_quality' : scdata.var['mean_mapq'].values
    })
    annot['cell_id'] = cell_id
    annot['exactly_unique'] = np.ravel(~df.duplicated())

    # get hyperedges for hasing 
    hyperedges = df.apply(get_edgelist, axis=1)

    # find and annotate nearly-identical read groups
    duplicated_hyperedges = find_similar_entries_minhash(
        hyperedges, 
        threshold=threshold,
    )
    annot['approximately_unique'] = np.ravel(~hyperedges.isin(duplicated_hyperedges))
    annot['read_group'] = np.ravel(annotate_column(hyperedges, duplicated_hyperedges))
    
    # mark duplicates
    annot['unique'] = (annot.groupby('read_group')['mapping_quality'].transform(
        pd.Series.rank,
        method='first',
        ascending=False) == 1)

    # make sure that truly unique reads are retained
    annot['unique'] = np.where(annot['exactly_unique'], True, annot['unique'])
    annot['unique'] = np.where(annot['approximately_unique'], True, annot['unique'])
    
    result.append(annot)


result = pd.concat(result)
result.head(25)

o1b01 3
o1b02 10866
12


Unnamed: 0,read_id,mapping_quality,cell_id,exactly_unique,approximately_unique,read_group,unique
0,f7e5f4ed-513c-48c0-ad61-155c415f2668,34.333333,o1b01,True,True,-1,True
1,d5667905-26a7-4115-9b2b-15d5a158bf34,19.75,o1b01,True,True,-1,True
2,e5725e1c-5844-4da7-b672-adf89022ab1e_1,41.777778,o1b01,True,True,-1,True
0,0648a3fc-a3cd-47ec-9fcf-6dcada42698b_2,52.333333,o1b02,True,False,0,True
1,3f148e34-5e77-4365-aebb-a43dc2ee31a1,60.0,o1b02,False,False,0,True
2,1fe9644d-d6e5-46f0-85ad-5eda1e2fcc5f,60.0,o1b02,False,False,0,False
3,589eefa2-df2e-4f29-afee-c603178f93d4,60.0,o1b02,False,False,0,False
4,73961b79-9c51-43e8-a118-7aec7c17ad20,60.0,o1b02,False,False,0,False
5,0d3d1058-cec1-4ce1-831a-7ff48b8453bf,60.0,o1b02,False,False,0,False
6,704e4905-613d-45ea-b907-caa1eaa56efa,60.0,o1b02,False,False,0,False


In [52]:
adata.var.head()

Unnamed: 0_level_0,read_index,basename,mean_mapq,median_mapq,n_chromosomes,order,n_bins,read_length_bp,genes,n_genes,chrom_order
read_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3e6dd122-d8f5-562f-86de-fc4e384a667a,66,o3b03,51.272727,60.0,2,22,4,7766,Tcf7;Zfp354c;Ttc6,3,3
6fe26b23-bd46-42ce-ae69-b2fb8d1c5945,969,o3b43,50.631579,60.0,3,38,8,12637,Slc4a4;Bcl6b;Cobl;Snhg15;Tns3,5,6
854ca2b7-2a70-4a17-a55c-0a552ec4d99c,970,o3b43,49.487179,60.0,2,39,7,12769,Bcl6b;Cobl;Snhg15;Tns3,4,6
f15777ac-4735-4176-9d79-6fc1b35a6e4b,972,o3b43,54.047619,60.0,3,42,8,13130,Slc4a4;Bcl6b;Cobl;Snhg15;Tns3,5,6
fe4b371f-28bd-45eb-8395-a74eccc2e2d8,976,o3b43,52.853659,60.0,3,41,8,13166,Slc4a4;Bcl6b;Cobl;Snhg15;Tns3,5,6


In [49]:
hyperedges.isin(duplicated_hyperedges)

read_name
0648a3fc-a3cd-47ec-9fcf-6dcada42698b_2     True
3f148e34-5e77-4365-aebb-a43dc2ee31a1       True
1fe9644d-d6e5-46f0-85ad-5eda1e2fcc5f       True
589eefa2-df2e-4f29-afee-c603178f93d4       True
73961b79-9c51-43e8-a118-7aec7c17ad20       True
                                          ...  
454a829f-345b-4bc8-85d7-d6f79e1b6dea      False
d52ca40a-db76-50a8-8b70-4dc66d6484c7      False
f61ca7ab-136e-45f0-882c-80bbd068e8df       True
adac49ef-c2fd-44b8-b215-0af1b665cfa0       True
ea1b3a50-a952-4b06-91bc-e18a8c522b43       True
Length: 10866, dtype: bool

In [7]:
?scdata.to_df

[0;31mSignature:[0m [0mscdata[0m[0;34m.[0m[0mto_df[0m[0;34m([0m[0mlayer[0m[0;34m:[0m [0;34m'str | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m)[0m [0;34m->[0m [0;34m'pd.DataFrame'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generate shallow :class:`~pandas.DataFrame`.

The data matrix :attr:`X` is returned as
:class:`~pandas.DataFrame`, where :attr:`obs_names` initializes the
index, and :attr:`var_names` the columns.

* No annotations are maintained in the returned object.
* The data matrix is densified in case it is sparse.

Params
------
layer
    Key for `.layers`.

Returns
-------
Pandas DataFrame of specified data matrix.
[0;31mFile:[0m      ~/miniconda3/envs/scanpy/lib/python3.12/site-packages/anndata/_core/anndata.py
[0;31mType:[0m      method