In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
import gget
import scipy
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random
from importlib import reload
import warnings
import ot
from scipy.spatial.distance import pdist, squareform

import surprise as sup

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

"""WARNING: no warnings"""
warnings.filterwarnings("ignore")

# local imports
import anndata_utils as anntools

source_path = os.path.abspath("../source/")
sys.path.append(source_path)
import centrality as central
import matrix
import utils as ut
import plotting as plt2

In [2]:
resolution = 1000000
chrom = "chr2"

dpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/by_chromosome/"

file_list = sorted(glob.glob(f"{dpath}*_{resolution}_{chrom}*"))
print(file_list)

population_path = file_list[0]
singlecell_path = file_list[1]

print()

print(f"{population_path=}")
print(f"{singlecell_path=}")

['/scratch/indikar_root/indikar1/shared_data/higher_order/by_chromosome/population_mESC_1000000_chr2.h5ad', '/scratch/indikar_root/indikar1/shared_data/higher_order/by_chromosome/singlecell_mESC_1000000_chr2.h5ad']

population_path='/scratch/indikar_root/indikar1/shared_data/higher_order/by_chromosome/population_mESC_1000000_chr2.h5ad'
singlecell_path='/scratch/indikar_root/indikar1/shared_data/higher_order/by_chromosome/singlecell_mESC_1000000_chr2.h5ad'


# load population

In [3]:
start_time = time.time()  # Record the start time
adata = sc.read_h5ad(population_path)
end_time = time.time()  # Record the end time
print(f"Time taken to read the file: {end_time - start_time:.2f} seconds")
sc.logging.print_memory_usage()
adata

Time taken to read the file: 4.35 seconds
Memory usage: current 0.95 GB, difference +0.95 GB


AnnData object with n_obs × n_vars = 179 × 186499
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom', 'chrom_bin', 'degree', 'genes', 'n_genes', 'ATACSeq_1', 'ATACSeq_2', 'ATACSeq_3', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_1', 'RNA_2', 'RNA_3', 'RNA_4', 'RNA_5', 'RNA_6', 'chrom_degree'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes', 'chrom_order'
    uns: 'base_resolution', 'chrom_sizes', 'gene_map'

# load single-cell

In [4]:
start_time = time.time()  # Record the start time
bdata = sc.read_h5ad(singlecell_path)
end_time = time.time()  # Record the end time
print(f"Time taken to read the file: {end_time - start_time:.2f} seconds")
sc.logging.print_memory_usage()
bdata

Time taken to read the file: 32.27 seconds
Memory usage: current 2.84 GB, difference +1.89 GB


AnnData object with n_obs × n_vars = 179 × 1865516
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom', 'chrom_bin', 'degree', 'genes', 'n_genes', 'ATACSeq_1', 'ATACSeq_2', 'ATACSeq_3', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_1', 'RNA_2', 'RNA_3', 'RNA_4', 'RNA_5', 'RNA_6', 'chrom_degree'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes', 'chrom_order'
    uns: 'base_resolution', 'chrom_sizes', 'gene_map'

# QC

In [5]:
def find_outliers_iqr(df_column):
  """
  Identifies outliers in a pandas DataFrame column using the IQR method.

  Args:
    df_column: A pandas Series representing the column to analyze.

  Returns:
    A boolean mask with True for outliers and False otherwise.
  """
  Q1 = df_column.quantile(0.15)
  Q3 = df_column.quantile(0.85)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  return (df_column < lower_bound) | (df_column > upper_bound)

adata.obs['degree_outlier'] = find_outliers_iqr(adata.obs['chrom_degree'])

adata.obs[adata.obs['degree_outlier']][['chrom_bin', 'chrom_degree', 'degree_outlier']].head()

Unnamed: 0_level_0,chrom_bin,chrom_degree,degree_outlier
bin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr2:98,98,93161,True
chr2:177,177,3700,True
chr2:181,181,1340,True


In [6]:
# remove outliers
remove_bins = adata.obs[adata.obs['degree_outlier']].index.to_list()
print(f"Removing top {len(remove_bins)} outlier loci: ")
print(remove_bins)

adata = adata[~adata.obs_names.isin(remove_bins), :].copy()
bdata = bdata[~bdata.obs_names.isin(remove_bins), :].copy()

print('done!')

Removing top 3 outlier loci: 
['chr2:98', 'chr2:177', 'chr2:181']
done!


# Clique-expand

In [7]:
matrix.expand_and_normalize_anndata(adata, oe_kr=True)
print()
matrix.expand_and_normalize_anndata(bdata, oe_kr=True)

adata

Expanding input matrix...
Applying KR normalization...
Applying OE normalization...
Normalization complete.

Expanding input matrix...
Applying KR normalization...
Applying OE normalization...
Normalization complete.


AnnData object with n_obs × n_vars = 176 × 186499
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom', 'chrom_bin', 'degree', 'genes', 'n_genes', 'ATACSeq_1', 'ATACSeq_2', 'ATACSeq_3', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_1', 'RNA_2', 'RNA_3', 'RNA_4', 'RNA_5', 'RNA_6', 'chrom_degree', 'degree_outlier'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes', 'chrom_order'
    uns: 'base_resolution', 'chrom_sizes', 'gene_map'
    obsm: 'A', 'A_kr', 'A_oe', 'A_oe_kr'

# Centrality measures

In [8]:
ce_centralities = {
    'ce_eigenvector_centrality' : {
        'function' : nx.eigenvector_centrality,
        'weight' : True
    },
    'ce_betweenness_centrality' : {
        'function' : nx.betweenness_centrality,
        'weight' : True
    },
    'ce_pagerank' : {
        'function' : nx.pagerank,
        'weight' : True
    },
}

obsm_key = 'A_oe'
A = adata.obsm[obsm_key].copy()
# A = A.mask(np.eye(A.shape[0], dtype=bool), 0)

G = nx.from_pandas_adjacency(A)
print(G)

for label, d in ce_centralities.items():
    if d['weight']:
        centrality = d['function'](G, weight='weight')
    else:
        centrality = d['function'](G)
        
    adata.obs[label] = adata.obs.index.map(centrality)
    adata.obs[label] = ut.min_max(adata.obs[label])

adata.obs[list(ce_centralities.keys())].head()

Graph with 176 nodes and 15549 edges


Unnamed: 0_level_0,ce_eigenvector_centrality,ce_betweenness_centrality,ce_pagerank
bin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr2:3,0.369326,0.177194,0.379051
chr2:4,0.418013,0.049221,0.427527
chr2:5,0.54051,0.008203,0.530948
chr2:6,0.522232,0.080394,0.532509
chr2:7,0.310589,0.148482,0.333318


# higher-order centralities

In [9]:
# add the principal singular value of the incidence matrix
H = adata.to_df().copy()
print(f"Raw: {H.shape=}")
H = H.T.drop_duplicates().T
print(f"De-duped: {H.shape=}")

node_weight_attr = 'ATACSeq_1' # must be an obs column
node_weights = adata.obs.loc[H.index, node_weight_attr].values


svd = TruncatedSVD(n_components=1, n_iter=10)
adata.obs['singular_vector_1'] = ut.min_max(svd.fit_transform(H))

# hypergraph centralities
hge_functions = {
    'hge_logexp_unweighted' : {
        'function' : 'log-exp',
        'weights' : None,
    },
    'hge_logexp_degree_weighted' : {
        'function' : 'log-exp',
        'weights' : 1 / (H.sum(axis=1).values + 1),
    },
    'hge_logexp_RNA_weighted' : {
        'function' : 'log-exp',
        'weights' : 1 / (adata.obs.loc[H.index, 'RNA_2'].values + 1)
    },
    'hge_logexp_ATAC_weighted' : {
        'function' : 'log-exp',
        'weights' : 1 / (adata.obs.loc[H.index, 'ATACSeq_1'].values + 1)
    },
}


hge_centralities = []

for label, d in hge_functions.items():
    start_time = time.time()  # Record start time
    node, edge = central.nonlinear_eigenvector_centrality(
        H,
        function=d['function'],
        node_weights=d['weights'],
    )

    hge_centralities.append(label)
    adata.obs[label] = ut.min_max(node)

    end_time = time.time()  # Record end time
    print(f"{label} calculation took: {end_time - start_time:.2f} seconds")

adata

Raw: H.shape=(176, 186499)
De-duped: H.shape=(176, 45833)
hge_logexp_unweighted calculation took: 0.24 seconds
hge_logexp_degree_weighted calculation took: 0.19 seconds
hge_logexp_RNA_weighted calculation took: 0.27 seconds
hge_logexp_ATAC_weighted calculation took: 0.26 seconds


AnnData object with n_obs × n_vars = 176 × 186499
    obs: 'bin_index', 'bin_start', 'bin_end', 'bin', 'chrom', 'chrom_bin', 'degree', 'genes', 'n_genes', 'ATACSeq_1', 'ATACSeq_2', 'ATACSeq_3', 'CTCF', 'H3K27ac', 'H3K27me3', 'RNA_1', 'RNA_2', 'RNA_3', 'RNA_4', 'RNA_5', 'RNA_6', 'chrom_degree', 'degree_outlier', 'ce_eigenvector_centrality', 'ce_betweenness_centrality', 'ce_pagerank', 'singular_vector_1', 'hge_logexp_unweighted', 'hge_logexp_degree_weighted', 'hge_logexp_RNA_weighted', 'hge_logexp_ATAC_weighted'
    var: 'read_index', 'basename', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp', 'genes', 'n_genes', 'chrom_order'
    uns: 'base_resolution', 'chrom_sizes', 'gene_map'
    obsm: 'A', 'A_kr', 'A_oe', 'A_oe_kr'

# Extract the core from population

In [10]:
core_score = 'hge_logexp_RNA_weighted'
core_threshold_quantile = 0.75
order_threshold = 2

vector = adata.obs[core_score].values
threshold = np.quantile(vector, core_threshold_quantile)
core_nodes = adata.obs[adata.obs[core_score] > threshold].index.to_list()

# extract the core from population
core = adata[core_nodes, :].copy()
core = core[:, core.X.sum(axis=0) > order_threshold].copy()

H_core = core.to_df()
print(f"{H_core.shape=}")
core_reads = H_core.columns.to_list()
H_core.columns = [f"core_{x}" for x in H_core.columns]
H_core.head()

H_core.shape=(44, 1674)


Unnamed: 0_level_0,core_915b57dc-980a-47c2-b145-b29dd89bd459,core_01d46d89-bba9-4a8e-8356-bd99b7c903ea,core_13bd8010-e740-4012-9431-2a22aa38a247,core_ef94a24d-87f5-416b-9acb-1e70d7bc3909,core_f336223e-75bd-452b-bf92-5a98ddad84fe,core_2db40a0a-702a-4f8a-8665-70f8add3f681,core_e62f6df2-904a-49ce-a983-5a79f114c247,core_11444ae3-db72-4d13-b82e-14e2551197a1,core_29b1d600-cbf1-4b78-b670-e5e88aed08f3,core_a4d415ed-e369-4f31-99e4-9ee0ef394c9c,...,core_3ea186d6-0050-4b80-ada1-0bc2147cef78,core_e394b19b-10bb-49ab-9b0c-b77b7e58a474,core_61333a65-5967-452d-8536-94c9d05bd786,core_023f27c8-50e3-431e-8298-e6ce4dbd851e,core_18cdb59b-8101-489d-885b-b28c222c5cc5,core_b7f041c9-48ce-4abc-90e4-70ff555b3fe6,core_2d6d77c9-5f71-49b1-8ff8-716fe81e7612,core_da313fb5-dc98-496d-af2b-8441eb86e609,core_5cf1aa53-9e3d-408f-9fc0-485395ec4960,core_8c0004d4-7c36-486d-a21d-d79a51e0a86b
bin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr2:4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
chr2:11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr2:25,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
chr2:26,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,0
chr2:27,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# create a set of single-cell incidence matrices

In [11]:
incidence_matrices = {}
cell_ids = bdata.var['basename'].unique()
num_cells = len(cell_ids)
total_time = 0.0  # Initialize cumulative time

for i, cell_id in enumerate(cell_ids):
    start_time = time.time()

    # extract the single-cell
    sc_data = bdata[:, bdata.var['basename'] == cell_id].copy()
    H_o = sc_data.to_df()
    H_o = H_o.T.drop_duplicates().T  # Transpose, drop duplicates, transpose back
    H_o.columns = [f"{cell_id}_{x}" for x in H_o.columns]

    incidence_matrices[cell_id] = H_o

    # Update timing information
    elapsed_time = time.time() - start_time
    total_time += elapsed_time

    # Periodic status updates
    if (i+1) % 50 == 0:
        print(f"Processed {i+1}/{num_cells} cells. "
              f"Time for last cell: {elapsed_time:.2f} seconds, "
              f"Cumulative time: {total_time:.2f} seconds")

print("Finished processing all cells.")

Processed 50/373 cells. Time for last cell: 0.59 seconds, Cumulative time: 17.85 seconds
Processed 100/373 cells. Time for last cell: 0.30 seconds, Cumulative time: 35.02 seconds
Processed 150/373 cells. Time for last cell: 0.60 seconds, Cumulative time: 52.03 seconds
Processed 200/373 cells. Time for last cell: 0.23 seconds, Cumulative time: 68.42 seconds
Processed 250/373 cells. Time for last cell: 0.59 seconds, Cumulative time: 84.96 seconds
Processed 300/373 cells. Time for last cell: 0.23 seconds, Cumulative time: 101.17 seconds
Processed 350/373 cells. Time for last cell: 0.60 seconds, Cumulative time: 128.73 seconds
Finished processing all cells.


In [12]:
# break

# Train the model on population

In [13]:
core_dict = adata.obs[core_score].to_dict()
n_levels = 5

# the entire population data
H = adata.to_df().copy()
H = H.T.drop_duplicates().T
H.columns = [f"pop_{x}" for x in H.columns]
print(f"{H.shape=}")

# concat
H = H.fillna(0.0)
print(f"{H.shape=}")

# reshape 
H = H.reset_index(drop=False)
H = pd.melt(
    H, 
    id_vars='bin_name',
    var_name='read_name',
    value_name='value',
)

# filter low-order contacts and nan bins
H = H[H['value'] > 0].reset_index(drop=True)
H['order'] = H.groupby('read_name')['value'].transform('sum')
H = H[H['order'] > 1].reset_index(drop=True)
print(f"(melted) {H.shape=}")

# build the bin-hyperedge rating
def softmax(x):
    """Calculates the softmax of a numpy array."""
    e_x = np.exp(x - np.max(x))  # For numerical stability
    return e_x / e_x.sum()

H['core_score'] = H['bin_name'].map(core_dict)
H['weight'] = H.groupby('read_name')['core_score'].transform(softmax)
H['rating'] = pd.cut(H['weight'], bins=5, labels=range(1, 6))

H = H.rename(columns={
    'read_name': 'userID',
    'bin_name': 'itemID',
})

print(f"(final) {H.shape=}")

H.head()

H.shape=(176, 45833)
H.shape=(176, 45833)
(melted) H.shape=(142625, 4)
(final) H.shape=(142625, 7)


Unnamed: 0,itemID,userID,value,order,core_score,weight,rating
0,chr2:77,pop_2e567d50-8451-4643-a981-1cd91a0062c0,1,2,0.231878,0.427254,3
1,chr2:80,pop_2e567d50-8451-4643-a981-1cd91a0062c0,1,2,0.524944,0.572746,4
2,chr2:142,pop_32e53ef8-c394-4db7-bb72-e29a3281a26f,1,2,0.162095,0.425022,3
3,chr2:166,pop_32e53ef8-c394-4db7-bb72-e29a3281a26f,1,2,0.464285,0.574978,4
4,chr2:119,pop_ef7ff38e-a5fe-4692-b270-9a704ffa2981,1,2,0.485023,0.559839,4


In [14]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# K-fold trainning 

In [None]:
reader = sup.Reader(rating_scale=(1, 5))
data = sup.Dataset.load_from_df(H[["userID", "itemID", "rating"]], reader)

algorithms = {
    "baseline" : sup.BaselineOnly(),
    "SVD" : sup.SVD(),
    "SVD++" : sup.SVDpp(),
    "NMF" : sup.NMF(),
}

for label, algo in algorithms.items():
    # Run 5-fold cross-validation and print results
    print(f"\n------- {label} -------")
    res = sup.model_selection.cross_validate(
        algo, 
        data,
        measures=["RMSE", "MAE"], 
        cv=5,
        verbose=True,
    )

# GridSearch

In [None]:
# algo = sup.SVDpp

# param_grid = {
#     "n_factors" : [1, 5, 10, 25],
#     "n_epochs": [10, 20],
#     "lr_all": [0.002, 0.005],
#     "reg_all": [0.2, 0.4, 0.6, 0.8],
# }

# gs = sup.model_selection.GridSearchCV(
#     algo, 
#     param_grid, 
#     measures=["rmse", "mae"], 
#     cv=5,
# )

# gs.fit(data)

# # best RMSE score
# print(gs.best_score["rmse"])

# # combination of parameters that gave the best RMSE score
# print(gs.best_params["rmse"])

# Make predictions

In [None]:
key = 'SVD++'
algo = algorithms[key]
preds = {}
stop = 10
cell_count = -1

for cell_id, Ho in incidence_matrices.items():
    cell_count += 1
    if cell_count == stop:
        break
    
    print(cell_id)

    """COULD MAKE UP NEW HYPEREDGES """
    H_pred = []
    for read_name in Ho.columns.to_list():
        for bin_name in Ho.index.to_list():
            pred = algo.predict(uid=read_name, iid=bin_name)
            row = {
                'read_name': read_name,
                'bin_name': bin_name,
                'value': pred.est,
            }
            H_pred.append(row)

    H_pred = pd.DataFrame(H_pred)
    H_pred = pd.pivot_table(
        H_pred,
        index='bin_name',
        columns='read_name',
        values='value',
        fill_value=0.0,
    )
    H_pred = H_pred.reindex(Ho.index)

    preds[cell_id] = H_pred

print(f"done!")

In [None]:
cell_id = 'o1b71'

Ho = incidence_matrices[cell_id]
Hp = preds[cell_id]

print(f"{Ho.shape=}")
print(f"{Hp.shape=}")

# mask existing edges
H_imputed = Ho.copy()
print(f"{H_imputed.shape=}") 

# # incidence matrices
fig, axs = plt.subplots(1, 3, sharey=True)
axs[0].imshow(Ho, vmin=0, vmax=1)
axs[1].imshow(Hp, vmin=1, vmax=5)
# axs[2].imshow(H_imputed)
# plt.tight_layout()
# plt.show()

# # clique-expansion
# fig, axs = plt.subplots(1, 2, sharey=True)
# axs[0].imshow(np.dot(Ho, Ho.T))
# axs[1].imshow(np.dot(H_imputed, H_imputed.T))
# plt.tight_layout()
# plt.show()

In [None]:
incidence_matrices[cell_id].head()

In [None]:
preds[cell_id].head()

In [None]:
np.min(preds[cell_id])

In [None]:
break

In [None]:
break

In [None]:
core_dict = adata.obs[core_score].to_dict()
n_levels = 5
preds = {}

start_time = time.time()  # Start the overall timer
cumulative_time = 0  # Initialize cumulative time

n_cells = 2
cell_counter = -1

for i, (cell_id, H) in enumerate(incidence_matrices.items()):
    cell_counter += 1
    if cell_counter == n_cells:
        break
    loop_start_time = time.time()  # Start the loop timer
    print(f"Processing cell {i+1}/{len(incidence_matrices)} (cell_id: {cell_id})")

    # add the core hyperedges
    add_to = adata.to_df().copy()
    add_to = add_to.T.drop_duplicates().T
    H_imp = pd.concat([H, add_to], axis=1, ignore_index=False)
    H_imp = H_imp.fillna(0.0)
    print(f"{H_imp.shape=}")

    # reshape 
    H_imp = H_imp.reset_index(drop=False)
    H_imp = pd.melt(H_imp, id_vars='bin_name')
    
    H_imp['core_score'] = H_imp['bin_name'].map(core_dict)
    H_imp['weighted_value'] = H_imp['core_score'] * H_imp['value']
    H_imp['discretized_value'] = (H_imp['weighted_value'] * n_levels).astype(int)
    H_imp = H_imp.rename(columns={
        'variable': 'userID',
        'bin_name': 'itemID',
        'discretized_value' : 'rating',
    })

    reader = sup.Reader(rating_scale=(0, n_levels))
    data = sup.Dataset.load_from_df(H_imp[["userID", "itemID", "rating"]], reader)
    trainset = data.build_full_trainset()
    
    algo = sup.SVD()
    algo.fit(trainset)

    """COULD MAKE UP NEW HYPEREDGES """
    H_pred = []
    for read_name in H.columns.to_list():
        for bin_name in H.index.to_list():
            pred = algo.predict(uid=read_name, iid=bin_name)
            row = {
                'read_name': read_name,
                'bin_name': bin_name,
                'value': pred.est,
            }
            H_pred.append(row)

    H_pred = pd.DataFrame(H_pred)
    H_pred = pd.pivot_table(
        H_pred,
        index='bin_name',
        columns='read_name',
        values='value',
        fill_value=0.0,
    )

    preds[cell_id] = H_pred

    loop_end_time = time.time()  # End the loop timer
    cell_time = loop_end_time - loop_start_time
    cumulative_time += cell_time  # Update cumulative time
    print(f"Cell processed in {cell_time:.2f} seconds (cumulative: {cumulative_time:.2f} seconds)")
    break

end_time = time.time()  # End the overall timer
print(f"Total processing time: {end_time - start_time:.2f} seconds")

In [None]:
break

In [None]:
cell_id = 'o2b67'
fig, axs = plt.subplots(1, 2, sharey=True)

axs[0].imshow(incidence_matrices[cell_id])
axs[1].imshow(preds[cell_id])
plt.tight_layout()

In [None]:
break

In [None]:
# core_dict = adata.obs[core_score].to_dict()

# preds = {}


# for cell_id, H in incidence_matrices.items():
    
#     read_names = H.columns.to_list()
#     bin_names = H.index.to_list()
#     print(f"{H.shape=}")

#     # append the  core
#     H = pd.concat([H, H_core], axis=1, ignore_index=False)
#     H = H.fillna(0.0)
#     print(f"{H.shape=}")

#     # develop the dataset from the combined data
#     H = H.reset_index(drop=False)
#     H = pd.melt(H, id_vars='bin_name')
#     H['core_score'] = H['bin_name'].map(core_dict)
#     H['rating'] = H['core_score'] * H['value']
#     H = H.rename(columns={
#         'variable' : 'userID',
#         'bin_name' : 'itemID',
#     })

#     # build the data set
#     reader = Reader(rating_scale=(0, 1))
#     data = Dataset.load_from_df(H[["userID", "itemID", "rating"]], reader)
#     trainset = data.build_full_trainset()

#     # Build an algorithm, and train it.
#     algo = SVD()
#     algo.fit(trainset)

#     # predict only on sc hyperedges
#     H_pred = []
#     for read_name in read_names:
#         for bin_name in bin_names:
#             pred = algo.predict(uid=read_name, iid=bin_name)
#             row = {
#                 'read_name' : read_name,
#                 'bin_name' : bin_name,
#                 'value' : pred.est,
#             }
#             H_pred.append(row)

#     H_pred = pd.DataFrame(H_pred)
#     H_pred = pd.pivot_table(
#         H_pred,
#         index='bin_name',
#         columns='read_name',
#         values='value',
#         fill_value=0.0,
#     )

#     print(f"{H_pred.shape=}")

#     preds[cell_id] = H_pred



In [None]:
dir(pred)

In [None]:
break