In [16]:
import pandas as pd
from sklearn import metrics
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import csgraph
from scipy.sparse.csgraph import connected_components
from scipy.sparse.linalg import eigsh
import os
import glob
import matplotlib.colors as mcolors 
import sys
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
import cooler

tool_path = "/home/cstansbu/miniconda3/lib/python3.9/site-packages/"
sys.path.append(tool_path)
import graph_tool.all as gt
print(f"{gt.__version__=}")

# locals 
import reconstruction as recon

sys.path.append("../")
import utils as ut
import plotting as plt2

gt.__version__='2.58 (commit c513b9b6, )'


# Single-cell

In [2]:
# load the reference
fpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/data/nagano2017/schic2_mm9/seq/redb/GATC.fends"
ref = pd.read_csv(fpath, sep='\t')
ref.head()

  ref = pd.read_csv(fpath, sep='\t')


Unnamed: 0,fend,chr,coord
0,0,1,2
1,1,1,3000535
2,2,1,3000536
3,3,1,3000800
4,4,1,3000801


In [38]:
ref['chr'].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       '19', 'X', 'Y'], dtype=object)

In [37]:
def sample_file_paths(directory, num_samples):
    """Randomly samples file paths from subdirectories within a given directory.

    Args:
        directory: The root directory to search within.
        num_samples: The number of file paths to sample.

    Returns:
        list: A list of randomly sampled full file paths.
    """

    all_files = []
    for subdir in os.listdir(directory):
        full_subdir = os.path.join(directory, subdir)
        if os.path.isdir(full_subdir):
            all_files.extend(glob.glob(os.path.join(full_subdir, "*")))  # Find files in each subdir

    if num_samples > len(all_files):
        raise ValueError("Number of samples requested exceeds available files.")

    sampled_paths = random.sample(all_files, num_samples)
    return sampled_paths


def process_nagano(df, binarize=True, normalize=False):
    """Processes the Nagano dataset for analysis.

    Args:
        df: The input DataFrame.
        binarize: If True, binarize the contact matrix after processing (default: True).
        normalize:  If True, apply normalization to the contact matrix (default: False).

    Returns:
        pandas.DataFrame: The processed DataFrame.
    """

    # Filter for symmetric bin pairs
    df = df[df['bin_1'].isin(df['bin_2'])]
    df = df[df['bin_2'].isin(df['bin_1'])]

    # Create a pivot table
    df = pd.pivot_table(
        df, index='bin_1', 
        columns='bin_2',
        values='count',
        aggfunc='sum',
        fill_value=0
    )

    # Symmetrize the pivot table
    df = ut.symmetrize(df)

    # Optional Normalization
    if normalize:
        df = ut.normalize_oe(ut.normalize_kr(df).todense())

    # Optional Binarization
    if binarize:
        df = np.where(df > 0, 1, 0)  

    return df


def join_fend_info(ref, cell_matrix):
    """Joins fend information onto 'fend1' and 'fend2' columns of the DataFrames.

    Args:
        ref: The DataFrame containing fend, chr, coord, bin information.
        cell_matrix: The DataFrame containing fend1, fend2, and count columns.

    Returns:
        pandas.DataFrame: A new DataFrame with the joined fend information.
    """

    # Merge fend info onto cell_matrix (fend1)
    cell_matrix = cell_matrix.merge(ref, left_on='fend1', right_on='fend', how='left')
    cell_matrix = cell_matrix.rename(columns={'chr': 'fend1_chr', 'coord': 'fend1_coord', 'bin': 'fend1_bin'})

    # Merge fend info onto cell_matrix (fend2)
    cell_matrix = cell_matrix.merge(ref, left_on='fend2', right_on='fend', how='left') 
    cell_matrix = cell_matrix.rename(columns={'chr': 'fend2_chr', 'coord': 'fend2_coord', 'bin': 'fend2_bin'})

    # Keep only necessary columns
    result_df = cell_matrix[['fend1', 'fend2', 'count', 
                             'fend1_chr', 'fend1_coord', 'fend1_bin',
                             'fend2_chr', 'fend2_coord', 'fend2_bin']]

    return result_df


# sample files
dpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/data/nagano2017/matrices/"
sample_size = 10
sample_files = sample_file_paths(dpath, sample_size)
sample_files = [f"{x}/adj" for x in sample_files]

# contact specs
resolution = 1000000


nagano = []

for fpath in sample_files:
    
    cell_matrix = pd.read_csv(fpath, sep='\t')
    cell_matrix = join_fend_info(ref, cell_matrix)
    print(cell_matrix.head())
    break
    # tmp['cell_id'] = fpath.split("/")[-2]
    # nagano.append(tmp)
    # break

# nagano = pd.concat(nagano)
# print(nagano['cell_id'].nunique())
# nagano.head()

      fend1     fend2  count fend1_chr  fend1_coord  fend1_bin fend2_chr  \
0   9977333   9977334      2        15     37848611         38        15   
1   1848047   1951954      1         3      4107806          5         3   
2  10472142  10479881      1        16     36641382         37        16   
3   9822238   9817696      1        15      6164907          7        15   
4   2980061   2709566      1         4     73352160         74         4   

   fend2_coord  fend2_bin  
0     37848612         38  
1     23460493         24  
2     38300806         39  
3      5316370          6  
4     19777031         20  


In [32]:
tmp.head()

Unnamed: 0,fend1,fend2,count
0,3926948,3927468,2
1,8614164,8614159,1
2,6654745,6654746,2
3,10016405,10016677,1
4,3133430,3133429,1


In [5]:
counts = nagano['cell_id'].value_counts()
counts

1CDX1.253    23116
1CDX1.342    20509
1CDX1.486    18332
1CDX1.484    17976
1CDX1.177    16951
1CDX1.82     13803
1CDX1.125    12860
1CDX1.32      6893
1CDX1.121     3386
1CDX1.288      111
Name: cell_id, dtype: int64

In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [7]:
def process_nagano(df, binarize=True, normalize=False):
    """Processes the Nagano dataset for analysis.

    Args:
        df: The input DataFrame.
        binarize: If True, binarize the contact matrix after processing (default: True).
        normalize:  If True, apply normalization to the contact matrix (default: False).

    Returns:
        pandas.DataFrame: The processed DataFrame.
    """

    # Filter for symmetric bin pairs
    df = df[df['bin_1'].isin(df['bin_2'])]
    df = df[df['bin_2'].isin(df['bin_1'])]

    # Create a pivot table
    df = pd.pivot_table(
        df, index='bin_1', 
        columns='bin_2',
        values='count',
        aggfunc='sum',
        fill_value=0
    )

    # Symmetrize the pivot table
    df = ut.symmetrize(df)

    # Optional Normalization
    if normalize:
        df = ut.normalize_oe(ut.normalize_kr(df).todense())

    # Optional Binarization
    if binarize:
        df = np.where(df > 0, 1, 0)  

    return df

In [13]:
outpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/hypergraphs/single_cell_hic/"

for cell_id, group in nagano.groupby('cell_id'):
    mat = process_nagano(group)
    
    # require the full chromosome
    if not mat.shape == (193, 193):
        continue
                
    print(cell_id, mat.shape)


1CDX1.125 (193, 193)
1CDX1.177 (193, 193)
1CDX1.253 (193, 193)
1CDX1.32 (193, 193)
1CDX1.342 (193, 193)
1CDX1.484 (193, 193)
1CDX1.486 (193, 193)
1CDX1.82 (193, 193)


In [None]:
reload(ut)
Asc = nagano[nagano['cell_id'] == counts.index[0]]
print(Asc['cell_id'].unique())

# force symmetric bin pairs
Asc = Asc[Asc['bin_1'].isin(Asc['bin_2'])]
Asc = Asc[Asc['bin_2'].isin(Asc['bin_1'])]


print(f"{Asc.shape=}")
Asc = pd.pivot_table(Asc, 
                   index='bin_1', 
                   columns='bin_2',
                   values='count', 
                   aggfunc='sum',
                   fill_value=0,
                  )
Asc = ut.symmetrize(Asc)
Asc_plot1 = Asc.copy()
print(f"{Asc.shape=}")
# Asc = ut.normalize_oe(ut.normalize_kr(Asc).todense())
# Asc_plot1 = Asc.copy()

threshold = 0
Asc = np.where(Asc > threshold, 1, 0)
num_components, _ = connected_components(Asc, directed=False)
print(f"Input network thresholded at {threshold=} has {num_components} connected component(s)")

# visualize
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 6, 3
fig, axs = plt.subplots(1, 2, sharey=True)

axs[0].imshow(np.log1p(Asc_plot1))
axs[1].imshow(Asc)

axs[0].set_title('Unnormalized')
axs[1].set_title('Binary')

axs[0].set_xticks([])
axs[1].set_xticks([])

plt.tight_layout()


In [None]:
gsc = recon.create_graph_tools_from_adjacency(Asc)
hyperedges_sc = recon.learn_hyperedges_mcmc(gsc, niter=100)
print(f"{len(hyperedges_sc)=}")

reload(ut)
Isc = ut.nested_list_to_incidence_matrix(hyperedges_sc)
Isc = pd.DataFrame(Isc)
print(f"{Isc.shape=}")

node_params = {
    's' : 5,
    'ec' : 'k',
    'lw' : 1,
    'zorder' : 2,
}

line_params = {
    'lw' : 0.5,
    'alpha' : 0.5,
    'zorder' : 1,
    'ls' : '--',
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 10, 5
plt2.plot_incidence(ut.sort_by_lowest_index(Isc),
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3, 2
plt2.plot_incidence_order(Isc)

In [None]:
# outpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/hypergraphs/sc_reconstruction.csv"
# Isc.to_csv(outpath)

In [None]:
break

In [None]:
break