In [1]:
import pandas as pd
import numpy as np
import os 
import pyranges as pr
from scipy.sparse import csr_matrix

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/reference_db.parquet"

gdf = pd.read_parquet(fpath)
print(f"{gdf.shape=}")
gdf = gdf.fillna(0.0)
gdf['fragment_id'] = (gdf['fragment_id'] + 1).astype(int)
gdf.head()

gdf.shape=(12901276, 18)


Unnamed: 0,Chromosome,Start,End,fragment_id,fragment_length,ATACSeq_1,ATACSeq_2,ATACSeq_3,CTCF,H3K27ac,H3K27me3,RNA_1,RNA_2,RNA_3,RNA_4,RNA_5,RNA_6,PolII
0,1,0.0,3050051.0,1,3050051.0,0.057623,0.017762,0.018069,0.009718,0.024774,0.015503,0.239711,0.0,0.13278,0.271721,0.0,0.104429,0.008725
1,1,3050051.0,3050056.0,2,5.0,0.19399,0.0,0.3143,0.45092,0.0,0.34725,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3050056.0,3050844.0,3,788.0,0.242762,0.0,0.128138,0.127551,0.037367,0.282251,0.0,0.0,0.0,0.0,0.0,0.0,0.001214
3,1,3050844.0,3050907.0,4,63.0,0.597246,0.0,0.4049,0.220927,0.0,0.267332,0.0,0.0,0.0,0.0,0.0,0.0,0.19136
4,1,3050907.0,3050947.0,5,40.0,0.6934,0.0,0.315609,0.165337,0.0,0.169318,0.0,0.0,0.0,0.0,0.0,0.0,0.19136


In [3]:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/pore_c/population_mESC.read_level.parquet"

df = pd.read_parquet(fpath)
print(f"{df.shape=}")
df = df.rename(columns={
    'chrom' : 'Chromosome',
    'ref_start' : 'Start',
    'ref_end' : 'End',
})

df['align_id'] = df.groupby('read_name').cumcount() + 1

df.head()

df.shape=(14877807, 12)


Unnamed: 0,read_name,read_start,read_end,length_on_read,Chromosome,Start,End,mapping_quality,basename,local_position,global_position,order,align_id
0,00000b61-7794-4b29-9f89-2b74e7bbce3e,8,400,392,15,40952485,40952851,60,batch02,40952668,2053916568,4,1
1,00000b61-7794-4b29-9f89-2b74e7bbce3e,400,553,153,1,153345218,153345369,60,batch02,153345293,153345293,4,2
2,00000b61-7794-4b29-9f89-2b74e7bbce3e,611,759,148,16,13896976,13897125,36,batch02,13897050,2130934901,4,3
3,00000b61-7794-4b29-9f89-2b74e7bbce3e,810,1110,300,4,99871359,99871640,60,batch02,99871499,636526111,4,4
4,000047b3-2703-4687-9978-37722f5619da,0,1025,1025,4,31146588,31147172,60,batch02,31146880,567801492,4,1


In [4]:
# Convert to PyRanges objects
pr_df = pr.PyRanges(df)
pr_gdf = pr.PyRanges(gdf[['Chromosome', 'Start', 'End', 'fragment_id']])

# Perform the join. 
merged_pr = pr_df.join(
    pr_gdf, 
    how="left",
    suffix="_frag",
    report_overlap=True,
    preserve_order=True,
) # left join keeps all rows from df

# Convert back to pandas DataFrame
merged_df = merged_pr.df
print(f"{df.shape=}")
print(f"{gdf.shape=}")
print(f"{merged_df.shape=}")

# Select the best overlap for each interval
merged_df = merged_df.sort_values(by=['read_name', 'align_id', 'Overlap'], ascending=[True, True, False])
merged_df = merged_df.drop_duplicates(subset=['read_name', 'align_id',], keep='first')
print(f"{merged_df.shape=}")

# drop singletons
merged_df['order'] = merged_df.groupby('read_name')['fragment_id'].transform('nunique')
merged_df = merged_df[merged_df['order'] > 1]

print(f"{merged_df.shape=}")
merged_df.head()

df.shape=(14877807, 13)
gdf.shape=(12901276, 18)
merged_df.shape=(27749704, 17)
merged_df.shape=(14877807, 17)
merged_df.shape=(14787541, 17)


Unnamed: 0,read_name,read_start,read_end,length_on_read,Chromosome,Start,End,mapping_quality,basename,local_position,global_position,order,align_id,Start_frag,End_frag,fragment_id,Overlap
17251560,00000202-49cf-47b2-83bf-5eb3f6d98373,0,460,460,10,79553913,79554361,60,batch04,79554137,1473898218,3,1,79553685,79554367,7040121,448
17251562,00000202-49cf-47b2-83bf-5eb3f6d98373,460,687,227,10,79553679,79553895,35,batch04,79553787,1473897868,3,2,79553685,79554367,7040121,210
17251564,00000202-49cf-47b2-83bf-5eb3f6d98373,687,853,166,10,79553496,79553677,26,batch04,79553586,1473897667,3,3,79553502,79553685,7040120,175
17251565,00000202-49cf-47b2-83bf-5eb3f6d98373,853,1490,637,10,79552809,79553496,60,batch04,79553152,1473897233,3,4,79552815,79553502,7040119,681
22838609,00000b61-7794-4b29-9f89-2b74e7bbce3e,8,400,392,15,40952485,40952851,60,batch02,40952668,2053916568,4,1,40952505,40952862,9795762,346


In [5]:
def make_incidence(df):
    df = df.sort_values(by='fragment_id')
    df['value'] = 1
    df['r_index'] = df['read_name'].astype('category').cat.codes
    df['f_index'] = df['fragment_id'].astype('category').cat.codes
    
    data = df['value'].tolist()
    row = df['f_index'].values
    col = df['r_index'].values
    
    n = df['f_index'].nunique()
    m = df['r_index'].nunique()
    
    index = df['fragment_id'].unique()
    columns = df['read_name'].unique()
    
    X = csr_matrix((data, (row, col)), shape=(n, m))
    X = csr_matrix((X > 0).astype(int))
    H = pd.DataFrame.sparse.from_spmatrix(
        X, index=index, columns=columns,
    )
    del X
    del data
    del row
    del col
    del index
    del columns
    return H


H = make_incidence(merged_df)
print(f"{H.shape=}")
print(f"{type(H)=}")
print(f"DataFrame size: {H.memory_usage().sum() / 1024**2:.2f} MB")

outpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/restriction/incidence.pickle"
H.to_pickle(outpath)
H.head()

H.shape=(6306765, 2714221)
type(H)=<class 'pandas.core.frame.DataFrame'>
DataFrame size: 206.38 MB


Unnamed: 0,52aab340-35f0-4a9a-b718-b3381a32c138,7bc15260-783f-43bc-9ad5-b5d5a94ffcb0,f39396b9-248f-45e5-a79c-19d6713eb5be,1ed5c9f0-d723-4d72-b415-a4079e734e94,3e85c22a-fafc-4b99-be8b-30519965d58f,24141009-2447-4d29-aa41-ea307d3f3e6d,c6ec36d1-8076-4273-b50b-ae1e9d0041cf,e5dc02d8-a311-4756-b9ab-91ecce23129b,caa721fc-e369-4245-8e48-f6acab03bb27,1b39bb35-63cd-4ea8-a9dc-d3616c323ac9,...,1fb0f26c-bc67-4c4e-a1d6-6b1b0d0794cf,0a281381-40e4-4fa4-8885-820a02542e63,d41a6081-ffd4-4a3e-b738-53b6a70c9840,27a935e1-8b0d-4c7e-9ef9-95500b0172eb,73c78dcf-18da-4132-ac30-59c2457065ea,af8dc02b-bfd5-4be9-b7ad-c7c53796c025,1487ed78-5070-411b-bd43-b13313de063f,58e75c47-ff7a-4718-bac8-f74347e08fb0,670c3edc-eed7-4942-a61d-2d4956309667,c4f13e9d-31cb-4a07-beab-00d4ba393c33
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)