In [1]:
import pandas as pd
import pyranges as pr

In [2]:
""" LOAD Pore-C """
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/pore_c/population_mESC.read_level.parquet"

columns = [
    'read_name', 
    'chrom',
    'ref_start',
    'ref_end',
    'mapping_quality',
]

df = pd.read_parquet(fpath, columns=columns)
print(f"{df['read_name'].nunique()=}")
df['monomer_id'] = df.groupby('read_name').cumcount() + 1
df = df.rename(columns={
    'chrom' : 'Chromosome',
    'ref_start' : 'Start',
    'ref_end' : 'End',
})
print(f"{df.shape=}")
df.head()

df['read_name'].nunique()=2756467
df.shape=(14877807, 6)


Unnamed: 0,read_name,Chromosome,Start,End,mapping_quality,monomer_id
0,00000b61-7794-4b29-9f89-2b74e7bbce3e,15,40952485,40952851,60,1
1,00000b61-7794-4b29-9f89-2b74e7bbce3e,1,153345218,153345369,60,2
2,00000b61-7794-4b29-9f89-2b74e7bbce3e,16,13896976,13897125,36,3
3,00000b61-7794-4b29-9f89-2b74e7bbce3e,4,99871359,99871640,60,4
4,000047b3-2703-4687-9978-37722f5619da,4,31146588,31147172,60,1


In [20]:
""" LOAD genes  """
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"
gdf = pd.read_parquet(fpath)
# gdf = gdf[gdf['gene_biotype'] == 'protein_coding']
gdf = gdf[gdf['Chromosome'].isin(df['Chromosome'].unique())].reset_index(drop=True)
print(f"{gdf.shape=}")
gdf.head()

gdf.shape=(55042, 10)


Unnamed: 0,gene_id,gene_name,gene_source,gene_biotype,Chromosome,Start,End,length,midpoint,is_tf
0,ENSMUSG00000104478,Gm38212,havana,TEC,1,108344806,108347562,2756,108346184,False
1,ENSMUSG00000104385,Gm7449,havana,processed_pseudogene,1,6980783,6981446,663,6981114,False
2,ENSMUSG00000102135,Gm37108,havana,processed_pseudogene,1,6986782,6993812,7030,6990297,False
3,ENSMUSG00000103282,Gm37275,havana,processed_pseudogene,1,6999982,7000012,30,6999997,False
4,ENSMUSG00000101097,Gm6679,havana,processed_pseudogene,1,108697864,108699733,1869,108698798,False


In [28]:
# Use PyRanges for efficient interval joining
gdf_pr = pr.PyRanges(gdf)
df_pr = pr.PyRanges(df)

# Join dataframes, keeping all original intervals
pdf = df_pr.join(
    gdf_pr,
    strandedness=None,
    how='left',
    report_overlap=True,
).df.rename(columns={
    'Chromosome': 'chrom',
    'Start': 'ref_start',
    'End': 'ref_end',
    'Start_b': 'gene_start',
    'End_b': 'gene_end',
    'length': 'gene_length',
    'Overlap': 'gene_overlap',
})

# Select the best overlap for each interval
pdf = pdf.sort_values(by='gene_overlap', ascending=False)
pdf = pdf.drop_duplicates(subset=['read_name', 'ref_start', 'ref_end'], keep='first')
print(f"{pdf.shape=}")
pdf.head()

pdf.shape=(14810874, 16)


Unnamed: 0,read_name,chrom,ref_start,ref_end,mapping_quality,monomer_id,gene_id,gene_name,gene_source,gene_biotype,gene_start,gene_end,gene_length,midpoint,is_tf,gene_overlap
8342354,3891ee6d-53d1-4ee0-ba2f-3d22291d4493,9,121048825,121057172,60,1,ENSMUSG00000040936,Ulk4,ensembl_havana,protein_coding,120784416,121106263,321847,120945339,False,8347
14789315,66953ddf-e76d-4cdf-aaf8-be028a2d7b04,19,26583825,26592064,60,21,ENSMUSG00000024921,Smarca2,ensembl_havana,protein_coding,26582449,26755722,173273,26669085,False,8239
4110544,ad5b2240-893f-4ed0-a157-c2be66d8d754,4,127067225,127074760,60,1,ENSMUSG00000042388,Dlgap3,ensembl_havana,protein_coding,127062996,127130815,67819,127096905,False,7535
10733063,3f354c45-5e48-4f6d-8c7e-05369432b344,12,8680599,8685932,60,4,ENSMUSG00000120669,Gm56531,havana_tagene,lncRNA,8649709,8691034,41325,8670371,False,5333
13889148,d4626feb-16a2-4aac-8145-53e89b60bf7c,17,66243365,66248641,60,1,ENSMUSG00000024098,Twsg1,ensembl_havana,protein_coding,66228966,66258221,29255,66243593,False,5276


In [25]:
pdf['gene_biotype'].value_counts(normalize=True)

gene_biotype
-1                                    5.438883e-01
protein_coding                        3.812595e-01
lncRNA                                6.441490e-02
unprocessed_pseudogene                3.723616e-03
processed_pseudogene                  2.985037e-03
TEC                                   1.622457e-03
transcribed_unprocessed_pseudogene    8.482281e-04
unitary_pseudogene                    2.673711e-04
transcribed_unitary_pseudogene        2.294260e-04
snRNA                                 1.149156e-04
rRNA                                  1.078262e-04
transcribed_processed_pseudogene      1.048554e-04
snoRNA                                9.202698e-05
miRNA                                 8.324964e-05
IG_V_gene                             7.183911e-05
misc_RNA                              4.672243e-05
IG_V_pseudogene                       4.159106e-05
TR_V_gene                             2.856010e-05
IG_C_gene                             2.113312e-05
TR_C_gene         

In [27]:
print(3.812595e-01)

0.3812595


In [19]:
print(6.117925e-01)

0.6117925


In [10]:
pdf['read_name'].nunique()

2756467

In [11]:
df['read_name'].nunique()

2756467