In [29]:
import pandas as pd
import sys
import os
import numpy as np
import glob

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o1b01.GRCm39.align_table.parquet"

df = pd.read_parquet(fpath)
print(f"{df.shape=}")
df.head()

df.shape=(1197, 15)


Unnamed: 0,read_name,align_id,read_start,read_end,length_on_read,chrom,ref_start,ref_end,fragment_id,fragment_start,fragment_end,fragment_length,monomer_duplicate,is_mapped,mapping_quality
0,0158e151-11aa-4e3e-a6dd-a5569c9d3395,837,0,20,20,,-1,,,,,,False,False,0
1,0158e151-11aa-4e3e-a6dd-a5569c9d3395,838,20,46,26,,-1,,,,,,False,False,0
2,0158e151-11aa-4e3e-a6dd-a5569c9d3395,839,46,94,48,9.0,63500621,63500669.0,6364455.0,63500627.0,63500675.0,48.0,False,True,1
3,0158e151-11aa-4e3e-a6dd-a5569c9d3395,840,94,98,4,,-1,,,,,,False,False,0
4,0158e151-11aa-4e3e-a6dd-a5569c9d3395,841,98,204,106,1.0,77598773,77598878.0,356922.0,77598775.0,77598880.0,105.0,False,True,60


In [33]:
def get_mapped_read_counts(df):
    """Calculates the count of reads bucketed by the number of mapped monomers.

    Args:
        df (pandas.DataFrame): The input DataFrame containing 'read_name' and 'is_mapped' columns.

    Returns:
        dict: A dictionary where keys are like 'reads_with_X_mapped' (X = 0, 1, 2, 3+)
              and values are the corresponding counts.
    """

    # Group by read name, sum is_mapped
    monomers_mapped = df.groupby('read_name')['is_mapped'].sum()

    # Create bins and labels (same as your code)
    bins = [-np.inf, 0, 1, 2, np.inf]
    labels = ['0', '1', '2', '3+']

    # Bucket the values, count occurrences, and reset index
    monomers_mapped = pd.cut(monomers_mapped, bins=bins, labels=labels, right=False)
    monomers_mapped = monomers_mapped.value_counts().reset_index()

    # Format the 'index' column to match your desired output
    monomers_mapped['is_mapped'] = "reads_with_" + monomers_mapped['is_mapped'].astype(str) + "_mapped"

    # Convert to dictionary (same as your code)
    return dict(zip(monomers_mapped['is_mapped'].values, monomers_mapped['count'].values))


def get_summary_row(df):
    """A function to get key summary mertics from an alignment table """
    results = {}
    results['total_reads'] = df['read_name'].nunique()
    monomers_mapped = get_mapped_read_counts(df)
    
    results = results | monomers_mapped
    
    return results
    
    
get_summary_row(df)

{'total_reads': 604302,
 'reads_with_3+_mapped': 532871,
 'reads_with_2_mapped': 65438,
 'reads_with_1_mapped': 5993,
 'reads_with_0_mapped': 0}

In [30]:
dpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/"
file_list = glob.glob(f"{dpath}*")
file_list[:10]

['/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o1b18.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o2b03.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o4b04.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o4b79.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o3b86.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o1b15.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o1b89.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o1b26.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o4b07.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/single_cell/align_t

In [36]:
res = []

for file_path in file_list[:50]:
    basename = os.path.basename(file_path)
    
    df = pd.read_parquet(file_path)
    row = get_summary_row(df)
    row['basename'] = basename
    res.append(row)
    
res = pd.DataFrame(res)
res.head()


Unnamed: 0,total_reads,reads_with_3+_mapped,reads_with_2_mapped,reads_with_1_mapped,reads_with_0_mapped,basename
0,604302,532871,65438,5993,0,o1b18.GRCm39.align_table.parquet
1,1855,57,2,1796,0,o2b03.GRCm39.align_table.parquet
2,22499,12862,3033,6604,0,o4b04.GRCm39.align_table.parquet
3,7596,181,20,7395,0,o4b79.GRCm39.align_table.parquet
4,21609,20062,313,1234,0,o3b86.GRCm39.align_table.parquet
