In [1]:
import pandas as pd
import os
import sys
import numpy as np
import glob 

# Mapping summary

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/monomer_mapping/mapping_summary.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df.head()

df.shape=(380, 6)


Unnamed: 0,total_reads,reads_with_1_mapped,reads_with_3+_mapped,reads_with_2_mapped,reads_with_0_mapped,basename,sample,cell
0,785,768,15,2,0,o4b03.GRCm39.align_table.parquet,o4,b03
1,3314,2989,163,162,0,o4b02.GRCm39.align_table.parquet,o4,b02
2,1449,1350,54,45,0,o4b08.GRCm39.align_table.parquet,o4,b08
3,8295,8237,52,6,0,o4b06.GRCm39.align_table.parquet,o4,b06
4,16284,16151,124,9,0,o4b01.GRCm39.align_table.parquet,o4,b01


In [3]:
gx = df.groupby('sample').agg(
    total_reads = ('total_reads', 'sum'),
    unmapped = ('reads_with_0_mapped', 'sum'),
    singletons = ('reads_with_1_mapped', 'sum'),
    pairwise = ('reads_with_2_mapped', 'sum'),
    higher_order = ('reads_with_3+_mapped', 'sum'),
).reset_index()

totals = dict(zip(gx['sample'].values, gx['total_reads'].values))

print(gx.to_latex(index=False))

\begin{tabular}{lrrrrr}
\toprule
sample & total_reads & unmapped & singletons & pairwise & higher_order \\
\midrule
o1 & 23170059 & 0 & 300304 & 2177100 & 20692655 \\
o2 & 639888 & 0 & 200137 & 7928 & 431823 \\
o3 & 8002384 & 0 & 813247 & 241916 & 6947221 \\
o4 & 6039222 & 0 & 276841 & 892921 & 4869460 \\
\bottomrule
\end{tabular}



In [4]:
gx = df.groupby('sample').agg(
    total_reads = ('total_reads', 'sum'),
    unmapped = ('reads_with_0_mapped', 'sum'),
    singletons = ('reads_with_1_mapped', 'sum'),
    pairwise = ('reads_with_2_mapped', 'sum'),
    higher_order = ('reads_with_3+_mapped', 'sum'),
).reset_index()

gx['singletons'] = (gx['singletons'] / gx['total_reads']) * 100
gx['pairwise'] = (gx['pairwise'] / gx['total_reads']) * 100
gx['higher_order'] = (gx['higher_order'] / gx['total_reads']) * 100

columns = ['sample', 'singletons', 'pairwise', 'higher_order']
print(gx[columns].round(2).astype(str).to_latex(index=False))

\begin{tabular}{llll}
\toprule
sample & singletons & pairwise & higher_order \\
\midrule
o1 & 1.3 & 9.4 & 89.31 \\
o2 & 31.28 & 1.24 & 67.48 \\
o3 & 10.16 & 3.02 & 86.81 \\
o4 & 4.58 & 14.79 & 80.63 \\
\bottomrule
\end{tabular}



In [5]:
def get_mapped_read_counts(df):
    """Calculates the count of reads bucketed by the number of mapped monomers.

    Args:
        df (pandas.DataFrame): The input DataFrame containing 'read_name' and 'is_mapped' columns.

    Returns:
        dict: A dictionary where keys are like 'reads_with_X_mapped' (X = 0, 1, 2, 3+)
              and values are the corresponding counts.
    """

    # Group by read name, sum is_mapped
    monomers_mapped = df.groupby('read_name')['is_mapped'].sum()

    # Create bins and labels (same as your code)
    bins = [-np.inf, 0, 1, 2, np.inf]
    labels = ['0', '1', '2', '3+']

    # Bucket the values, count occurrences, and reset index
    monomers_mapped = pd.cut(monomers_mapped, bins=bins, labels=labels, right=False)
    monomers_mapped = monomers_mapped.value_counts().reset_index()

    # Format the 'index' column to match your desired output
    monomers_mapped['is_mapped'] = "reads_with_" + monomers_mapped['is_mapped'].astype(str) + "_mapped"

    # Convert to dictionary (same as your code)
    return dict(zip(monomers_mapped['is_mapped'].values, monomers_mapped['count'].values))


def get_summary_row(df):
    """A function to get key summary mertics from an alignment table """
    results = {}
    results['total_reads'] = df['read_name'].nunique()
    monomers_mapped = get_mapped_read_counts(df)
    
    results = results | monomers_mapped
    
    return results

In [15]:
dpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/"
file_list = glob.glob(f"{dpath}*")

mapq = 55

res = []

for file_path in file_list:
    basename = os.path.basename(file_path)

    df = pd.read_parquet(file_path)
    df = df[df['mapping_quality'] > mapq]
    row = get_summary_row(df)
    row['basename'] = basename
    res.append(row)

res = pd.DataFrame(res)
res['sample'] = res['basename'].str[:2]
res['cell'] = res['basename'].str[2:5]
res.head()

Unnamed: 0,total_reads,reads_with_3+_mapped,reads_with_2_mapped,reads_with_1_mapped,reads_with_0_mapped,basename,sample,cell
0,557735,427181,130554,0,0,o1b18.GRCm39.align_table.parquet,o1,b18
1,57,51,6,0,0,o2b03.GRCm39.align_table.parquet,o2,b03
2,14565,10185,4380,0,0,o4b04.GRCm39.align_table.parquet,o4,b04
3,190,152,38,0,0,o4b79.GRCm39.align_table.parquet,o4,b79
4,19560,17332,2228,0,0,o3b86.GRCm39.align_table.parquet,o3,b86


In [16]:
gx = res.groupby('sample').agg(
    passing = ('total_reads', 'sum'),
    singletons = ('reads_with_1_mapped', 'sum'),
    pairwise = ('reads_with_2_mapped', 'sum'),
    higher_order = ('reads_with_3+_mapped', 'sum'),
).reset_index()

gx['total'] = gx['sample'].map(totals)

gx['passing'] = (gx['passing'] / gx['total']) * 100
gx['singletons'] = (gx['singletons'] / gx['total']) * 100
gx['pairwise'] = (gx['pairwise'] / gx['total']) * 100
gx['higher_order'] = (gx['higher_order'] / gx['total']) * 100

columns = [
    'sample',
    'total',
    'passing',
    'singletons',
    'pairwise',
    'higher_order',
]

print(gx[columns].round(2).astype(str).to_latex(index=False))

# print(gx.to_latex(index=False))

\begin{tabular}{llllll}
\toprule
sample & total & passing & singletons & pairwise & higher_order \\
\midrule
o1 & 23170059 & 92.07 & 0.0 & 18.69 & 73.38 \\
o2 & 639888 & 66.95 & 0.0 & 5.8 & 61.15 \\
o3 & 8002384 & 86.67 & 0.0 & 7.96 & 78.71 \\
o4 & 6039222 & 88.41 & 0.0 & 24.25 & 64.16 \\
\bottomrule
\end{tabular}

