In [1]:
import pandas as pd
import numpy as np
import os
import glob

# Raw contacts

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/pairs/direct.basic.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df['cell_id'] = df['basename'].str[0:5]

totals = dict(zip(df['cell_id'].values, df['n_contacts'].values))
df.head()

df.shape=(380, 4)


Unnamed: 0,basename,n_contacts,n_cis,n_trans,sample,cell,cell_id
0,o4b03.GRCm39.direct.pairs,829,802,27,o4,b03,o4b03
1,o4b02.GRCm39.direct.pairs,3753,3270,483,o4,b02,o4b02
2,o4b08.GRCm39.direct.pairs,1563,1434,129,o4,b08,o4b08
3,o4b06.GRCm39.direct.pairs,8542,8394,148,o4,b06,o4b06
4,o4b01.GRCm39.direct.pairs,16692,16409,283,o4,b01,o4b01


In [3]:
gx = df.groupby('sample').agg(
    total_contacts = ('n_contacts', 'sum'),
    n_cis = ('n_cis', 'sum'),
    n_trans = ('n_trans', 'sum'),
).reset_index()

print(gx.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
sample & total_contacts & n_cis & n_trans \\
\midrule
o1 & 141026891 & 44850714 & 96176177 \\
o2 & 3599371 & 1297718 & 2301653 \\
o3 & 66293755 & 27482279 & 38811476 \\
o4 & 19874238 & 7723469 & 12150769 \\
\bottomrule
\end{tabular}



In [4]:
# break

# Filtering Report

In [5]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/single_cell/direct.filter_report.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df['cell_id'] = df['basename'].str[0:5]
df['total_theoretical_contacts'] = df['cell_id'].map(totals)

df.tail()

df.shape=(380, 7)


Unnamed: 0,unmapped,adjacent,close_contact,duplicate,promiscuous,isolated,basename,sample,cell,cell_id,total_theoretical_contacts
375,133329,18693,27410,69307,2729,0,o3b25.GRCm39.direct.csv,o3,b25,o3b25,253797
376,176644,22331,33103,83748,6498,0,o1b14.GRCm39.direct.csv,o1,b14,o1b14,328013
377,1808636,125528,231933,721611,32782,0,o1b96.GRCm39.direct.csv,o1,b96,o1b96,2936739
378,1703425,156163,278092,736473,17883,0,o1b47.GRCm39.direct.csv,o1,b47,o1b47,2901662
379,1852544,314738,419803,852728,42212,0,o1b25.GRCm39.direct.csv,o1,b25,o1b25,3496816


In [6]:
gx = df.groupby('sample').agg(
    total_theoretical_contacts = ('total_theoretical_contacts', 'sum'),
    unmapped = ('unmapped', 'sum'),
    adjacent = ('adjacent', 'sum'),
    close_contact = ('close_contact', 'sum'),
    duplicate = ('duplicate', 'sum'),
    promiscuous = ('promiscuous', 'sum'),
    isolated = ('isolated', 'sum'),
).reset_index()

gx['total_contacts'] = gx['total_theoretical_contacts'] - gx['unmapped']
gx['passing'] = gx['total_contacts'] - gx[['adjacent','close_contact','duplicate','promiscuous', 'isolated']].sum(axis=1)

columns = [
    'sample',
    'total_contacts',
    'passing',
    'adjacent',
    'close_contact',
    'duplicate',
    'promiscuous',
    'isolated'
]

print(gx[columns].astype(str).to_latex(index=False))

\begin{tabular}{llllllll}
\toprule
sample & total_contacts & passing & adjacent & close_contact & duplicate & promiscuous & isolated \\
\midrule
o1 & 57920500 & 751788 & 7561659 & 12487851 & 35814599 & 1304603 & 0 \\
o2 & 1512758 & 138557 & 223164 & 336376 & 769559 & 45102 & 0 \\
o3 & 32254285 & 553936 & 5839638 & 7686003 & 17191949 & 982759 & 0 \\
o4 & 9149108 & 579356 & 1675444 & 2778829 & 3832598 & 282881 & 0 \\
\bottomrule
\end{tabular}



# Filtered Contacts

In [7]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/single_cell/pairs/direct.basic.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df['cell_id'] = df['basename'].str[0:5]

df.head()

df.shape=(380, 4)


Unnamed: 0,basename,n_contacts,n_cis,n_trans,sample,cell,cell_id
0,o4b03.GRCm39.filtered.pairs,8,0,8,o4,b03,o4b03
1,o4b02.GRCm39.filtered.pairs,151,61,90,o4,b02,o4b02
2,o4b08.GRCm39.filtered.pairs,36,13,23,o4,b08,o4b08
3,o4b06.GRCm39.filtered.pairs,60,15,45,o4,b06,o4b06
4,o4b01.GRCm39.filtered.pairs,118,50,68,o4,b01,o4b01


In [8]:
gx = df.groupby('sample').agg(
    total_contacts = ('n_contacts', 'sum'),
    n_cis = ('n_cis', 'sum'),
    n_trans = ('n_trans', 'sum'),
).reset_index()

gx['cis_percent'] = (gx['n_cis'] / gx['total_contacts']) * 100

print(gx.round(2).astype(str).to_latex(index=False))

\begin{tabular}{lllll}
\toprule
sample & total_contacts & n_cis & n_trans & cis_percent \\
\midrule
o1 & 751788 & 118275 & 633513 & 15.73 \\
o2 & 138557 & 19288 & 119269 & 13.92 \\
o3 & 553936 & 117636 & 436300 & 21.24 \\
o4 & 579356 & 172557 & 406799 & 29.78 \\
\bottomrule
\end{tabular}



# Number of cells passing:

In [9]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/single_cell/pairs/direct.basic.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df['cell_id'] = df['basename'].str[0:5]
df['trans_ratio'] = df['n_trans'] / df['n_contacts']
df['passing_trans'] = (df['trans_ratio'] < 0.3)
df['passing_contacts'] = (df['n_contacts'] > 10000)
df['passing_both'] =  df['passing_trans'] & df['passing_contacts']

df.head()


df.shape=(380, 4)


Unnamed: 0,basename,n_contacts,n_cis,n_trans,sample,cell,cell_id,trans_ratio,passing_trans,passing_contacts,passing_both
0,o4b03.GRCm39.filtered.pairs,8,0,8,o4,b03,o4b03,1.0,False,False,False
1,o4b02.GRCm39.filtered.pairs,151,61,90,o4,b02,o4b02,0.596026,False,False,False
2,o4b08.GRCm39.filtered.pairs,36,13,23,o4,b08,o4b08,0.638889,False,False,False
3,o4b06.GRCm39.filtered.pairs,60,15,45,o4,b06,o4b06,0.75,False,False,False
4,o4b01.GRCm39.filtered.pairs,118,50,68,o4,b01,o4b01,0.576271,False,False,False


In [10]:
df['passing_trans'].value_counts()

passing_trans
False    380
Name: count, dtype: int64

In [11]:
gx = df.groupby('sample').agg(
    passing_contacts = ('passing_contacts', 'sum'),
    passing_trans = ('passing_trans', 'sum'),
    passing_both = ('passing_both', 'sum'),
).reset_index()

print(gx.astype(str).to_latex(index=False))

\begin{tabular}{llll}
\toprule
sample & passing_contacts & passing_trans & passing_both \\
\midrule
o1 & 21 & 0 & 0 \\
o2 & 0 & 0 & 0 \\
o3 & 7 & 0 & 0 \\
o4 & 15 & 0 & 0 \\
\bottomrule
\end{tabular}



# For expanded contacts

In [12]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/pairs/expanded.basic.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df['cell_id'] = df['basename'].str[0:5]

totals = dict(zip(df['cell_id'].values, df['n_contacts'].values))
df.head()

df.shape=(380, 4)


Unnamed: 0,basename,n_contacts,n_cis,n_trans,sample,cell,cell_id
0,o4b03.GRCm39.expanded.pairs,148,76,72,o4,b03,o4b03
1,o4b02.GRCm39.expanded.pairs,1709,690,1019,o4,b02,o4b02
2,o4b08.GRCm39.expanded.pairs,508,164,344,o4,b08,o4b08
3,o4b06.GRCm39.expanded.pairs,1795,1195,600,o4,b06,o4b06
4,o4b01.GRCm39.expanded.pairs,1710,687,1023,o4,b01,o4b01


In [13]:
fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/single_cell/expanded.filter_report.csv"

df = pd.read_csv(fpath)
print(f"{df.shape=}")
df['sample'] = df['basename'].str[:2]
df['cell'] = df['basename'].str[2:5]
df['cell_id'] = df['basename'].str[0:5]
df['total_theoretical_contacts'] = df['cell_id'].map(totals)

df.head()

df.shape=(380, 7)


Unnamed: 0,unmapped,adjacent,close_contact,duplicate,promiscuous,isolated,basename,sample,cell,cell_id,total_theoretical_contacts
0,99,28,5,0,0,0,o4b03.GRCm39.expanded.csv,o4,b03,o4b03,148
1,973,478,69,1,0,0,o4b02.GRCm39.expanded.csv,o4,b02,o4b02,1709
2,311,133,22,0,0,0,o4b08.GRCm39.expanded.csv,o4,b08,o4b08,508
3,1426,252,28,0,0,0,o4b06.GRCm39.expanded.csv,o4,b06,o4b06,1795
4,971,499,56,0,0,0,o4b01.GRCm39.expanded.csv,o4,b01,o4b01,1710


In [14]:
825 - (817)

8

In [15]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
gx = df.groupby('sample').agg(
    total_theoretical_contacts = ('total_theoretical_contacts', 'sum'),
    unmapped = ('unmapped', 'sum'),
    adjacent = ('adjacent', 'sum'),
    close_contact = ('close_contact', 'sum'),
    duplicate = ('duplicate', 'sum'),
    promiscuous = ('promiscuous', 'sum'),
    isolated = ('isolated', 'sum'),
).reset_index()

gx['total_contacts'] = gx['total_theoretical_contacts'] - gx['unmapped']
gx['passing'] = gx['total_contacts'] - gx[['adjacent','close_contact','duplicate','promiscuous', 'isolated']].sum(axis=1)

columns = [
    'sample',
    'total_contacts',
    'passing',
    'adjacent',
    'close_contact',
    'duplicate',
    'promiscuous',
    'isolated'
]

print(gx[columns].astype(str).to_latex(index=False))

In [None]:
break

In [None]:
dpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/reports/single_cell/"
file_list = glob.glob(f"{dpath}*filter_report*")

for f in file_list:
    tmp = pd.read_csv(f)
    print(f)
    print(tmp.head(1).T)
    print()

In [None]:
align_path = "/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/o4b03.GRCm39.align_table.parquet"

pdf = pd.read_parquet(align_path)
print(f"{pdf.shape=}")
print(f"{pdf['read_name'].nunique()=}")
pdf.head()

# Archive

In [None]:
import pairtools
import pairtools.lib.headerops as phead

def load_pairs(fpath, comment_char="#"):
    """Loads a pairs table from a tab-separated file.

    Args:
        fpath (str): The file path to the pairs table.
        comment_char (str, optional): The character indicating comment lines. 
                                     Defaults to "#".

    Returns:
        pd.DataFrame: A Pandas DataFrame containing the pairs table data.
    """
    header = phead.get_header(open(fpath))[0][-1]
    header = header.replace("#columns: ", "")
    header = header.split(" ")
    
    df = pd.read_csv(fpath, 
                     sep='\t', 
                     header=None, 
                     names=header, 
                     comment="#")
    return df


fpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/direct/pairs/o3b25.GRCm39.direct.pairs"
tdf = load_pairs(fpath)
print(f"(unfiltered) {tdf.shape=}")
tdf = tdf[(tdf['chrom1'] != "!") & (tdf['chrom2'] != "!")].copy()
print(f"(unfiltered) {tdf.shape=}")
tdf.head()

In [None]:
58891 - 14194