In [1]:
import pandas as pd
import numpy as np
import os 
from math import comb
import itertools 

import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_seqkit_report(fpath):
    """A function to load a seqkit summary """
    pdf = pd.read_csv(fpath, sep=r"\s+")
    
    columns = [
        'num_seqs',
        'sum_len',
        'min_len',
        'avg_len',
        'max_len',
        'Q1',
        'Q2',
        'Q3',
        'N50',
    ] 
    
    for c in columns:
        pdf[c] = pdf[c].astype(str).str.replace(',', '').astype(float)

    return pdf

In [3]:
fpath = "/scratch/indikar_root/indikar1/shared_data/pore_c_population_minimal/reports/seqkit/fastq.report.txt"

pdf = load_seqkit_report(fpath)
pdf['Gb'] = pdf['sum_len'] / 1e+9

columns = [
    'file',
    'num_seqs',
    'Gb',
    'avg_len',
    'N50',
]

print(pdf[columns].round(1).astype(str).to_latex(index=False))

\begin{tabular}{lllll}
\toprule
file & num_seqs & Gb & avg_len & N50 \\
\midrule
batch01.raw.fastq & 1140937.0 & 1.8 & 1591.6 & 2159.0 \\
batch02.raw.fastq & 123442.0 & 0.2 & 1863.4 & 2453.0 \\
batch03.raw.fastq & 716000.0 & 1.1 & 1503.6 & 2068.0 \\
batch04.raw.fastq & 1212788.0 & 2.1 & 1758.7 & 2266.0 \\
\bottomrule
\end{tabular}



In [4]:
fpath = "/scratch/indikar_root/indikar1/shared_data/pore_c_population_minimal/align_table/batch02.GRCm39.align_table.parquet"

df = pd.read_parquet(fpath)
print(f"{df.shape=}")
print(f"{df['read_name'].nunique()=}")
df.head()

df.shape=(907344, 15)
df['read_name'].nunique()=123442


Unnamed: 0,read_name,align_id,read_start,read_end,length_on_read,chrom,ref_start,ref_end,fragment_id,fragment_start,fragment_end,fragment_length,monomer_duplicate,is_mapped,mapping_quality
0,00000b61-7794-4b29-9f89-2b74e7bbce3e,457681,0,8,8,,-1,,,,,,False,False,0
1,00000b61-7794-4b29-9f89-2b74e7bbce3e,457682,8,400,392,15.0,40952485,40952851.0,9795761.0,40952505.0,40952862.0,357.0,False,True,60
2,00000b61-7794-4b29-9f89-2b74e7bbce3e,457683,400,553,153,1.0,153345218,153345369.0,726712.0,153345224.0,153345375.0,151.0,False,True,60
3,00000b61-7794-4b29-9f89-2b74e7bbce3e,457684,553,611,58,,-1,,,,,,False,False,0
4,00000b61-7794-4b29-9f89-2b74e7bbce3e,457685,611,759,148,16.0,13896976,13897125.0,10167250.0,13896982.0,13897055.0,73.0,False,True,36


In [5]:
def get_contact_summary(df):
    """A function to summarize contact metrics for each read """
    res = []

    for read_name, group in df.groupby('read_name'):
        if not len(group) > 1:
            continue
            
        chroms = group['chrom'].values
        # get clique expanded pairs
        c_pairs = list(itertools.combinations(chroms, 2))
        exp_pairs = pd.DataFrame(c_pairs, columns=['c1', 'c2'])
        exp_pairs['is_cis'] = exp_pairs['c1'] == exp_pairs['c2']
        n_expanded_cis = exp_pairs['is_cis'].sum()
    
        # get direct pairs
        d_pairs = [(group['chrom'].iloc[i-1], group['chrom'].iloc[i]) for i in range(1, len(group))]
        adj_pairs = pd.DataFrame(d_pairs, columns=['c1', 'c2'])
        adj_pairs['is_cis'] = adj_pairs['c1'] == adj_pairs['c2']
        n_adj_cis = adj_pairs['is_cis'].sum()
    
        # compile metrics
        record = {
            'read_name' : read_name,
            'mean_mapq' : group['mapping_quality'].mean(),
            'min_mapq' : group['mapping_quality'].min(),
            'order' : len(group),
            'n_chroms' : group['chrom'].nunique(),
            'expanded_pairs' : len(exp_pairs),
            'expanded_cis' : n_expanded_cis,
            'direct_pairs' : len(adj_pairs),
            'direct_cis' : n_adj_cis,
        }
        res.append(record)

    res = pd.DataFrame(res)
    return res

In [None]:
sample_size = 100000

pdf = df.copy()

res = []

for t in np.linspace(0, 60, 13):
    pdf = pdf[pdf['mapping_quality'].notna()]
    pdf = pdf[pdf['mapping_quality'] >= t]
    pdf = pdf.sample(sample_size)
    
    # summarize contacts
    pdf = get_contact_summary(pdf)
    
    direct_cis = pdf['direct_cis'].sum() / pdf['direct_pairs'].sum()
    expanded_cis = pdf['expanded_cis'].sum() / pdf['expanded_pairs'].sum()
    
    row = {
        'mapping_quality_threshold' : t,
        'direct_cis' : direct_cis,
        'expanded_cis' : expanded_cis,
    }
    res.append(row)

res = pd.DataFrame(res)
res.head()

print(res.round(3).astype(str).to_latex(index=False))

# Direct Cis-trans

In [None]:
dpath = "/scratch/indikar_root/indikar1/shared_data/pore_c_population_minimal/direct/"

res = []

for f in os.listdir(dpath):
    if not f.endswith('json'):
        continue
    print(f)      
    fpath = f"{dpath}{f}"
    tmp = pd.read_json(fpath, keep_default_dates=True)
    tmp = tmp['cis_trans']
    tmp = tmp.loc[['cis', 'trans']].to_dict()
    tmp["batch"] = f.split(".")[0]
    
    res.append(tmp)


res = pd.DataFrame(res)
res = res.sort_values(by='batch')    
res = res.set_index('batch')
res['total'] = res.sum(axis=1)
res['cis(%)'] = res['cis'] / res['total']
res['trans(%)'] = res['trans'] / res['total']
res = res.reset_index()

print()
print(res.round(3).astype(str).to_latex(index=False))

In [None]:
dpath = "/scratch/indikar_root/indikar1/shared_data/pore_c_population_minimal/expanded/"

res = []

for f in os.listdir(dpath):
    if not f.endswith('json'):
        continue
    print(f)      
    fpath = f"{dpath}{f}"
    tmp = pd.read_json(fpath, keep_default_dates=True)
    tmp = tmp['cis_trans']
    tmp = tmp.loc[['cis', 'trans']].to_dict()
    tmp["batch"] = f.split(".")[0]
    
    res.append(tmp)

res = pd.DataFrame(res)
res = res.sort_values(by='batch')    
res = res.set_index('batch')
res['total'] = res.sum(axis=1)
res['cis(%)'] = res['cis'] / res['total']
res['trans(%)'] = res['trans'] / res['total']
res = res.reset_index()

print()
print(res.round(3).astype(str).to_latex(index=False))