In [1]:
import pandas as pd
import os 
import numpy as np
from pyranges import PyRanges
import pairtools
import pairtools.lib.headerops as phead

In [4]:
def load_pairs(fpath, comment_char="#"):
    """Loads a pairs table from a tab-separated file.

    Args:
        fpath (str): The file path to the pairs table.
        comment_char (str, optional): The character indicating comment lines. 
                                     Defaults to "#".

    Returns:
        pd.DataFrame: A Pandas DataFrame containing the pairs table data.
    """
    header = phead.get_header(open(fpath))[0][-1]
    header = header.replace("#columns: ", "")
    header = header.split(" ")
    
    df = pd.read_csv(fpath, 
                     sep='\t', 
                     header=None, 
                     names=header, 
                     comment="#")
    return df


fpath = '/scratch/indikar_root/indikar1/shared_data/single_cell/expanded/pairs/o1b05.GRCm39.expanded.pairs'
df = load_pairs(fpath)
print(f"{df.shape=}")
df.head()

df.shape=(1353139, 24)


Unnamed: 0,readID,chrom1,pos1,chrom2,pos2,strand1,strand2,pair_type,mapq1,mapq2,...,algn_ref_span1,algn_ref_span2,matched_bp1,matched_bp2,rfrag1,rfrag_start1,rfrag_end1,rfrag2,rfrag_start2,rfrag_end2
0,c984758d-b12e-44a0-86fa-f5bb326837f1,!,0,4,42850159,-,+,NU,0,1,...,0,243,0,242,-1,0,0,183807,42849815,42850205
1,6b11c966-4911-4640-a640-c3b651524b5b,!,0,1,160219378,-,+,NU,0,60,...,0,590,0,589,-1,0,0,760125,160218791,160219386
2,6b11c966-4911-4640-a640-c3b651524b5b,!,0,1,160218505,-,-,NU,0,60,...,0,278,0,276,-1,0,0,760123,160218281,160218511
3,6b11c966-4911-4640-a640-c3b651524b5b,1,160218789,1,160218505,+,-,UU,60,60,...,590,278,589,276,760124,160218511,160218791,760123,160218281,160218511
4,8f835bd9-c2ce-4dbd-8784-11672532138c,!,0,11,72720934,-,+,NU,0,21,...,0,61,0,61,-1,0,0,345258,72720876,72720976


In [5]:
df.columns

Index(['readID', 'chrom1', 'pos1', 'chrom2', 'pos2', 'strand1', 'strand2',
       'pair_type', 'mapq1', 'mapq2', 'read_len1', 'read_len2',
       'algn_read_span1', 'algn_read_span2', 'algn_ref_span1',
       'algn_ref_span2', 'matched_bp1', 'matched_bp2', 'rfrag1',
       'rfrag_start1', 'rfrag_end1', 'rfrag2', 'rfrag_start2', 'rfrag_end2'],
      dtype='object')

In [20]:
def filter_unmapped(df):
    """A function to filter the unmapped reads """
    df = df[(df['chrom1'] != "!") & (df['chrom2'] != "!")].copy()
    return df

def filter_adjacent(df, tolerance=1):
    """A function to filter out fragments
    adjacent on the reference """
    return df[df['rfrag1'].sub(df['rfrag1'].shift(1)).abs() > tolerance].copy()

def filter_close_contact(df, bp=1000):
    """A function to filter contacts which are close"""
    return df[~((df['chrom1'] == df['chrom2']) & (df['pos1'].sub(df['pos2']).abs() < bp))].copy()


def filter_duplicate_contacts(df):
    return (df.assign(rfrag_diff=df['rfrag1'].sub(df['rfrag2']).abs())
              .sort_values('rfrag_diff')
              .drop_duplicates(subset=['rfrag1', 'rfrag2'], keep='first')
              .drop(columns=['rfrag_diff']))


def filter_promiscuous_fragments(df, threshold = 10):
    """Filters out contacts involving promiscuous fragments from a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing contact information with 'rfrag1' and 'rfrag2' columns.
        threshold (int, optional): The minimum number of interactions for a fragment to be considered promiscuous. Defaults to 10.

    Returns:
        pd.DataFrame: The filtered DataFrame with contacts involving promiscuous fragments removed.
    """

    fragment_counts = (pd.concat([df["rfrag1"], df["rfrag2"]])
                         .value_counts()
                         .reset_index(name="freq")
                         .rename(columns={"index": "fragment_id"}))

    promiscuous_fragments = fragment_counts[fragment_counts["freq"] > threshold]["fragment_id"].tolist()

    return df[~(df["rfrag1"].isin(promiscuous_fragments) | df["rfrag2"].isin(promiscuous_fragments))]


def filter_isolated_fragments(df, max_distance = 10_000_000):
    """Filters out contacts involving isolated fragments from a DataFrame.

    Args:
        df (pd.DataFrame): DataFrame containing contact information with 
            'rfrag1', 'rfrag2', 'chrom1', 'pos1', 'chrom2', 'pos2' columns.
        max_distance (int, optional): The maximum distance to another fragment for a fragment to be considered non-isolated. Defaults to 10,000,000.

    Returns:
        pd.DataFrame: The filtered DataFrame with contacts involving isolated fragments removed.
    """
    
    # Create a combined DataFrame with fragment information
    fragment_data = pd.concat([
        df[['rfrag1', 'chrom1', 'rfrag_start1', 'rfrag_end1']].rename(columns={'rfrag1': 'id', 'chrom1': 'Chromosome', 'rfrag_start1': 'Start', 'rfrag_end1': 'End'}),
        df[['rfrag2', 'chrom2', 'rfrag_start2', 'rfrag_end2']].rename(columns={'rfrag2': 'id', 'chrom2': 'Chromosome', 'rfrag_start2': 'Start', 'rfrag_end2': 'End'}),
    ]).drop_duplicates()

    # Create PyRanges object
    frag_ranges = PyRanges(fragment_data)

    # Find nearest neighbors
    nearest_fragments = frag_ranges.nearest(frag_ranges)

    # Identify isolated fragments
    isolated_fragments = nearest_fragments.df[nearest_fragments.df["Distance"] > max_distance]["id"].tolist()

    # Filter out contacts involving isolated fragments
    return df[~(df["rfrag1"].isin(isolated_fragments) | df["rfrag2"].isin(isolated_fragments))]


pdf = filter_unmapped(df)
print(f"{pdf.shape=}")

pdf = filter_adjacent(pdf)
print(f"{pdf.shape=}")

pdf = filter_close_contact(pdf)
print(f"{pdf.shape=}")

pdf = filter_duplicate_contacts(pdf)
print(f"{pdf.shape=}")

pdf = filter_promiscuous_fragments(pdf)
print(f"{pdf.shape=}")

pdf = filter_isolated_fragments(pdf)
print(f"{pdf.shape=}")

pdf.shape=(190905, 24)
pdf.shape=(189740, 24)
pdf.shape=(131727, 24)
pdf.shape=(6479, 24)
pdf.shape=(4207, 24)
pdf.shape=(4207, 24)


In [31]:
def contact_filter(df):
    """Filters contacts, returning the filtered DataFrame and a summary of filter results.

    Args:
        df (pandas.DataFrame): DataFrame with contact information.

    Returns:
        tuple: A tuple containing:
            - pandas.DataFrame: The final filtered DataFrame.
            - pandas.DataFrame: A DataFrame summarizing the number of rows removed by each filter.
    """

    df = df.reset_index(drop=True).copy()  # Ensure a copy to avoid SettingWithCopyWarning
    summary_data = []

    filters = [
        (filter_unmapped, "unmapped"),
        (filter_adjacent, "adjacent"),
        (filter_close_contact, "close_contact"),
        (filter_duplicate_contacts, "duplicate"),
        (filter_promiscuous_fragments, "promiscuous"),
        (filter_isolated_fragments, "isolated"),
    ]

    for filter_func, filter_name in filters:
        original_size = len(df)
        df = filter_func(df)  # Apply the filter
        filtered_out_count = original_size - len(df) # Calculate rows removed
        summary_data.append({"filter": filter_name, "rows_filtered_out": filtered_out_count}) # Store summary

    summary_df = pd.DataFrame(summary_data)  # Create summary DataFrame
    return df, summary_df

pdf, sum_df = contact_filter(df)
pdf.shape

(4207, 24)

In [32]:
sum_df

Unnamed: 0,filter,rows_filtered_out
0,unmapped,1162234
1,adjacent,1165
2,close_contact,58013
3,duplicate,125248
4,promiscuous,2272
5,isolated,0


In [41]:
def write_pairs(df, header, outpath):
    """A function to write a pairs file """
    with open(outpath, "a") as file:  # Open in append mode
        for line in header:
            file.write(line + "\n") 
        
    df.to_csv(outpath, mode='a', header=None, sep="\t", index=False)
    
    

fpath = '/scratch/indikar_root/indikar1/shared_data/single_cell/expanded/pairs/o1b05.GRCm39.expanded.pairs'
header = phead.get_header(open(fpath))[0]

outpath = "/home/cstansbu/temp/test.pairs"

write_pairs(df, header, outpath)

In [37]:
fpath = '/scratch/indikar_root/indikar1/shared_data/single_cell/expanded/pairs/o1b05.GRCm39.expanded.pairs'
phead.get_header(open(fpath))[0]

['## pairs format v1.0.0',
 '#shape: whole matrix',
 '#genome_assembly: unknown',
 '#chromsize: 1 195154279',
 '#chromsize: 2 181755017',
 '#chromsize: 3 159745316',
 '#chromsize: 4 156860686',
 '#chromsize: 5 151758149',
 '#chromsize: 6 149588044',
 '#chromsize: 7 144995196',
 '#chromsize: 8 130127694',
 '#chromsize: 9 124359700',
 '#chromsize: 10 130530862',
 '#chromsize: 11 121973369',
 '#chromsize: 12 120092757',
 '#chromsize: 13 120883175',
 '#chromsize: 14 125139656',
 '#chromsize: 15 104073951',
 '#chromsize: 16 98008968',
 '#chromsize: 17 95294699',
 '#chromsize: 18 90720763',
 '#chromsize: 19 61420004',
 '#chromsize: X 169476592',
 '#chromsize: Y 91455967',
 '#chromsize: MT 16299',
 '#chromsize: JH584299.1 953012',
 '#chromsize: GL456233.2 559103',
 '#chromsize: JH584301.1 259875',
 '#chromsize: GL456211.1 241735',
 '#chromsize: GL456221.1 206961',
 '#chromsize: JH584297.1 205776',
 '#chromsize: JH584296.1 199368',
 '#chromsize: GL456354.1 195993',
 '#chromsize: JH584298.1 184

In [None]:

'readID', 
'chrom1', 
'pos1',
'chrom2',
'pos2', 
'mapq1',
'mapq2', 
'rfrag1',
'rfrag2', 
