# ChIP-seq – Cluster nearby peaks and filter by enrichment

This notebook combines multiple `*_AllPeaks.csv` files (each corresponding to a strain
or condition), groups nearby regions on each chromosome whose `RegionStart` positions
fall within a distance threshold, and produces a wide table with one row per cluster
and columns for each strain.

Missing `EnrichmentFactor` values are filled with a minimum value (default: 1.5), and
clusters are filtered to keep only those with at least one `EnrichmentFactor ≥ 4`
across strains.

> **This notebook is intended as a mock/example workflow** using the files  
> `mock_strainA_AllPeaks.csv` and `mock_strainB_AllPeaks.csv`.  
> In a real analysis, you would replace these with your actual `*_AllPeaks.csv` files.

---

## Expected input

Files under `DATA_DIR`, each named:

- `something_AllPeaks.csv` (e.g. `mock_strainA_AllPeaks.csv`)

with at least the following columns:

- `Chromosome`
- `RegionStart`
- `RegionEnd`
- `EnrichmentFactor`

Other columns (such as `RegionNumber`, `AvgMaxPos`, `RegionMiddle`, `NumFiles`,
`Whichfiles`) are preserved and will also be suffixed per strain.

## Output

A tab-separated file `OUT_TSV` (by default `paired_regions.tsv` under `RESULTS_DIR`) containing:

- one row per clustered region (the internal `cluster` id is used for grouping but is not written to the file),
- one set of suffixed columns per strain (e.g. `EnrichmentFactor_strainA`,
  `RegionStart_strainB`, etc.),
- only clusters where at least one `EnrichmentFactor_* ≥ 4`.


In [1]:
# Mount Google Drive so the Drive-based paths used by this notebook resolve.
# The `google.colab` package is normally only importable inside Google Colab;
# when it is missing (e.g. a local Jupyter run from a repository checkout),
# skip the mount instead of aborting the whole notebook on the first cell.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# @title Step 1 – Imports and basic parameters

import os
import glob
import pandas as pd

# ---- Input location ---------------------------------------------------------
# Folder holding the *_AllPeaks.csv inputs. In the public repository this is
# expected to be a folder under data_example, for example:
#   data_example/chipseq/mock_strainA_AllPeaks.csv
#   data_example/chipseq/mock_strainB_AllPeaks.csv
DATA_DIR = "/content/drive/MyDrive/example/"   # adjust to match your repo layout

# ---- Analysis parameters ----------------------------------------------------
# Largest distance (in bp) between a peak start and a cluster center for the
# peak to be assigned to that cluster.
THRESHOLD = 200

# Value substituted for missing EnrichmentFactor entries.
MIN_ENRICHMENT = 1.5

# A cluster is kept only if at least one strain reaches this EnrichmentFactor.
ENRICHMENT_THRESHOLD = 4.0

# ---- Output location --------------------------------------------------------
# Folder (created on demand) and file that receive the final table.
RESULTS_DIR = "/content/drive/MyDrive/example/res/"
OUT_TSV = os.path.join(RESULTS_DIR, "paired_regions.tsv")
os.makedirs(RESULTS_DIR, exist_ok=True)


In [3]:
!ls -l /content/drive/MyDrive/example/

total 6
-rw------- 1 root root  865 Dec  9 15:43 mock_strainA_AllPeaks.csv
-rw------- 1 root root  865 Dec  9 15:43 mock_strainB_AllPeaks.csv
drwx------ 2 root root 4096 Dec  9 15:58 res


In [4]:
# @title Step 2 – Function to load one file and apply a strain suffix

def load_and_suffix(fp, suffix_map=None):
    """
    Load a single *_AllPeaks.csv file and label its columns with a strain suffix.

    The original ``Chromosome`` and ``RegionStart`` values are copied into the
    un-suffixed columns ``Chromosome_orig`` and ``RegionStart_orig`` (used as
    clustering keys); every column read from the file is then renamed to
    ``<column>_<suffix>``, and a ``strain`` column holding the suffix is added.

    Parameters
    ----------
    fp : str
        Path to a CSV file. The strain label is derived from the file's base
        name with ``_AllPeaks.csv`` removed.
    suffix_map : dict, optional
        Mapping from file base name to a short strain label. Base names that
        are not in the mapping are used unchanged as the label. Defaults to
        the mapping for the mock example files
        (``mock_strainA`` -> ``strainA``, ``mock_strainB`` -> ``strainB``).

    Returns
    -------
    df : pandas.DataFrame
        DataFrame with suffixed columns plus ``Chromosome_orig``,
        ``RegionStart_orig`` and ``strain``.

    Raises
    ------
    ValueError
        If ``Chromosome``, ``RegionStart`` or ``EnrichmentFactor`` is missing
        from the file.
    """
    if suffix_map is None:
        # Default mapping for the mock example:
        #   mock_strainA_AllPeaks.csv -> strainA
        #   mock_strainB_AllPeaks.csv -> strainB
        suffix_map = {
            "mock_strainA": "strainA",
            "mock_strainB": "strainB",
        }

    base = os.path.basename(fp).replace("_AllPeaks.csv", "")
    suffix = suffix_map.get(base, base)

    df = pd.read_csv(fp)

    # Basic sanity checks. EnrichmentFactor is validated here as well: without
    # it the strain would have no EnrichmentFactor_<suffix> column and would be
    # silently ignored by the enrichment filter later in the notebook.
    required_cols = ["Chromosome", "RegionStart", "EnrichmentFactor"]
    for col in required_cols:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in {fp}")

    # Preserve original keys for clustering
    df["Chromosome_orig"] = df["Chromosome"]
    df["RegionStart_orig"] = df["RegionStart"]

    # Columns that must NOT be suffixed
    cols_to_keep = ["Chromosome_orig", "RegionStart_orig"]

    # Build renaming map: every other column gets _<suffix>
    rename_map = {
        col: f"{col}_{suffix}"
        for col in df.columns
        if col not in cols_to_keep
    }
    df = df.rename(columns=rename_map)

    # Add strain label (after renaming, so this column stays un-suffixed)
    df["strain"] = suffix

    return df


In [5]:
# @title Step 3 – Load all *_AllPeaks.csv files

pattern = os.path.join(DATA_DIR, "*_AllPeaks.csv")
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the listing below and the row order of the combined table are reproducible.
csv_files = sorted(glob.glob(pattern))

if not csv_files:
    raise FileNotFoundError(
        f"No *_AllPeaks.csv files found in {DATA_DIR}. "
        "Check the DATA_DIR path and file names."
    )

print(f"Found {len(csv_files)} file(s):")
for csv_path in csv_files:
    print(" -", os.path.basename(csv_path))

# One suffixed frame per file, stacked into a single long table. Columns that
# belong to another strain are NaN for a given row.
dfs = [load_and_suffix(csv_path) for csv_path in csv_files]
all_df = pd.concat(dfs, ignore_index=True)

print(f"Total combined rows: {len(all_df)}")
all_df.head()


Found 2 file(s):
 - mock_strainA_AllPeaks.csv
 - mock_strainB_AllPeaks.csv
Total combined rows: 24


Unnamed: 0,RegionNumber_strainA,Chromosome_strainA,AvgMaxPos_strainA,RegionMiddle_strainA,NumFiles_strainA,RegionStart_strainA,RegionEnd_strainA,EnrichmentFactor_strainA,Whichfiles_strainA,Chromosome_orig,...,strain,RegionNumber_strainB,Chromosome_strainB,AvgMaxPos_strainB,RegionMiddle_strainB,NumFiles_strainB,RegionStart_strainB,RegionEnd_strainB,EnrichmentFactor_strainB,Whichfiles_strainB
0,1.0,NC_002506.1,512344,512700,3.0,512100,513000,37.8,CAB,NC_002506.1,...,strainA,,,,,,,,,
1,2.0,NC_002506.1,902112,902550,3.0,901880,903120,22.4,CAB,NC_002506.1,...,strainA,,,,,,,,,
2,3.0,NC_002506.1,31245,31890,3.0,30800,32050,29.6,CAB,NC_002506.1,...,strainA,,,,,,,,,
3,4.0,NC_002506.1,678901,679240,3.0,678500,679350,41.2,CAB,NC_002506.1,...,strainA,,,,,,,,,
4,5.0,NC_002506.1,843220,843680,3.0,842900,844100,18.9,CAB,NC_002506.1,...,strainA,,,,,,,,,


In [8]:
# @title Step 4 – Cluster peaks, pivot per strain, fill NaNs and filter

## 0) Guard: fail with a clear message if every input file was empty
if all_df.empty:
    raise ValueError("No peaks to cluster: the combined input table is empty.")

## 1) Clean RegionStart_orig -> int, then sort
# Going through str lets values written with thousands separators
# (e.g. "512,100") be parsed; commas are stripped before the integer cast.
all_df["RegionStart_orig"] = (
    all_df["RegionStart_orig"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(int)
)

all_df = all_df.sort_values(["Chromosome_orig", "RegionStart_orig"]).reset_index(drop=True)


def _assign_clusters(chroms, positions, threshold):
    """
    Assign a cluster id to each (chromosome, position) pair.

    A position joins the current cluster of its chromosome when it lies within
    `threshold` of that cluster's center; the center is then updated to the
    integer average of the old center and the new position. Otherwise a new
    cluster is opened at the position.

    Positions must be in non-decreasing order within each chromosome. Under
    that ordering a position that failed to match an older cluster is already
    more than `threshold` beyond its center, and so is every later position,
    so only the most recently opened cluster per chromosome needs checking.

    Returns a list of integer cluster ids, one per input pair.
    """
    latest = {}        # chrom -> (center_pos, cluster_id) of the newest cluster
    assigned = []
    next_id = 0
    for chrom, pos in zip(chroms, positions):
        current = latest.get(chrom)
        if current is not None and abs(pos - current[0]) <= threshold:
            center, cid = current
            latest[chrom] = ((center + pos) // 2, cid)
        else:
            cid = next_id
            next_id += 1
            latest[chrom] = (pos, cid)
        assigned.append(cid)
    return assigned


# Assign the whole column at once so it is created with an integer dtype.
all_df["cluster"] = _assign_clusters(
    all_df["Chromosome_orig"].tolist(),
    all_df["RegionStart_orig"].tolist(),
    THRESHOLD,
)

n_clusters = all_df["cluster"].nunique()
print(f"Total number of clusters formed: {n_clusters}")

# 2) Pivot to wide format: one row per cluster, suffixed columns per strain
# NOTE: columns are matched to a strain by the "_<strain>" name ending, so this
# assumes no strain label is itself the ending of another label's columns.
wide = None
for strain, sub in all_df.groupby("strain"):
    # select only columns belonging to this strain + cluster
    strain_cols = [c for c in sub.columns if c.endswith(f"_{strain}")] + ["cluster"]
    per_strain = sub[strain_cols].drop_duplicates()

    # A strain may contribute several peaks to one cluster. Joining on a
    # non-unique index would multiply rows, so keep a single representative
    # peak per cluster: the one with the highest EnrichmentFactor (NaN sorts
    # last). This leaves the ">= ENRICHMENT_THRESHOLD" filter outcome unchanged.
    n_extra = int(per_strain["cluster"].duplicated().sum())
    if n_extra:
        print(
            f"Note: strain '{strain}' has {n_extra} additional peak(s) sharing a "
            "cluster; keeping the highest-enrichment peak per cluster."
        )
    ef_col = f"EnrichmentFactor_{strain}"
    if ef_col in per_strain.columns:
        per_strain = per_strain.sort_values(ef_col, ascending=False, kind="mergesort")
    per_strain = (
        per_strain
        .drop_duplicates(subset="cluster", keep="first")
        .set_index("cluster")
        .sort_index()
    )

    if wide is None:
        wide = per_strain
    else:
        wide = wide.join(per_strain, how="outer")

# Reset index to have a simple integer index again
wide = wide.reset_index(drop=True)

print(f"Shape of wide table before filtering: {wide.shape}")

# 3) Fill NaNs in all EnrichmentFactor_* columns with MIN_ENRICHMENT
ef_cols = [c for c in wide.columns if c.startswith("EnrichmentFactor_")]
if not ef_cols:
    raise ValueError(
        "No 'EnrichmentFactor_*' columns found in the wide table. "
        "Check that the input files contain an 'EnrichmentFactor' column."
    )

wide[ef_cols] = wide[ef_cols].fillna(MIN_ENRICHMENT)

# 4) Keep only clusters with at least one EnrichmentFactor_* ≥ ENRICHMENT_THRESHOLD
mask = (wide[ef_cols] >= ENRICHMENT_THRESHOLD).any(axis=1)
wide = wide.loc[mask].reset_index(drop=True)

print(f"Shape of wide table after filtering: {wide.shape}")

# 5) Save to TSV
wide.to_csv(OUT_TSV, sep="\t", index=False)
print(f"Saved clustered and filtered table to: {OUT_TSV}")
wide.head()


Total number of clusters formed: 12
Shape of wide table before filtering: (12, 18)
Shape of wide table after filtering: (12, 18)
Saved clustered and filtered table to: /content/drive/MyDrive/example/res/paired_regions.tsv


Unnamed: 0,RegionNumber_strainA,Chromosome_strainA,AvgMaxPos_strainA,RegionMiddle_strainA,NumFiles_strainA,RegionStart_strainA,RegionEnd_strainA,EnrichmentFactor_strainA,Whichfiles_strainA,RegionNumber_strainB,Chromosome_strainB,AvgMaxPos_strainB,RegionMiddle_strainB,NumFiles_strainB,RegionStart_strainB,RegionEnd_strainB,EnrichmentFactor_strainB,Whichfiles_strainB
0,10.0,NC_002505.1,890443,890900,3.0,890100,891200,12.8,CAB,10.0,NC_002505.1,890443,890900,3.0,890100,891200,12.8,CAB
1,6.0,NC_002505.1,1250940,1251400,3.0,1250600,1251700,26.7,CAB,6.0,NC_002505.1,1250940,1251400,3.0,1250600,1251700,26.7,CAB
2,11.0,NC_002505.1,1678220,1678590,3.0,1677900,1678950,23.4,CAB,11.0,NC_002505.1,1678220,1678590,3.0,1677900,1678950,23.4,CAB
3,9.0,NC_002505.1,2562880,2563210,3.0,2562400,2563500,19.3,CAB,9.0,NC_002505.1,2562880,2563210,3.0,2562400,2563500,19.3,CAB
4,8.0,NC_002505.1,2801505,2801900,3.0,2800900,2802300,31.1,CAB,8.0,NC_002505.1,2801505,2801900,3.0,2800900,2802300,31.1,CAB
