# ChIP-seq FASTA Extraction from AllPeaks Files  
This notebook loads peak tables (`*_AllPeaks.tsv`), filters regions with `EnrichmentFactor ≥ 4`,  
and extracts ±100 bp FASTA windows around the `AvgMaxPos` coordinate for each peak.

It uses two reference FASTA files (chromosome I and II of *Vibrio cholerae*),  
stored under `data_example/reference/`.

Outputs include:

- An Excel file with full and filtered peak tables  
- Two FASTA files for each strain:
  - `*_all.fa` — all peaks
  - `*_ge4.fa` — only EF ≥ 4

This notebook is designed for **Google Colab** and **reproducibility**, using only repository-local files.


In [1]:
# Mount Google Drive only when the Colab helper package is importable.
# In any other environment (local Jupyter, CI) the import fails and the
# notebook simply continues with whatever paths the configuration cell defines.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip -q install pandas openpyxl pyfaidx


In [3]:
import pandas as pd
from pyfaidx import Fasta
from pathlib import Path

# ============================
# 1. DIRECTORIES (relative paths for reproducibility)
# ============================

# Single root for every path below. "." keeps them relative to the current
# working directory; when the repository lives elsewhere (for example under a
# mounted Drive folder), change only this line.
BASE_DIR = Path(".")

RAW_DIR  = BASE_DIR / "data_example/allpeaks"                     # folder with the *AllPeaks peak tables
OUT_DIR  = BASE_DIR / "results/fasta_outputs"                     # where FASTAs + XLSX will be saved
REF_505  = BASE_DIR / "data_example/reference/NC_002505.fasta"    # chromosome I
REF_506  = BASE_DIR / "data_example/reference/NC_002506.fasta"    # chromosome II

# Create the output folder up front so later cells can write without checks.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Column mapping (expected in the AllPeaks tables): internal key -> column header
COLS = {
    "region_number": "RegionNumber",
    "chrom":         "Chromosome",
    "avgmax":        "AvgMaxPos",
    "ef":            "EnrichmentFactor",
    "start":         "RegionStart",
    "end":           "RegionEnd"
}

WINDOW    = 100     # ±100 bp window around AvgMaxPos
ONE_BASED = True    # AvgMaxPos is 1-based


In [4]:
# --- verify that both reference FASTAs exist before opening either one ---
for ref_path in (REF_505, REF_506):
    if not ref_path.exists():
        raise FileNotFoundError(f"Reference FASTA not found: {ref_path}")


def _open_reference(path):
    """Open one reference FASTA with the options shared by both chromosomes."""
    return Fasta(str(path), as_raw=True, sequence_always_upper=True)


fa_505 = _open_reference(REF_505)
fa_506 = _open_reference(REF_506)


In [5]:
def to_zero_based_from_center(center:int, win:int, one_based:bool, contig_len:int):
    """Return the 0-based, half-open interval covering ``center`` ± ``win`` bases.

    Parameters
    ----------
    center : int
        Coordinate of the window centre; interpreted as 1-based when
        ``one_based`` is true and as 0-based otherwise.
    win : int
        Number of flanking bases to include on each side of the centre.
    one_based : bool
        Whether ``center`` uses 1-based numbering.
    contig_len : int
        Length of the contig; the interval is clipped to ``[0, contig_len]``.

    Returns
    -------
    tuple
        ``(start, end)`` usable directly as a Python slice. When no clipping
        occurs the interval spans ``2 * win + 1`` bases in both conventions.
    """
    # Normalise to a 0-based centre first so both conventions share one formula.
    center0 = center - 1 if one_based else center
    s0 = max(0, center0 - win)
    # +1: the end is exclusive and the centre base itself belongs to the window.
    e0 = min(contig_len, center0 + win + 1)
    return s0, e0


def choose_fasta_by_chrom(chrom_raw: str):
    """Select the reference handle matching a chromosome label.

    The label is converted to text and tested for the NC_002505 accession
    (chromosome I) first, then for NC_002506 (chromosome II); a bare "2505" /
    "2506" anywhere in the label also counts as a match. Returns the
    corresponding module-level handle, or ``None`` when neither matches.
    """
    label = str(chrom_raw)

    is_chrom_one = label.startswith("NC_002505") or "2505" in label
    if is_chrom_one:
        return fa_505

    is_chrom_two = label.startswith("NC_002506") or "2506" in label
    return fa_506 if is_chrom_two else None


def resolve_contig(fa: Fasta, chrom_raw: str):
    """Find the record name in ``fa`` that corresponds to a chromosome label.

    The label is tried verbatim first, then in these variants, in order: the
    text before the first "." (only if the label contains one), the label with
    its "chr" prefix removed (or added when absent), and the label with every
    "NC_" removed. The first variant that is a key of ``fa`` is returned.
    If nothing matches and ``fa`` holds exactly one record, that record's name
    is returned; otherwise the result is ``None``.
    """
    label = str(chrom_raw)
    available = set(fa.keys())

    # Ordered list of spellings to probe against the FASTA keys.
    before_dot = [label.split(".")[0]] if "." in label else []
    chr_toggled = label[3:] if label.startswith("chr") else "chr" + label
    variants = [label, *before_dot, chr_toggled, label.replace("NC_", "")]

    hit = next((name for name in variants if name in available), None)
    if hit is not None:
        return hit

    # Single-record FASTA: fall back to its only contig, whatever it is called.
    return next(iter(available)) if len(available) == 1 else None


def write_fasta(df: pd.DataFrame, out_path: Path, strain: str, tag: str):
    """Write one FASTA record per peak, covering ±WINDOW bp around AvgMaxPos.

    Parameters
    ----------
    df : pd.DataFrame
        Peak table; must contain the columns named by ``COLS["region_number"]``,
        ``COLS["chrom"]`` and ``COLS["avgmax"]``.
    out_path : Path
        Destination FASTA file (overwritten if it already exists).
    strain, tag : str
        Used to build each header: ``>{strain}_{tag}_R{region}_{start}-{end}``,
        where start/end are the 1-based inclusive bounds of the extracted window.

    Rows are skipped, and counted in the printed summary, when no reference or
    contig can be matched to the chromosome label, or when the clipped window
    is empty because the centre lies beyond the contig.
    """
    n_ok, n_skip = 0, 0
    line_width = 60  # bases per sequence line

    with open(out_path, "w") as fh:
        for _, row in df.iterrows():
            rn     = int(row[COLS["region_number"]])
            chrom  = row[COLS["chrom"]]
            center = int(row[COLS["avgmax"]])

            fa = choose_fasta_by_chrom(chrom)
            if fa is None:
                n_skip += 1
                continue

            contig = resolve_contig(fa, chrom)
            if contig is None:
                n_skip += 1
                continue

            contig_len = len(fa[contig])
            s0, e0 = to_zero_based_from_center(center, WINDOW, ONE_BASED, contig_len)
            if s0 >= e0:
                # Nothing to extract; a header without sequence would be a
                # malformed FASTA record, so count the row as skipped instead.
                n_skip += 1
                continue

            # str() gives plain sequence text whether the slice is already a
            # string or a sequence object (TODO confirm for non-raw handles).
            seq = str(fa[contig][s0:e0]).upper()

            # Header coordinates are 1-based inclusive: s0+1 .. e0.
            header = f">{strain}_{tag}_R{rn}_{s0+1}-{e0}"
            fh.write(header + "\n")
            for i in range(0, len(seq), line_width):
                fh.write(seq[i:i+line_width] + "\n")

            n_ok += 1

    print(f"FASTA saved: {out_path.name} | sequences: {n_ok} | skipped: {n_skip}")


In [11]:
# ============================
# 4. PIPELINE
# ============================

# Peak tables are accepted with either extension; the field separator follows
# the extension so tab- and comma-separated exports are both read correctly.
SEPARATORS = {".tsv": "\t", ".csv": ","}
EF_MIN = 4  # EnrichmentFactor cutoff for the filtered ("ge4") outputs

# sorted() makes the processing order (and therefore the log) deterministic.
peak_files = sorted(
    path for ext in SEPARATORS for path in RAW_DIR.glob(f"*AllPeaks{ext}")
)
if not peak_files:
    # Without this message an empty/mistyped input folder looks like a success.
    print(f"⚠ No *AllPeaks.tsv / *AllPeaks.csv files found in {RAW_DIR}")

for peaks_path in peak_files:
    strain = peaks_path.stem.replace("_AllPeaks", "")
    print(f"\n▶ Processing {strain} ...")

    df = pd.read_csv(
        peaks_path,
        sep=SEPARATORS[peaks_path.suffix],
        dtype={COLS["chrom"]: "string"},
    )

    # Fail early, naming the file, if a column needed downstream is absent.
    required = [COLS[k] for k in ("region_number", "chrom", "avgmax", "ef")]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{peaks_path.name}: missing expected column(s): {missing}")

    df[COLS["ef"]] = pd.to_numeric(df[COLS["ef"]], errors="coerce")

    # Strip "," from AvgMaxPos (e.g. thousands separators) and store it as int.
    # Done once on the full table; the filtered copy below inherits the result.
    df[COLS["avgmax"]] = (
        df[COLS["avgmax"]]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(int)
    )

    # Rows whose EnrichmentFactor could not be parsed are NaN and fail the >= test.
    df_ge4 = df[df[COLS["ef"]] >= EF_MIN].copy()

    # Write XLSX: full table and filtered table as two sheets
    out_xlsx = OUT_DIR / f"{strain}_AllPeaks.xlsx"
    with pd.ExcelWriter(out_xlsx, engine="openpyxl") as xw:
        df.to_excel(xw, index=False, sheet_name="AllPeaks")
        df_ge4.to_excel(xw, index=False, sheet_name="EF_ge4")

    # FASTA files
    out_all_fa = OUT_DIR / f"{strain}_all.fa"
    out_ge4_fa = OUT_DIR / f"{strain}_ge4.fa"

    write_fasta(df,      out_all_fa, strain, "all")
    write_fasta(df_ge4,  out_ge4_fa, strain, "ge4")

    print(f"✔ {strain}: XLSX + FASTA saved in {OUT_DIR}")

print("\n=== Completed ===")



▶ Processing mock_strainA ...
FASTA saved: mock_strainA_all.fa | sequences: 12 | skipped: 0
FASTA saved: mock_strainA_ge4.fa | sequences: 12 | skipped: 0
✔ mock_strainA: XLSX + FASTA saved in /content/drive/MyDrive/example/res

▶ Processing mock_strainB ...
FASTA saved: mock_strainB_all.fa | sequences: 12 | skipped: 0
FASTA saved: mock_strainB_ge4.fa | sequences: 12 | skipped: 0
✔ mock_strainB: XLSX + FASTA saved in /content/drive/MyDrive/example/res

=== Completed ===
