In [1]:
import os, sys, argparse, hashlib
from typing import List, Tuple
import numpy as np
import pandas as pd

In [2]:
# Translation table pairing every nucleotide with its complement; case is kept and N maps to N.
RC_MAP = str.maketrans("ACGTNacgtn", "TGCANtgcan")

def revcomp(seq: str) -> str:
    """Return the reverse complement of ``seq``.

    Each character is first swapped through ``RC_MAP`` (characters that are not
    in the table pass through unchanged) and the result is then reversed.
    """
    complemented = seq.translate(RC_MAP)
    return "".join(reversed(complemented))

def canonical(seq: str) -> str:
    """Return whichever of ``seq`` and its reverse complement sorts first.

    Both strands of the same locus therefore map to one representative string.
    """
    return min(seq, revcomp(seq))

def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    payload = s.encode("utf-8")
    return hashlib.md5(payload).hexdigest()

def sanitize(seq: str) -> str:
    """Upper-case ``seq`` and replace every character other than A/C/G/T/N with N.

    A falsy input (``None`` or an empty string) yields an empty string.
    """
    allowed = frozenset("ACGTN")
    upper = (seq or "").upper()
    return "".join(base if base in allowed else "N" for base in upper)

def char_to_digit_map():
    """Return the nucleotide -> base-4 digit lookup (A=0, C=1, G=2, T=3)."""
    return {base: digit for digit, base in enumerate("ACGT")}

def kmer_to_id(token: str, k: int, c2d) -> int:
    """Encode an A/C/G/T k-mer as a base-4 integer id.

    The leftmost character ends up in the most significant position. ``k`` is
    accepted for the caller's convenience but is not consulted: every character
    of ``token`` is consumed. The caller must ensure ``token`` contains no 'N'
    (or any other character missing from ``c2d``).
    """
    code = 0
    for base in token:
        code <<= 2          # make room for the next base-4 digit
        code |= c2d[base]
    return code

In [3]:
def stratified_group_splits(df_groups: pd.DataFrame, val_frac=0.1, test_frac=0.1, seed=1337):
    """
    Split groups into train/val/test, stratified by their positive fraction.

    df_groups columns: group, n, pos_frac (enhancer_label mean).
    Groups are binned on pos_frac; each bin is shuffled with ``seed`` and its
    first rows go to test, the next ones to val and the remainder to train.
    Groups whose pos_frac falls in no bin (e.g. NaN) are not assigned to any split.

    Returns a tuple of three sets of group identifiers: (train, val, test).
    The input frame is not modified.
    """
    edges = [0.0, 0.001, 0.01, 0.1, 0.3, 0.7, 1.0]
    binned = df_groups.copy()
    binned["bin"] = pd.cut(binned["pos_frac"], bins=edges, include_lowest=True, labels=False)

    train_ids, val_ids, test_ids = set(), set(), set()
    for bin_id in sorted(binned["bin"].dropna().unique()):
        # Same seed for every bin: the shuffle is reproducible per bin.
        shuffled = binned.loc[binned["bin"] == bin_id].sample(frac=1.0, random_state=seed)
        names = shuffled["group"]
        size = len(shuffled)
        test_end = int(round(test_frac * size))
        val_end = test_end + int(round(val_frac * size))
        test_ids.update(names.iloc[:test_end].tolist())
        val_ids.update(names.iloc[test_end:val_end].tolist())
        train_ids.update(names.iloc[val_end:].tolist())
    return train_ids, val_ids, test_ids

In [4]:
def resolve_sep(path: str, sep_arg: str, mode: str) -> str:
    """
    Decide which field separator to use for a delimited text file.

    Parameters
    ----------
    path : file name used for extension-based inference (may be None/empty).
    sep_arg : explicit request; 'auto', None or '' means "infer from path".
        The keywords 'tab' / 'comma' (any case), a literal tab, the two-character
        text '\\t', and ',' are recognised. Any other value is treated as a
        literal separator and returned exactly as given (case preserved, so a
        regex separator such as '\\S+' is not altered).
    mode : "in" or "out"; accepted for call-site readability, it does not
        influence the result.

    Returns
    -------
    The separator string. When inferring: *.tsv, *.tab, *.txt (optionally with a
    .gz suffix) -> tab; anything else -> comma.
    """
    if sep_arg and sep_arg.lower() != "auto":
        keyword = sep_arg.lower()
        if keyword in ("tab", "\\t", "\t"):
            return "\t"
        if keyword in ("comma", ","):
            return ","
        # Literal separator such as ';' or a regex: hand it back untouched.
        return sep_arg
    name = (path or "").lower()
    tab_suffixes = (".tsv", ".tsv.gz", ".tab", ".tab.gz", ".txt", ".txt.gz")
    if name.endswith(tab_suffixes):
        return "\t"
    # default
    return ","

In [5]:
# Infer separators from the file names. The input name must be the file that is
# actually loaded by read_csv, otherwise the separator could be inferred from the wrong extension.
in_sep = resolve_sep("GRCh38-cCREs.trainready.minimal.tsv", "auto", mode="in")
out_sep = resolve_sep("GRCh38-cCREs_out.tsv", "auto", mode="out")

In [7]:
# Load input and verify that the columns the rest of the notebook relies on exist
df = pd.read_csv("GRCh38-cCREs.trainready.minimal.tsv", sep=in_sep)
if not {"sequence", "enhancer_label"}.issubset(df.columns):
    raise SystemExit("Input must include 'sequence' and 'enhancer_label'.")
# Per-tissue activity labels are recognised by their "active_" prefix.
tissue_cols = [col for col in df.columns if col.startswith("active_")]
if not tissue_cols:
    raise SystemExit("No tissue columns found (expected columns starting with 'active_').")

In [8]:
# Basic hygiene
# Missing sequences are replaced by "" *before* the str cast: astype(str) alone can
# render NaN as the text "nan", which sanitize() would turn into the bogus sequence "NAN".
df["sequence"] = df["sequence"].fillna("").astype(str).map(sanitize)
N = len(df)  # number of rows; used to size every per-row array below


In [9]:
# Compute canonical & group for leakage-safe downstream splits:
# a sequence and its reverse complement share one canonical form, hence one group id.
canon_seqs = df["sequence"].map(canonical)
df["canon"] = canon_seqs
df["group"] = canon_seqs.map(md5)


In [10]:
# K-mer settings
k = 3                      # k-mer (token) length
c2d = char_to_digit_map()  # nucleotide -> base-4 digit lookup
UNK_ID = pow(4, k)         # first id past the 4**k real k-mers; marks N-containing tokens and padding

In [11]:
 # Determine T_i and T_max
Ls = df["sequence"].str.len().to_numpy()  # sequence length per row
Ti = np.clip(Ls - k + 1, 0, None)         # real tokens per row (0 if L<k)
T_max = max(int(Ti.max()), 1)             # ensure at least 1 column
Ti, T_max

(array([215, 326, 282, ..., 247, 162, 254]), 348)

In [12]:
# Prepare arrays for k-mer IDs and masks: one row per sequence, one column per token slot
token_shape = (N, T_max)
kids = np.full(shape=token_shape, fill_value=UNK_ID, dtype=np.int32)  # every slot starts as UNK/padding
mask = np.zeros(shape=token_shape, dtype=np.int8)                     # 1 marks a real token, 0 padding

In [13]:
# Fill token IDs and masks
for i, seq in enumerate(df["sequence"]):
    t = Ti[i]
    if t > 0:
        # The first t slots of the row are real tokens; the rest stays padding
        # (mask=0, id=UNK_ID) exactly as allocated.
        mask[i, :t] = 1
        for j in range(t):
            tok = seq[j:j+k]
            # A window containing N has no base-4 code and is stored as UNK_ID.
            kids[i, j] = UNK_ID if "N" in tok else kmer_to_id(tok, k, c2d)


In [14]:
# Assemble output DataFrame: row id, labels, (optionally) the sequence, then the group id
include_sequence = True
base_cols = [
    pd.Series(np.arange(N, dtype=np.int64), name="orig_idx"),
    df["enhancer_label"].astype(np.uint8),
]
base_cols.extend(df[tissue].astype(np.uint8) for tissue in tissue_cols)
if include_sequence:
    base_cols.append(df["sequence"])
base_cols.append(df["group"])

# Column-wise concatenation; the pieces are aligned on their index.
out_df = pd.concat(base_cols, axis=1)

In [15]:
# Column names so far (everything that precedes the per-token columns)
fixed_names = ["orig_idx", "enhancer_label", *tissue_cols]
if include_sequence:
    fixed_names.append("sequence")
fixed_names.append("group")
out_df.columns = fixed_names

In [16]:
# Add k-mer ID and mask columns
kid_cols = [f"kid_{i}" for i in range(T_max)]
mask_cols = [f"mask_{i}" for i in range(T_max)]

kids_df = pd.DataFrame(kids, columns=kid_cols)
mask_df = pd.DataFrame(mask, columns=mask_cols)

# Drop token columns left over from an earlier execution first, so that
# re-running this cell replaces them instead of appending a duplicate set.
out_df = pd.concat(
    [out_df.drop(columns=kid_cols + mask_cols, errors="ignore"), kids_df, mask_df],
    axis=1,
)

In [17]:
# Optional split column (group-level stratified on enhancer_label)
add_split = False
val_frac = 0.0
test_frac = 0.0

seed = 41

if add_split:
    # One row per group: how many sequences it holds and their mean enhancer label.
    g = (
        df.groupby("group")["enhancer_label"]
        .agg(["count", "mean"])
        .reset_index()
        .rename(columns={"count": "n", "mean": "pos_frac"})
    )
    train_g, val_g, test_g = stratified_group_splits(
        g, val_frac=val_frac, test_frac=test_frac, seed=seed
    )
    is_val = df["group"].isin(val_g).to_numpy()
    is_test = df["group"].isin(test_g).to_numpy()
    # Rows default to "train"; all rows of a group receive that group's split.
    split = np.full(N, "train", dtype=object)
    split[is_val] = "val"
    split[is_test] = "test"
    # Place the column right after the fixed columns, before the per-token ones.
    out_df.insert(len(fixed_names), "split", split)

In [18]:

# Sanity check: row count of the output table and the columns of the working frame.
len(out_df), df.columns

(2348854,
 Index(['sequence', 'enhancer_label', 'active_blood', 'active_brain',
        'active_embryo', 'active_heart', 'active_kidney',
        'active_large_intestine', 'active_lung', 'active_muscle', 'active_skin',
        'canon', 'group'],
       dtype='object'))

In [22]:
# Save table
# Relative path: it is the same file name the output separator was inferred from,
# and it keeps the notebook free of user-specific absolute paths.
output = "GRCh38-cCREs_out.tsv"
out_path = output
# Compress only when the target name asks for it.
compress = "gzip" if out_path.endswith(".gz") else None
out_df.to_csv(out_path, index=False, compression=compress, sep=out_sep)

In [20]:
# Summary to stdout
total_cols = out_df.shape[1]
# Show a tab as the word TAB; any other separator is shown via repr().
in_sep_label = "TAB" if in_sep == "\t" else repr(in_sep)
out_sep_label = "TAB" if out_sep == "\t" else repr(out_sep)
summary_lines = [
    f"Rows: {N}",
    f"k: {k}, UNK_ID: {UNK_ID}",
    f"Input sep: {in_sep_label}, Output sep: {out_sep_label}",
    f"Max sequence length: {int(Ls.max())}, T_max (tokens): {T_max}",
    f"k-mer ID columns: {len(kid_cols)}, mask columns: {len(mask_cols)}",
    f"TOTAL columns in table: {total_cols}",
    f"Wrote: {out_path}",
]
print("\n".join(summary_lines))

Rows: 2348854
k: 3, UNK_ID: 64
Input sep: TAB, Output sep: TAB
Max sequence length: 350, T_max (tokens): 348
k-mer ID columns: 348, mask columns: 348
TOTAL columns in table: 709
Wrote: GRCh38-cCREs_out.tsv


In [None]:
# Preview the first rows of the assembled output table.
out_df.head()

Unnamed: 0,orig_idx,enhancer_label,active_blood,active_brain,active_embryo,active_heart,active_kidney,active_large_intestine,active_lung,active_muscle,...,mask_338,mask_339,mask_340,mask_341,mask_342,mask_343,mask_344,mask_345,mask_346,mask_347
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
