In [1]:
import pandas as pd
from tqdm import tqdm

In [5]:
# Dataset identifier; interpolated into the input and output paths below.
DATA_NAME = "PBMC10k"
# Text file holding the core gene names, read one name per line.
CORE_GENES_FILE = f"../../Datasets/{DATA_NAME}/output/celltypist/celltypist_core_genes.txt"
# GTF annotation file the gene coordinates and strands are taken from.
GTF_FILE = "../../Datasets/gencode.v49.basic.annotation.gtf" 
# Number of bases kept on each side of the TSS when building the windows.
WINDOW = 2000
# Destination BED file; the window size is embedded in the file name.
OUT_BED = f"../../Datasets/{DATA_NAME}/output/celltypist/core_genes_regions_{WINDOW}bp.bed"

In [3]:
# Read the core gene list: one name per line, surrounding whitespace removed,
# blank lines skipped, duplicates collapsed by the set.
with open(CORE_GENES_FILE) as handle:
    stripped_lines = (raw.strip() for raw in handle)
    core_genes = {name for name in stripped_lines if name}

print(f"Генов в ядре: {len(core_genes)}")

Генов в ядре: 2416


In [4]:
# Names for the nine tab-separated GTF columns, in file order.
gtf_cols = "chr source feature start end score strand frame attributes".split()

# Load the full annotation table. comment="#" makes pandas ignore everything
# from a "#" to the end of a line, which drops the GTF header lines.
# NOTE(review): this would also truncate any data line whose attributes
# contain "#" — confirm the annotation file has none.
gtf = pd.read_csv(GTF_FILE, sep="\t", comment="#", header=None, names=gtf_cols)

# Keep gene-level records only; .copy() so columns can be added later without
# touching the full table.
gtf_genes = gtf.loc[gtf["feature"].eq("gene")].copy()

def parse_attr(attrs, key):
    """Return the value stored under ``key`` in a GTF attributes string.

    The string is split on ";" and each field is stripped of surrounding
    whitespace. The first field that begins with ``key`` followed by a single
    space wins; everything after the field's first space is returned with
    surrounding whitespace and double quotes removed.

    Parameters
    ----------
    attrs : str
        Attributes text, e.g. ``'gene_id "X"; gene_name "Y";'``.
    key : str
        Attribute name to look up.

    Returns
    -------
    str or None
        The attribute value, or ``None`` when no field matches.
    """
    prefix = key + " "
    for field in map(str.strip, attrs.split(";")):
        # Empty fields (e.g. after the trailing ";") can never match the
        # non-empty prefix, so they fall through without a special case.
        if field.startswith(prefix):
            return field.partition(" ")[2].strip().strip('"')
    return None

# Pull the gene_name value out of the attributes text of every gene record
# (None where the attribute is absent).
gtf_genes["gene_name"] = gtf_genes["attributes"].map(
    lambda attr_text: parse_attr(attr_text, "gene_name")
)

In [6]:
# Restrict the gene records to those whose gene_name is in the core set.
# Several records can share one gene_name, so the row count may exceed the
# number of unique names reported below.
in_core = gtf_genes["gene_name"].isin(core_genes)
gtf_core = gtf_genes.loc[in_core].copy()
print(f"Найдено генов из ядра в GTF: {gtf_core['gene_name'].nunique()}")

Найдено генов из ядра в GTF: 2137


In [7]:
def tss_row(row):
    """Return the transcription start site coordinate for one gene record.

    ``row`` must support lookup by "strand", "start" and "end". For strand
    "+" the TSS is ``start``; for any other strand value it is ``end``.
    """
    return row["start"] if row["strand"] == "+" else row["end"]

# TSS per gene record: "start" on the "+" strand, "end" otherwise
# (the same rule as tss_row, evaluated column-wise instead of row by row).
on_plus_strand = gtf_core["strand"] == "+"
gtf_core["tss"] = gtf_core["start"].where(on_plus_strand, gtf_core["end"])

# Next step: build windows of ±WINDOW bp around each TSS.

In [8]:
# Build one BED interval per gene record: the TSS base plus WINDOW bases on
# each side, i.e. 2 * WINDOW + 1 bp unless clipped at the chromosome start.
# GTF coordinates are 1-based inclusive and BED is 0-based half-open, so the
# 1-based range [tss - WINDOW, tss + WINDOW] becomes
# [tss - 1 - WINDOW, tss + WINDOW) in BED.
tss_pos = gtf_core["tss"].astype(int)

# Column-wise arithmetic replaces the per-row iterrows() loop; it produces the
# same table and still yields a well-formed (empty) frame when gtf_core has
# no rows, where the row-list version failed in sort_values.
bed = (
    pd.DataFrame(
        {
            "chr": gtf_core["chr"],
            "start": (tss_pos - 1 - WINDOW).clip(lower=0),  # never before base 0
            "end": tss_pos + WINDOW,
            "name": gtf_core["gene_name"],
            "strand": gtf_core["strand"],
        }
    )
    .reset_index(drop=True)
    .sort_values(["chr", "start", "end"])
)

# NOTE(review): the file has five columns with strand in column 5. Standard
# BED6 expects a score in column 5 and strand in column 6, so strand-aware
# tools will not see the strand here — confirm what downstream readers expect
# before changing the layout.
bed.to_csv(OUT_BED, sep="\t", header=False, index=False)
print(f"Промоторные окна TSS±{WINDOW} сохранены в {OUT_BED}")

2329it [00:00, 21575.57it/s]

Промоторные окна TSS±2000 сохранены в ../../Datasets/PBMC10k/output/celltypist/core_genes_regions_2000bp.bed



