In [29]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.io import mmwrite
import os
import re

# Input and output locations used by the cells below.
TAGCOUNT = "TagCount.txt"  # tab-separated count table: peaks in rows, cell barcodes in columns
STAT     = "stat.txt"      # tab-separated per-cell table; its "Cell" and "CellType" columns are read
OUTDIR   = "pbmc_10x"      # directory receiving matrix.mtx, barcodes.txt, peaks.txt and labels.tsv

In [30]:
# Load the peak-by-cell count table.  The first column holds the peak IDs and
# is made the row index explicitly (index_col=0) rather than relying on pandas
# inferring an index from the header layout: the following cells read peak IDs
# from df.index and barcodes from df.columns, and a header with a leading tab
# would otherwise leave the peak IDs in an ordinary "Unnamed: 0" column.
df = pd.read_csv(
    TAGCOUNT,
    sep="\t",
    index_col=0,
    compression="infer",
    low_memory=False,
)

In [31]:
# Preview the first rows of the loaded table (peaks as row labels, barcodes as columns).
df.head()

Unnamed: 0,AAACAGCCAAGGAATC,AAACAGCCAATCCCTT,AAACAGCCAATGCGCT,AAACAGCCAGTAGGTG,AAACAGCCAGTTTACG,AAACAGCCATCCAGGT,AAACATGCAAGGTCCT,AAACATGCACCGGCTA,AAACATGCACTTGTTC,AAACATGCAGCAAGTG,...,TTTGTGTTCGATAACC,TTTGTGTTCGGGACCT,TTTGTGTTCGTCAAGT,TTTGTGTTCTCCATAT,TTTGTTGGTCAGGAAG,TTTGTTGGTGACATGC,TTTGTTGGTGTTAAAC,TTTGTTGGTTAGGATT,TTTGTTGGTTGGTTAG,TTTGTTGGTTTGCAGA
chr1_10109_10357,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_180730_181630,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_191491_191736,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_267816_268196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr1_586028_586373,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Column labels are the cell barcodes, row labels are the peak IDs.
# Both are kept as plain Python lists of str, in table order.
barcodes = [str(column_label) for column_label in df.columns]
peaks = [str(row_label) for row_label in df.index]

In [33]:
# Convert the count table to a sparse CSR matrix of floats.  The orientation
# follows the table: rows are peaks, columns are cells.
# NOTE(review): to_numpy(dtype=float) produces a full dense array before it is
# sparsified, so peak memory of this cell is dominated by that dense array.
mat = sparse.csr_matrix(df.to_numpy(dtype=float, copy=False))

In [34]:
# Make sure the output directory exists, then write the count matrix in
# Matrix Market format and the barcodes as one-per-line text.
os.makedirs(OUTDIR, exist_ok=True)

matrix_path = os.path.join(OUTDIR, "matrix.mtx")
mmwrite(matrix_path, mat)

barcodes_path = os.path.join(OUTDIR, "barcodes.txt")
with open(barcodes_path, "w") as f:
    # print() supplies the trailing newline after the joined barcodes.
    print("\n".join(barcodes), file=f)

In [35]:
def peak_to_bed(p):
    """Convert a peak identifier into a tab-separated BED line.

    Accepted forms are ``chrom_start_end`` (e.g. ``chr1_10109_10357``) and
    ``chrom:start-end`` (e.g. ``chr1:101-357``); ``:``/``_`` are accepted
    before the start coordinate and ``-``/``_`` before the end coordinate.
    The chromosome/contig name may itself contain underscores
    (``chrUn_KI270742v1_100_200``): the two trailing numeric fields are taken
    as the coordinates and everything before them as the name.

    Parameters
    ----------
    p : str
        Peak identifier.

    Returns
    -------
    str or None
        ``"chrom\\tstart\\tend"`` with a ``chr`` prefix added to the name when
        it is missing, or ``None`` when ``p`` does not end in two numeric
        coordinates and therefore cannot be parsed.
    """
    # Anchor on the two trailing integer fields; the (greedy) first group
    # then captures the whole contig name, underscores included.
    m = re.fullmatch(r'(.+)[:_]([0-9]+)[-_]([0-9]+)', p)
    if m is None:
        return None
    chrom, start, end = m.groups()
    # Add 'chr' if the name does not already carry it.
    if not chrom.startswith("chr"):
        chrom = "chr" + chrom
    return f"{chrom}\t{start}\t{end}"

# Convert every peak ID to a BED line, one output line per matrix row.
bed_lines = [peak_to_bed(p) for p in peaks]

# peaks.txt has to stay row-aligned with matrix.mtx.  Dropping peaks that
# fail to parse would silently shift every following row, so treat them as
# an error and report a few examples instead.
bad_peaks = [p for p, line in zip(peaks, bed_lines) if line is None]
if bad_peaks:
    raise ValueError(
        f"{len(bad_peaks)} peak ID(s) could not be converted to BED, "
        f"e.g. {bad_peaks[:5]}"
    )

with open(os.path.join(OUTDIR, "peaks.txt"), "w") as f:
    f.write("\n".join(bed_lines) + "\n")

In [36]:
# Build one cell-type label per barcode, in matrix column order; barcodes that
# are absent from the stat table are labelled "unknown".
lab = (
    pd.read_csv(STAT, sep="\t", usecols=["Cell", "CellType"])
    .set_index("Cell")
    .reindex(barcodes)["CellType"]
    .fillna("unknown")
)

# Two tab-separated columns (barcode, label), no header row.
labels_path = os.path.join(OUTDIR, "labels.tsv")
lab.to_csv(labels_path, sep="\t", header=False)

# Short summary of what was exported.
print(f"Done. peaks={mat.shape[0]}, cells={mat.shape[1]}, nnz={mat.nnz}")
print("Example barcode:", barcodes[0])
print("Example peak:", peaks[0])

Done. peaks=106935, cells=10032, nnz=72208619
Example barcode: AAACAGCCAAGGAATC
Example peak: chr1_10109_10357


In [37]:
# Inspect the last rows of the table as a quick check of the end of the file.
df.tail()

Unnamed: 0,AAACAGCCAAGGAATC,AAACAGCCAATCCCTT,AAACAGCCAATGCGCT,AAACAGCCAGTAGGTG,AAACAGCCAGTTTACG,AAACAGCCATCCAGGT,AAACATGCAAGGTCCT,AAACATGCACCGGCTA,AAACATGCACTTGTTC,AAACATGCAGCAAGTG,...,TTTGTGTTCGATAACC,TTTGTGTTCGGGACCT,TTTGTGTTCGTCAAGT,TTTGTGTTCTCCATAT,TTTGTTGGTCAGGAAG,TTTGTTGGTGACATGC,TTTGTTGGTGTTAAAC,TTTGTTGGTTAGGATT,TTTGTTGGTTGGTTAG,TTTGTTGGTTTGCAGA
chrX_155841301_155841724,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
chrX_155880572_155882091,0,0,0,0,0,0,0,0,2,2,...,0,0,2,0,0,0,0,0,0,0
chrX_155966975_155967181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrX_155997360_155997882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chrX_156030027_156030149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
