# Extract general embeddings from ChromBERT

In [1]:
# ChromBERT-tools API Examples
# 
# This notebook demonstrates how to use the ChromBERT-tools Python API
# to extract various types of embeddings from genomic data.

from chrombert_tools import embed_cistrome, embed_gene, embed_region, embed_regulator

In [2]:
import os

# Working directory that the relative paths in this notebook ("../data/...",
# "./output_*") are resolved against. The default is the original author's
# checkout; set the CHROMBERT_TOOLS_EXAMPLES_DIR environment variable to point
# at your own `examples/api` directory instead of editing this cell.
EXAMPLES_API_DIR = os.environ.get(
    "CHROMBERT_TOOLS_EXAMPLES_DIR",
    "/mnt/Storage2/home/chenqianqian/projects/chrombert/chrombert_tools/ChromBERT-tools/examples/api",
)

# Only change directory when the target actually exists, so running the
# notebook on another machine does not abort with FileNotFoundError here.
if os.path.isdir(EXAMPLES_API_DIR):
    os.chdir(EXAMPLES_API_DIR)
else:
    print(f"Directory not found, staying in {os.getcwd()}: {EXAMPLES_API_DIR}")

### embed cistrome

In [3]:
# embed_cistrome hands back three objects:
#   1. mean_cistrome_emb_dict - {cistrome_name: mean embedding}, each of shape (768,)
#   2. cistrome_emb_dict      - {cistrome_name: per-region embeddings}, each of shape (N_regions, 768)
#   3. regions                - DataFrame with columns [chrom, start, end, build_region_index];
#                               build_region_index refers to ChromBERT's reference regions

mean_cistrome_emb_dict, cistrome_emb_dict, regions = embed_cistrome(
    # inputs
    cistrome="ENCSR440VKE_2;GSM1208591;ATAC-seq:HEK293T;BCL11A:GM12878",  # your cistromes, separated by ";"
    region="../data/CTCF_ENCFF664UGR_sample100.bed",  # your regions
    # reference selection
    resolution="1kb",  # one of "1kb", "2kb", "4kb", "200bp"
    genome="hg38",  # one of "hg38", "mm10"
    # where the result files are written
    odir="./output_emb_cistrome",
)

Region summary - total: 100, overlapping with ChromBERT: 100 (one region may overlap multiple ChromBERT regions), non-overlapping: 0
Note: All cistrome names were converted to lowercase for matching.
Cistromes count summary - requested: 4, matched in ChromBERT meta: 4, not found: 0, not found cistromes: []
ChromBERT cistromes metas: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/config/hg38_6k_meta.tsv
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
use organisim hg38; max sequence length is 6391


100%|██████████| 2/2 [00:02<00:00,  1.31s/it]

Finished!
Saved mean cistrome embeddings to pickle file: ./output_emb_cistrome/mean_cistrome_emb.pkl
Saved cistrome embeddings to hdf5 file: ./output_emb_cistrome/cistrome_emb_on_region.hdf5





In [4]:
# Every cistrome maps to a single 768-dim vector (the average over all regions).
for cistrome_name, mean_emb in mean_cistrome_emb_dict.items():
    print(cistrome_name, mean_emb.shape)

encsr440vke_2 (768,)
gsm1208591 (768,)
atac-seq:hek293t (768,)
bcl11a:gm12878 (768,)


In [5]:
# Peek at the first 10 components of one cistrome's mean embedding
# (keys are the lowercased cistrome names).
mean_cistrome_emb_dict["encsr440vke_2"][:10]

array([-1.07020508e+00, -2.44023438e+00, -9.73486328e-02,  9.52148437e-04,
       -2.79785156e-03,  2.34894531e+00,  1.77441406e+00,  5.35874023e-01,
       -3.73691406e-01,  1.63013184e+00])

In [6]:
# Per-region embeddings: every cistrome maps to a matrix of shape (N_regions, 768).
for cistrome_name, region_embs in cistrome_emb_dict.items():
    print(cistrome_name, region_embs.shape)


encsr440vke_2 (100, 768)
gsm1208591 (100, 768)
atac-seq:hek293t (100, 768)
bcl11a:gm12878 (100, 768)


In [7]:
# First ten rows of the region table: [chrom, start, end, build_region_index],
# where build_region_index maps to ChromBERT's reference regions.
regions.head(10)

Unnamed: 0,chrom,start,end,build_region_index
0,1,37989946,37990368,32658
1,11,2400199,2400617,289179
2,12,6778809,6779319,391108
3,12,52980788,52981316,424926
4,12,53676021,53676448,425578
5,14,21092401,21092968,560876
6,14,23057979,23058458,562483
7,14,23120727,23121190,562542
8,14,23379895,23380314,562781
9,14,23588973,23589439,562958


In [8]:
# Example with CSV input file.
# The call is identical to the BED example except for the `region` file and the
# output directory. The results are bound to "_csv"-suffixed names so that this
# cell does not overwrite the BED-based `cistrome_emb_dict` / `regions` that
# earlier cells display (re-running those cells would otherwise silently show
# the CSV results instead).
mean_cistrome_emb_dict_csv, cistrome_emb_dict_csv, regions_csv = embed_cistrome(
    region="../data/CTCF_ENCFF664UGR_sample100.csv", # your region (CSV)
    cistrome="ENCSR440VKE_2;GSM1208591;ATAC-seq:HEK293T;BCL11A:GM12878", # your cistrome
    odir="./output_emb_cistrome2", # output directory
    genome="hg38",        # Options: "hg38", "mm10"
    resolution="1kb",     # Options: "1kb", "2kb", "4kb", "200bp"
)

Region summary - total: 100, overlapping with ChromBERT: 100 (one region may overlap multiple ChromBERT regions), non-overlapping: 0
Note: All cistrome names were converted to lowercase for matching.
Cistromes count summary - requested: 4, matched in ChromBERT meta: 4, not found: 0, not found cistromes: []
ChromBERT cistromes metas: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/config/hg38_6k_meta.tsv
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
use organisim hg38; max sequence length is 6391


100%|██████████| 2/2 [00:02<00:00,  1.04s/it]

Finished!
Saved mean cistrome embeddings to pickle file: ./output_emb_cistrome2/mean_cistrome_emb.pkl
Saved cistrome embeddings to hdf5 file: ./output_emb_cistrome2/cistrome_emb_on_region.hdf5





### embed gene

In [9]:
# embed_gene returns gene_emb_dict: {gene_name: embedding vector},
# where every embedding has 768 dimensions.
gene_emb_dict = embed_gene(
    # your genes, separated by ";"
    gene="ENSG00000170921;TANC2;ENSG00000200997;DPYD;SNORA70;tp53;brd4",
    # reference selection
    resolution="1kb",  # one of "1kb", "2kb", "4kb", "200bp"
    genome="hg38",  # one of "hg38", "mm10"
    # where the result files are written
    odir="./output_emb_genes",
)

Finished!
Note: All gene names were converted to lowercase for matching.
Gene count summary - requested: 7, matched: 7, not found: 0
Gene meta file: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/anno/hg38_1kb_gene_meta.tsv
Region embedding source: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/anno/hg38_1kb_region_emb.npy
Gene embeddings saved to: ./output_emb_genes/embs_dict.pkl
Matched gene meta saved to: ./output_emb_genes/overlap_genes_meta.tsv


In [10]:
# One embedding vector per matched gene.
for gene_name, gene_emb in gene_emb_dict.items():
    print(gene_name, gene_emb.shape)

ensg00000170921 (768,)
tanc2 (768,)
ensg00000200997 (768,)
dpyd (768,)
snora70 (768,)
tp53 (768,)
brd4 (768,)


In [11]:
# Same genes at a different resolution (2kb).
# When no pre-computed region embeddings are available for the requested
# resolution, the model computes them automatically.
gene_emb_dict_2kb = embed_gene(
    # your genes, separated by ";"
    gene="ENSG00000170921;TANC2;ENSG00000200997;DPYD;SNORA70;tp53;brd4",
    # reference selection
    resolution="2kb",  # one of "1kb", "2kb", "4kb", "200bp"
    genome="hg38",  # one of "hg38", "mm10"
    # where the result files are written
    odir="./output_emb_genes_2kb",
)

ChromBERT region embedding file not found: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/anno/hg38_2kb_region_emb.npy.
Fallback: load ChromBERT model to compute region embeddings for requested genes.
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
use organisim hg38; max sequence length is 6391


100%|██████████| 7/7 [00:00<00:00,  8.52it/s]

Finished!
Note: All gene names were converted to lowercase for matching.
Gene count summary - requested: 7, matched: 7, not found: 0
Gene meta file: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/anno/hg38_2kb_gene_meta.tsv
Region embedding source: computed by ChromBERT model
Gene embeddings saved to: ./output_emb_genes_2kb/embs_dict.pkl
Matched gene meta saved to: ./output_emb_genes_2kb/overlap_genes_meta.tsv





In [12]:
# One embedding vector per matched gene at 2kb resolution.
for gene_name, gene_emb in gene_emb_dict_2kb.items():
    print(gene_name, gene_emb.shape)

ensg00000170921 (768,)
tanc2 (768,)
ensg00000200997 (768,)
dpyd (768,)
snora70 (768,)
tp53 (768,)
brd4 (768,)


### embed region

In [13]:
# embed_region hands back two objects:
#   1. emb_region - embedding matrix of shape (N_regions, 768)
#   2. region_bed - DataFrame with columns [chrom, start, end, build_region_index];
#                   build_region_index maps to ChromBERT's reference regions

emb_region, region_bed = embed_region(
    # your regions
    region="../data/CTCF_ENCFF664UGR_sample100.bed",
    # reference selection
    resolution="1kb",  # one of "1kb", "2kb", "4kb", "200bp"
    genome="hg38",  # one of "hg38", "mm10"
    # where the result files are written
    odir="./output_emb_region_1kb",
)

Region summary - total: 100, overlapping with ChromBERT: 100 (one region may overlap multiple ChromBERT regions), non-overlapping: 0
Finished!
Focus region summary - total: 100, overlapping with ChromBERT: 100, It is possible for a single region to overlap multiple ChromBERT regions,non-overlapping: 0
Overlapping focus regions BED file: ./output_emb_region_1kb/overlap_region.bed
Non-overlapping focus regions BED file: ./output_emb_region_1kb/no_overlap_region.bed
Overlapping focus region embeddings saved to: ./output_emb_region_1kb/overlap_region_emb.npy


In [14]:
# Embedding matrix returned by embed_region: (N_regions, 768).
emb_region.shape

(100, 768)

In [15]:
# Region table returned by embed_region: [chrom, start, end, build_region_index].
region_bed

Unnamed: 0,chrom,start,end,build_region_index
0,chr1,37989946,37990368,32658
1,chr11,2400199,2400617,289179
2,chr12,6778809,6779319,391108
3,chr12,52980788,52981316,424926
4,chr12,53676021,53676448,425578
...,...,...,...,...
95,chr6,53171843,53172315,1660979
96,chr6,131628105,131628616,1713078
97,chr6,158704189,158704642,1735665
98,chr9,128117589,128118035,2049996


### embed regulator

In [16]:
# embed_regulator hands back three objects:
#   1. mean_regulator_embs_dict - {regulator_name: mean embedding}, each of shape (768,)
#   2. regulator_emb_dict       - {regulator_name: per-region embeddings}, each of shape (N_regions, 768)
#   3. regions                  - DataFrame with columns [chrom, start, end, build_region_index]

mean_regulator_embs_dict, regulator_emb_dict, regions = embed_regulator(
    # inputs
    regulator="EZH2;BRD4;CTCF;FOXA3;myod1;myF5",  # your regulators, separated by ";"
    region="../data/CTCF_ENCFF664UGR_sample100.bed",  # your regions
    # reference selection
    resolution="1kb",  # one of "1kb", "2kb", "4kb", "200bp"
    genome="hg38",  # one of "hg38", "mm10"
    # where the result files are written
    odir="./output_emb_regulator_1kb",
)

Region summary - total: 100, overlapping with ChromBERT: 100 (one region may overlap multiple ChromBERT regions), non-overlapping: 0
Note: All regulator names were converted to lowercase for matching.
Regulator count summary - requested: 6, matched in ChromBERT: 5, not found: 1, not found regulator: ['foxa3']
ChromBERT regulators: /mnt/Storage/home/chenqianqian/.cache/chrombert/data/config/hg38_6k_regulators_list.txt
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
Your supervised_file does not contain the 'label' column. Please verify whether ground truth column ('label') is required. If it is not needed, you may disregard this message.
use organisim hg38; max sequence length is 6391


100%|██████████| 2/2 [00:02<00:00,  1.04s/it]

Finished!
Saved mean regulator embeddings to pickle file: ./output_emb_regulator_1kb/mean_regulator_emb.pkl
Saved regulator embeddings to hdf5 file: ./output_emb_regulator_1kb/regulator_emb_on_region.hdf5





In [17]:
# One mean embedding vector per matched regulator.
for regulator_name, mean_emb in mean_regulator_embs_dict.items():
    print(regulator_name, mean_emb.shape)

myod1 (768,)
ctcf (768,)
myf5 (768,)
brd4 (768,)
ezh2 (768,)


In [18]:
# Peek at the first 10 components of one regulator's mean embedding.
mean_regulator_embs_dict["myod1"][:10]

array([-2.00714844, -0.91044434, -0.62341553, -2.63640625, -0.31753555,
       -0.95142578,  0.05770416, -0.39900879,  0.0566394 ,  1.28871155])

In [19]:
# Per-region embeddings for a single regulator: (N_regions, 768).
regulator_emb_dict["myod1"].shape

(100, 768)

In [20]:
# With the cells run in order, `regions` here is the table returned by the
# embed_regulator call: one row per region, columns [chrom, start, end, build_region_index].
regions.shape

(100, 4)