# Testing Embedding

In [1]:
from pathlib import Path
import sys
# Put the directory two levels above the working directory on the import path
# (presumably the repository root, so that the local `src` package resolves).
sys.path.append('../..')

# Add this to your notebook so that code you changed in a Python file is
# reloaded automatically after it has been imported.
%load_ext autoreload
%autoreload 2

In [2]:
from src.data.embedding import embed, Embedder

In [3]:
# Locations of the input files, all resolved relative to the directory two
# levels above the notebook's working directory.
git_root = Path('../..')
gtf_path = git_root / 'data/gene_positions_and_overlaps/gene_positions.csv'
fasta_path = git_root / 'data/reference/GRCh38.d1.vd1.fa'
overlap_path = git_root / 'data/gene_positions_and_overlaps/overlaps_batch1.tsv'
epianeu_path = git_root / 'out/epiAneufinder/epiAneuFinder_results.tsv'

# Fail fast if an input is absent, and say which one: a bare `assert all(...)`
# only reports that *something* is missing.
missing_paths = [
    str(path)
    for path in (gtf_path, fasta_path, overlap_path, epianeu_path)
    if not path.exists()
]
assert not missing_paths, 'Missing input files: ' + ', '.join(missing_paths)

In [15]:
# Small hand-picked fixture used to exercise the embedder.
# Note: 'ENSG00000188158' is part of the gene list but is not assigned to any
# barcode in the mapping below.
test_genes = [
    'ENSG00000269113',
    'ENSG00000188158',
    'ENSG00000154511',
    'ENSG00000225555',
]
test_gene_set = {*test_genes}

# Genes to embed for each barcode; the key order defines the barcode order.
test_barcode_to_genes = {
    'AAACCAACATGTCAGC-1': ['ENSG00000154511'],
    'TTGTTTGGTTAATGCG-1': ['ENSG00000269113', 'ENSG00000154511'],
    'CCCTGTTAGCACGTTG-1': [
        'ENSG00000269113',
        'ENSG00000154511',
        'ENSG00000225555',
    ],
}

# The barcode list is exactly the mapping's keys, in insertion order.
test_barcodes = list(test_barcode_to_genes)
test_barcode_set = {*test_barcodes}

In [32]:
# Build the embedder on the small test fixture defined above.
embedder = Embedder(
    # input files
    gtf_path=gtf_path,
    fasta_path=fasta_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    # restrict the run to the test genes / barcodes
    gene_set=test_gene_set,
    barcode_set=test_barcode_set,
    barcode_to_genes=test_barcode_to_genes,
    # verbose mode switched on
    verbose=True,
)



[embed]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +3    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+------------+-----------------+-------|
|            1 |     92832737 |   92961522 | ENSG00000154511 | ...   |
|            1 |     47760528 |   47997385 | ENSG00000269113 | ...   |
|           21 |     34353099 |   34375809 | ENSG00000225555 | ...   |
+--------------+--------------+------------+-----------------+-------+
Unstranded PyRanges object has 3 rows and 7 columns from 2 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Start, End, Sequence
[embed]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +8    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+--

[embed]: Computing embeddings:   0%|            | 0/6 [00:00<?, ?it/s]

In [33]:
# Walk over every item the embedder yields (unpacked as a triple) and print
# the first two elements followed by the dtype of the third.
for barcode, gene_id, embedding in embedder:
    print(barcode, gene_id)
    print(embedding.dtype)

[embed]: Computing embeddings:  17%|▋   | 1/6 [00:04<00:20,  4.10s/it]

[embed]: Computing embeddings: 100%|████| 6/6 [00:04<00:00,  1.41it/s]

[embed]: get_dna_embedding() for ENSG00000154511
AAACCAACATGTCAGC-1 ENSG00000154511
uint8
CCCTGTTAGCACGTTG-1 ENSG00000154511
uint8
TTGTTTGGTTAATGCG-1 ENSG00000154511
uint8
[embed]: get_dna_embedding() for ENSG00000269113
CCCTGTTAGCACGTTG-1 ENSG00000269113
uint8
TTGTTTGGTTAATGCG-1 ENSG00000269113
uint8
[embed]: get_dna_embedding() for ENSG00000225555
CCCTGTTAGCACGTTG-1 ENSG00000225555
uint8





In [31]:
# Show the first entry of the `gene_id` column in `embedder.gene_pr`.
# NOTE(review): this cell's execution count (31) is lower than that of the
# cell that builds `embedder` (32), so it was last run against an earlier
# kernel state — re-run top to bottom to confirm the displayed value.
embedder.gene_pr.gene_id.iloc[0]

'ENSG00000154511'