# Testing Embedding

In [1]:
from pathlib import Path
import sys
sys.path.append('../..')

# add this to your notebook so it automatically reloads code you changed in a Python file after importing it
%load_ext autoreload
%autoreload 2

In [2]:
from src.data.embedding import Embedder

In [3]:
# Input files handed to the Embedder, all addressed relative to `git_root`
# (two directory levels above the notebook's working directory).
git_root = Path('../..')
gtf_path = git_root / 'data/gene_positions_and_overlaps/gene_positions.csv'
fasta_path = git_root / 'data/reference/GRCh38.d1.vd1.fa'
overlap_path = git_root / 'data/gene_positions_and_overlaps/overlaps_batch1.tsv'
epianeu_path = git_root / 'out/epiAneufinder/epiAneuFinder_results.tsv'

# Fail fast and name every missing input; a bare `assert all(...)` would only
# raise an AssertionError without saying which file is absent.
missing_paths = [
    str(p)
    for p in (gtf_path, fasta_path, overlap_path, epianeu_path)
    if not p.exists()
]
assert not missing_paths, f'missing input files: {missing_paths}'

In [4]:
# Gene ids used by the small Embedder smoke test below.
test_genes = [
    'ENSG00000269113', 'ENSG00000188158',
    'ENSG00000154511', 'ENSG00000225555',
]
test_gene_set = {*test_genes}

# Cell barcodes used by the smoke test.
test_barcodes = [
    'AAACCAACATGTCAGC-1', 'TTGTTTGGTTAATGCG-1', 'CCCTGTTAGCACGTTG-1',
]
test_barcode_set = {*test_barcodes}

# Genes to embed for each barcode. 'ENSG00000188158' is part of `test_genes`
# but is not assigned to any barcode here.
test_barcode_to_genes = {
    'AAACCAACATGTCAGC-1': ['ENSG00000154511'],
    'TTGTTTGGTTAATGCG-1': ['ENSG00000269113', 'ENSG00000154511'],
    'CCCTGTTAGCACGTTG-1': [
        'ENSG00000269113', 'ENSG00000154511', 'ENSG00000225555',
    ],
}

In [32]:
# Build an Embedder over the small fixture: the barcode set, the gene set and
# the explicit barcode -> genes mapping are all passed, with verbose logging on.
# NOTE(review): the mapping keyword is spelled `barcode_to_genes` here, while
# the CNV cell further down passes `barcodes_to_genes` — confirm which
# spelling the current Embedder signature accepts.
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcode_set=test_barcode_set,
    gene_set=test_gene_set,
    barcode_to_genes=test_barcode_to_genes,
    verbose=True
)



[embed]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +3    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+------------+-----------------+-------|
|            1 |     92832737 |   92961522 | ENSG00000154511 | ...   |
|            1 |     47760528 |   47997385 | ENSG00000269113 | ...   |
|           21 |     34353099 |   34375809 | ENSG00000225555 | ...   |
+--------------+--------------+------------+-----------------+-------+
Unstranded PyRanges object has 3 rows and 7 columns from 2 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Start, End, Sequence
[embed]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +8    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+--

[embed]: Computing embeddings:   0%|            | 0/6 [00:00<?, ?it/s]

In [33]:
# Walk every (barcode, gene, embedding) triple the embedder yields and report
# the pair on one line followed by the embedding's dtype on the next.
for b, g, e in embedder:
    print(f'{b} {g}\n{e.dtype}')

[embed]: Computing embeddings:  17%|▋   | 1/6 [00:04<00:20,  4.10s/it]

[embed]: Computing embeddings: 100%|████| 6/6 [00:04<00:00,  1.41it/s]

[embed]: get_dna_embedding() for ENSG00000154511
AAACCAACATGTCAGC-1 ENSG00000154511
uint8
CCCTGTTAGCACGTTG-1 ENSG00000154511
uint8
TTGTTTGGTTAATGCG-1 ENSG00000154511
uint8
[embed]: get_dna_embedding() for ENSG00000269113
CCCTGTTAGCACGTTG-1 ENSG00000269113
uint8
TTGTTTGGTTAATGCG-1 ENSG00000269113
uint8
[embed]: get_dna_embedding() for ENSG00000225555
CCCTGTTAGCACGTTG-1 ENSG00000225555
uint8





In [31]:
# Peek at the first gene id stored in the embedder's gene table.
embedder.gene_pr.gene_id.iloc[0]

'ENSG00000154511'

Manual test case for embedding which spans two ATAC-seq peaks in cell `AATGCATGTTCACCCA-1` and gene `ENSG00000090104`.

In [21]:
# Re-create the embedder for one manually chosen (barcode, gene) pair, selected
# through one-element barcode and gene sets; the explicit barcode -> genes
# mapping is left disabled for this case.
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcode_set={'AATGCATGTTCACCCA-1'},
    gene_set={'ENSG00000090104'},
    # barcode_to_genes=test_barcode_to_genes,
    verbose=True
)


[Embedder]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +3    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+------------+-----------------+-------|
|            1 |    192575763 |  192580024 | ENSG00000090104 | ...   |
+--------------+--------------+------------+-----------------+-------+
Unstranded PyRanges object has 1 rows and 7 columns from 1 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Start, End, Sequence
[Embedder]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +6    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+------------+-----------------+-------|
|            1 |    192575763 |  192580024 | ENSG00000090104 | ...   |
+--------------+------------

[Embedder]: Computing embeddings: 100%|██████████████████████████████████████████████████| 1/1 [10:35<00:00, 635.63s/it]


In [22]:
# Pull the next (barcode, gene, embedding) triple from the embedder.
b, g, e = next(embedder)

[Embedder]: Computing embeddings: 100%|███████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.29s/it]

In [23]:
# Dimensions of the embedding array that was just produced.
e.shape

(7, 10000)

In [24]:
# Sum every row of the embedding across its columns.
# NOTE(review): the outputs in this notebook suggest rows 0-3 are the nucleotide
# channels, row 4 the ATAC track and rows 5/6 the CNV loss/gain flags —
# confirm against the Embedder implementation.
e.sum(axis=1)

array([5816, 4777, 4789, 5835, 1720,    0,    0], dtype=uint64)

In [None]:
# Manual cross-check for the embedding above: total width of the ATAC intervals
# that intersect the selected gene, computed directly from the embedder's tables.
barcode = 'AATGCATGTTCACCCA-1'
gene_id = 'ENSG00000090104'
gene_pr = embedder.gene_pr
atac_pr = embedder.atac_pr
# Index the ATAC table by the barcode name and intersect the result with the
# row(s) of the selected gene.
# NOTE(review): presumably `atac_pr[[barcode]]` selects the barcode's column
# rather than filtering rows, so intervals without signal for this barcode may
# still be counted — confirm against the layout of `atac_pr`.
atac_pr_test = atac_pr[[barcode]].intersect(gene_pr[gene_pr.gene_id == gene_id])
# Summed interval widths (End - Start) of the intersection.
sum(atac_pr_test.End - atac_pr_test.Start)

TODO: create a test case for CNV gain and a test case for CNV loss based on read data from the dataset.
Use barcode `AATGCATGTTCACCCA-1` and gene ids `ENSG00000232450` or `ENSG00000127220` for CNV gain.
For CNV loss use barcode `GTGCACGGTCACAAAT-3` and gene ids `ENSG00000230477` or `ENSG00000278902`.

In [32]:
# Embedder for the CNV checks: two barcodes, each mapped to two gene ids, with
# no separate barcode or gene set passed.
# NOTE(review): the mapping keyword is spelled `barcodes_to_genes` here, while
# the first Embedder cell of this notebook passes `barcode_to_genes` — confirm
# which spelling the current Embedder signature accepts.
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcodes_to_genes={
        'AATGCATGTTCACCCA-1': ['ENSG00000232450', 'ENSG00000127220'],
        'GTGCACGGTCACAAAT-3': ['ENSG00000230477', 'ENSG00000278902']
        },
    verbose=True
)


[Embedder]:
 +--------------+--------------+------------+-----------------+-------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +3    |
|   (category) |      (int64) |    (int64) | (object)        | ...   |
|--------------+--------------+------------+-----------------+-------|
|            1 |    113698884 |  113699631 | ENSG00000232450 | ...   |
|            2 |     75598071 |   75598559 | ENSG00000230477 | ...   |
|           18 |     73921939 |   73922631 | ENSG00000278902 | ...   |
|           19 |     17292131 |   17310236 | ENSG00000127220 | ...   |
+--------------+--------------+------------+-----------------+-------+
Unstranded PyRanges object has 4 rows and 7 columns from 4 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Start, End, Sequence
[Embedder]:
 +--------------+--------------+------------+-----------------+---------+
|   Chromosome |   Gene_Start |   Gene_End | gene_id         | +5783   |
|   (category) |      (i



In [33]:
# For every (barcode, gene, embedding) triple, show the pair and then the
# per-row totals of its embedding.
for b, g, e in embedder:
    print(f'{b} {g}')
    print(e.sum(1))



AATGCATGTTCACCCA-1 ENSG00000232450
[ 8118  7844  7779  8018     0     0 10000]




GTGCACGGTCACAAAT-3 ENSG00000230477
[ 8152  8162  8128  8094     0 10000     0]




GTGCACGGTCACAAAT-3 ENSG00000278902
[ 8134  7850  7803  8137     0 10000     0]


[Embedder]: Computing embeddings: 100%|███████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.29s/it]

AATGCATGTTCACCCA-1 ENSG00000127220
[ 2187  2636  2575  2602     0     0 10000]





Gene IDs not included in our GTF annotation:

* batch 1 training:
```
ENSG00000238042,ENSG00000286365,ENSG00000242753,ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000258473,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000231993,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000176177,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000260983,ENSG00000198840,ENSG00000256287,ENSG00000272755,ENSG00000253983,ENSG00000130487,ENSG00000265975,ENSG00000198888,ENSG00000237356,ENSG00000249335,ENSG00000286164,ENSG00000248461,ENSG00000236501,ENSG00000228113,ENSG00000259995,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000287682,ENSG00000278107,ENSG00000198727,ENSG00000100336,ENSG00000233005,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000198938,ENSG00000138964,ENSG00000228135,ENSG00000227220,ENSG00000188130,ENSG00000272354,ENSG00000258081,ENSG00000099937
```

* batch 1 validation:
```
ENSG00000198804,ENSG00000100311,ENSG00000269028,ENSG00000272736,ENSG00000236501,ENSG00000258081,ENSG00000100351,ENSG00000249988,ENSG00000231128,ENSG00000198938,ENSG00000253988,ENSG00000100234,ENSG00000198712,ENSG00000198886,ENSG00000227220,ENSG00000198899,ENSG00000285875,ENSG00000100100,ENSG00000280560,ENSG00000198786,ENSG00000138964,ENSG00000198727,ENSG00000254138,ENSG00000228113,ENSG00000237356,ENSG00000198888,ENSG00000256287,ENSG00000176177,ENSG00000228334,ENSG00000258216,ENSG00000253766,ENSG00000198763,ENSG00000198840,ENSG00000188511,ENSG00000100077,ENSG00000258473,ENSG00000257277,ENSG00000287523
```

`ENSG00000198804|ENSG00000100311|ENSG00000269028|ENSG00000272736|ENSG00000236501|ENSG00000258081|ENSG00000100351|ENSG00000249988|ENSG00000231128|ENSG00000198938|ENSG00000253988|ENSG00000100234|ENSG00000198712|ENSG00000198886|ENSG00000227220|ENSG00000198899|ENSG00000285875|ENSG00000100100|ENSG00000280560|ENSG00000198786|ENSG00000138964|ENSG00000198727|ENSG00000254138|ENSG00000228113|ENSG00000237356|ENSG00000198888|ENSG00000256287|ENSG00000176177|ENSG00000228334|ENSG00000258216|ENSG00000253766|ENSG00000198763|ENSG00000198840|ENSG00000188511|ENSG00000100077|ENSG00000258473|ENSG00000257277|ENSG00000287523`

* batch 1 test:
```
ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000198840,ENSG00000256287,ENSG00000253983,ENSG00000130487,ENSG00000198888,ENSG00000237356,ENSG00000100385,ENSG00000228113,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000198727,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000128340,ENSG00000198938,ENSG00000138964,ENSG00000258081
```
* batch 2 train:
```
ENSG00000282111,ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000239282,ENSG00000272650,ENSG00000198727,ENSG00000234688,ENSG00000198712,ENSG00000279345,ENSG00000273729,ENSG00000185386,ENSG00000100234,ENSG00000266908,ENSG00000236501,ENSG00000254028,ENSG00000100385,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000287440,ENSG00000100097,ENSG00000176177,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000133475,ENSG00000015475,ENSG00000235295,ENSG00000198804,ENSG00000235568,ENSG00000258785,ENSG00000198938,ENSG00000138964,ENSG00000283504,ENSG00000269028,ENSG00000100351,ENSG00000280560,ENSG00000184058,ENSG00000267537,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000273319,ENSG00000239498,ENSG00000253983,ENSG00000100368,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000287682,ENSG00000249269,ENSG00000225676,ENSG00000212907,ENSG00000100350,ENSG00000231010,ENSG00000198888
```
* batch 2 val:
```
ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000198727,ENSG00000198712,ENSG00000185386,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000287440,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000015475,ENSG00000235271,ENSG00000198804,ENSG00000235568,ENSG00000198938,ENSG00000138964,ENSG00000269028,ENSG00000280560,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000239498,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000287682,ENSG00000212907,ENSG00000198888
```
* batch 2 test:
```
ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000249335,ENSG00000231846,ENSG00000198727,ENSG00000198712,ENSG00000273729,ENSG00000185386,ENSG00000100234,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000100276,ENSG00000176177,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000015475,ENSG00000198804,ENSG00000235568,ENSG00000198938,ENSG00000138964,ENSG00000269028,ENSG00000100351,ENSG00000280560,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000239498,ENSG00000253983,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000212907,ENSG00000100350,ENSG00000198888
```

In [34]:
import pyranges as pr

In [37]:
# Batch 1 training split: gene ids reported as missing from the project's gene annotation, parsed from a comma-separated string into a list.
b1_train_gid_missing = 'ENSG00000238042,ENSG00000286365,ENSG00000242753,ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000258473,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000231993,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000176177,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000260983,ENSG00000198840,ENSG00000256287,ENSG00000272755,ENSG00000253983,ENSG00000130487,ENSG00000265975,ENSG00000198888,ENSG00000237356,ENSG00000249335,ENSG00000286164,ENSG00000248461,ENSG00000236501,ENSG00000228113,ENSG00000259995,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000287682,ENSG00000278107,ENSG00000198727,ENSG00000100336,ENSG00000233005,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000198938,ENSG00000138964,ENSG00000228135,ENSG00000227220,ENSG00000188130,ENSG00000272354,ENSG00000258081,ENSG00000099937'.split(',')

In [39]:
# Batch 1 validation split: gene ids reported as missing from the project's gene annotation, parsed from a comma-separated string into a list.
b1_val_gid_missing = 'ENSG00000198804,ENSG00000100311,ENSG00000269028,ENSG00000272736,ENSG00000236501,ENSG00000258081,ENSG00000100351,ENSG00000249988,ENSG00000231128,ENSG00000198938,ENSG00000253988,ENSG00000100234,ENSG00000198712,ENSG00000198886,ENSG00000227220,ENSG00000198899,ENSG00000285875,ENSG00000100100,ENSG00000280560,ENSG00000198786,ENSG00000138964,ENSG00000198727,ENSG00000254138,ENSG00000228113,ENSG00000237356,ENSG00000198888,ENSG00000256287,ENSG00000176177,ENSG00000228334,ENSG00000258216,ENSG00000253766,ENSG00000198763,ENSG00000198840,ENSG00000188511,ENSG00000100077,ENSG00000258473,ENSG00000257277,ENSG00000287523'.split(',')

In [40]:
# Batch 1 test split: gene ids reported as missing from the project's gene annotation, parsed from a comma-separated string into a list.
b1_test_gid_missing = 'ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000198840,ENSG00000256287,ENSG00000253983,ENSG00000130487,ENSG00000198888,ENSG00000237356,ENSG00000100385,ENSG00000228113,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000198727,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000128340,ENSG00000198938,ENSG00000138964,ENSG00000258081'.split(',')

In [38]:
# Load the Ensembl GRCh38 release-113 GTF with pyranges.
# NOTE(review): this rebinds `gene_pr`, which the manual ATAC cross-check cell
# earlier in the notebook binds to `embedder.gene_pr` — confirm the name reuse is intended.
gene_pr = pr.read_gtf('../../data/reference/Homo_sapiens.GRCh38.113.gtf.gz')

In [43]:
# Total number of listed ids across the three batch-1 splits; ids that occur
# in more than one split are counted once per split.
len(b1_train_gid_missing) + len(b1_val_gid_missing) + len(b1_test_gid_missing)

145

In [42]:
# Collect every batch-1 id (train, then validation, then test) that has no row
# in the GTF held in `gene_pr`.
# The GTF's gene ids are gathered into a set once, so each id costs a hash
# lookup instead of a fresh boolean filter over the whole annotation.
gtf_gene_ids = set(gene_pr.gene_id)
# Ids shared between splits are kept once per occurrence, exactly as the
# per-split loop did; use `set(b1_gid_notin_gtf)` for the distinct ids.
b1_gid_notin_gtf = [
    gid
    for gid_l in (b1_train_gid_missing, b1_val_gid_missing, b1_test_gid_missing)
    for gid in gid_l
    if gid not in gtf_gene_ids
]

In [44]:
# Number of batch-1 id occurrences that were not found in the GTF.
len(b1_gid_notin_gtf)

75