# Testing Embedding

In [1]:
from pathlib import Path
import sys
sys.path.append('../..')

# add this to you notebook so it automatically reloads code you changed in a python file after importing this code
%load_ext autoreload
%autoreload 2

In [2]:
from src.data.embedding import Embedder

In [3]:
git_root=Path('../..')
gtf_path=git_root / 'data/gene_positions_and_overlaps/gene_positions.csv'
fasta_path=git_root / 'data/reference/GRCh38.d1.vd1.fa'
overlap_path=git_root / 'data/gene_positions_and_overlaps/overlaps_batch1.tsv'
epianeu_path=git_root / 'out/epiAneufinder/epiAneuFinder_results.tsv'
assert all(map(lambda p:p.exists(),
            [gtf_path, fasta_path, overlap_path, epianeu_path]
))

In [4]:
test_genes = [
    'ENSG00000269113',
    'ENSG00000188158',
    'ENSG00000154511',
    'ENSG00000225555'
]
test_gene_set = set(test_genes)
test_barcodes = [
    'AAACCAACATGTCAGC-1',
    'TTGTTTGGTTAATGCG-1',
    'CCCTGTTAGCACGTTG-1'
]
test_barcode_set = set(test_barcodes)
test_barcode_to_genes = {
		'AAACCAACATGTCAGC-1': ['ENSG00000154511'],
		'TTGTTTGGTTAATGCG-1': ['ENSG00000269113', 'ENSG00000154511'],
        'CCCTGTTAGCACGTTG-1': ['ENSG00000269113', 'ENSG00000154511', 'ENSG00000225555']
	}

In [5]:
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcode_set=test_barcode_set,
    gene_set=test_gene_set,
    barcodes_to_genes=test_barcode_to_genes,
    verbose=True
)



[Embedder]:
 +--------------+-----------+-----------+-----------------+-------+
|   Chromosome |     Start |       End | gene_id         | +3    |
|   (category) |   (int64) |   (int64) | (object)        | ...   |
|--------------+-----------+-----------+-----------------+-------|
|            1 |  92830737 |  92840737 | ENSG00000154511 | ...   |
|            1 |  47758528 |  47768528 | ENSG00000269113 | ...   |
|           21 |  34351099 |  34361099 | ENSG00000225555 | ...   |
+--------------+-----------+-----------+-----------------+-------+
Unstranded PyRanges object has 3 rows and 7 columns from 2 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Gene_Start, Gene_End, Sequence
[Embedder]:
 +--------------+-----------+-----------+-----------------+-------+
|   Chromosome |     Start |       End | gene_id         | +8    |
|   (category) |   (int64) |   (int64) | (object)        | ...   |
|--------------+-----------+-----------+-----------------+---

[Embedder]: Computing embeddings:   0%|                                                           | 0/6 [00:00<?, ?it/s]

In [6]:
for b, g, e in embedder:
    print(b, g)
    print(e.dtype)

[Embedder]: Computing embeddings:  50%|█████████████████████████▌                         | 3/6 [00:00<00:00, 25.12it/s]

AAACCAACATGTCAGC-1 ENSG00000154511
uint8
CCCTGTTAGCACGTTG-1 ENSG00000154511
uint8
TTGTTTGGTTAATGCG-1 ENSG00000154511
uint8
CCCTGTTAGCACGTTG-1 ENSG00000269113
uint8


[Embedder]: Computing embeddings: 100%|███████████████████████████████████████████████████| 6/6 [00:00<00:00, 26.81it/s]

TTGTTTGGTTAATGCG-1 ENSG00000269113
uint8
CCCTGTTAGCACGTTG-1 ENSG00000225555
uint8





Manual test case for embedding which spans two ATAC-seq peaks in cell `AATGCATGTTCACCCA-1` and gene `ENSG00000090104`.

In [7]:
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcode_set={'AATGCATGTTCACCCA-1'},
    gene_set={'ENSG00000090104'},
    # barcode_to_genes=test_barcode_to_genes,
    verbose=True
)


[Embedder]:
 +--------------+-----------+-----------+-----------------+-------+
|   Chromosome |     Start |       End | gene_id         | +3    |
|   (category) |   (int64) |   (int64) | (object)        | ...   |
|--------------+-----------+-----------+-----------------+-------|
|            1 | 192573763 | 192583763 | ENSG00000090104 | ...   |
+--------------+-----------+-----------+-----------------+-------+
Unstranded PyRanges object has 1 rows and 7 columns from 1 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Gene_Start, Gene_End, Sequence
[Embedder]:
 +--------------+-----------+-----------+-----------------+-------+
|   Chromosome |     Start |       End | gene_id         | +6    |
|   (category) |   (int64) |   (int64) | (object)        | ...   |
|--------------+-----------+-----------+-----------------+-------|
|            1 | 192573763 | 192583763 | ENSG00000090104 | ...   |
+--------------+-----------+-----------+-----------------+----

[Embedder]: Computing embeddings:   0%|                                                           | 0/1 [00:00<?, ?it/s]

In [8]:
b, g, e = next(embedder)

In [9]:
e.shape

(7, 10000)

In [10]:
e.sum(axis=1)

array([5816, 4777, 4789, 5835, 1720,    0,    0], dtype=uint64)

In [11]:
barcode = 'AATGCATGTTCACCCA-1'
gene_id = 'ENSG00000090104'
gene_pr = embedder.gene_pr
atac_pr = embedder.atac_pr
atac_pr_test = atac_pr[[barcode]].intersect(gene_pr[gene_pr.gene_id == gene_id])
sum(atac_pr_test.End - atac_pr_test.Start)

24937

## CNV loss/gain test case
Test case for CNV gain and CNV loss based on read data from the dataset.
Use barcode `AATGCATGTTCACCCA-1` and gene ids `ENSG00000232450` or `ENSG00000127220` for CNV gain.
For CNV loss use barcode `GTGCACGGTCACAAAT-3` and gene ids `ENSG00000230477` or `ENSG00000278902`.

In [12]:
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcodes_to_genes={
        'AATGCATGTTCACCCA-1': ['ENSG00000232450', 'ENSG00000127220'],
        'GTGCACGGTCACAAAT-3': ['ENSG00000230477', 'ENSG00000278902']
        },
    verbose=True
)


[Embedder]:
 +--------------+-----------+-----------+-----------------+-------+
|   Chromosome |     Start |       End | gene_id         | +3    |
|   (category) |   (int64) |   (int64) | (object)        | ...   |
|--------------+-----------+-----------+-----------------+-------|
|            1 | 113696884 | 113706884 | ENSG00000232450 | ...   |
|            2 |  75596071 |  75606071 | ENSG00000230477 | ...   |
|           18 |  73919939 |  73929939 | ENSG00000278902 | ...   |
|           19 |  17290131 |  17300131 | ENSG00000127220 | ...   |
+--------------+-----------+-----------+-----------------+-------+
Unstranded PyRanges object has 4 rows and 7 columns from 4 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Gene_Start, Gene_End, Sequence
[Embedder]:
 +--------------+-----------+-----------+-----------------+---------+
|   Chromosome |     Start |       End | gene_id         | +5782   |
|   (category) |   (int64) |   (int64) | (object)        |

[Embedder]: Computing embeddings: 100%|███████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.94s/it]


In [13]:
for b, g, e in embedder:
    print(b, g)
    print(e.sum(axis=1))



AATGCATGTTCACCCA-1 ENSG00000232450
[ 8118  7844  7779  8018     0     0 10000]




GTGCACGGTCACAAAT-3 ENSG00000230477
[ 8152  8162  8128  8094     0 10000     0]




GTGCACGGTCACAAAT-3 ENSG00000278902
[ 8134  7850  7803  8137     0 10000     0]


[Embedder]: Computing embeddings: 100%|███████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.49it/s]

AATGCATGTTCACCCA-1 ENSG00000127220
[ 2187  2636  2575  2602     0     0 10000]





Gene IDs not included in our GTF annotation:

* batch 1 training:
```
ENSG00000238042,ENSG00000286365,ENSG00000242753,ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000258473,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000231993,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000176177,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000260983,ENSG00000198840,ENSG00000256287,ENSG00000272755,ENSG00000253983,ENSG00000130487,ENSG00000265975,ENSG00000198888,ENSG00000237356,ENSG00000249335,ENSG00000286164,ENSG00000248461,ENSG00000236501,ENSG00000228113,ENSG00000259995,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000287682,ENSG00000278107,ENSG00000198727,ENSG00000100336,ENSG00000233005,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000198938,ENSG00000138964,ENSG00000228135,ENSG00000227220,ENSG00000188130,ENSG00000272354,ENSG00000258081,ENSG00000099937
```

* batch 1 validation:
```
ENSG00000198804,ENSG00000100311,ENSG00000269028,ENSG00000272736,ENSG00000236501,ENSG00000258081,ENSG00000100351,ENSG00000249988,ENSG00000231128,ENSG00000198938,ENSG00000253988,ENSG00000100234,ENSG00000198712,ENSG00000198886,ENSG00000227220,ENSG00000198899,ENSG00000285875,ENSG00000100100,ENSG00000280560,ENSG00000198786,ENSG00000138964,ENSG00000198727,ENSG00000254138,ENSG00000228113,ENSG00000237356,ENSG00000198888,ENSG00000256287,ENSG00000176177,ENSG00000228334,ENSG00000258216,ENSG00000253766,ENSG00000198763,ENSG00000198840,ENSG00000188511,ENSG00000100077,ENSG00000258473,ENSG00000257277,ENSG00000287523
```

`ENSG00000198804|ENSG00000100311|ENSG00000269028|ENSG00000272736|ENSG00000236501|ENSG00000258081|ENSG00000100351|ENSG00000249988|ENSG00000231128|ENSG00000198938|ENSG00000253988|ENSG00000100234|ENSG00000198712|ENSG00000198886|ENSG00000227220|ENSG00000198899|ENSG00000285875|ENSG00000100100|ENSG00000280560|ENSG00000198786|ENSG00000138964|ENSG00000198727|ENSG00000254138|ENSG00000228113|ENSG00000237356|ENSG00000198888|ENSG00000256287|ENSG00000176177|ENSG00000228334|ENSG00000258216|ENSG00000253766|ENSG00000198763|ENSG00000198840|ENSG00000188511|ENSG00000100077|ENSG00000258473|ENSG00000257277|ENSG00000287523`

* batch 1 test:
```
ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000198840,ENSG00000256287,ENSG00000253983,ENSG00000130487,ENSG00000198888,ENSG00000237356,ENSG00000100385,ENSG00000228113,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000198727,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000128340,ENSG00000198938,ENSG00000138964,ENSG00000258081
```
* batch 2 train:
```
ENSG00000282111,ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000239282,ENSG00000272650,ENSG00000198727,ENSG00000234688,ENSG00000198712,ENSG00000279345,ENSG00000273729,ENSG00000185386,ENSG00000100234,ENSG00000266908,ENSG00000236501,ENSG00000254028,ENSG00000100385,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000287440,ENSG00000100097,ENSG00000176177,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000133475,ENSG00000015475,ENSG00000235295,ENSG00000198804,ENSG00000235568,ENSG00000258785,ENSG00000198938,ENSG00000138964,ENSG00000283504,ENSG00000269028,ENSG00000100351,ENSG00000280560,ENSG00000184058,ENSG00000267537,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000273319,ENSG00000239498,ENSG00000253983,ENSG00000100368,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000287682,ENSG00000249269,ENSG00000225676,ENSG00000212907,ENSG00000100350,ENSG00000231010,ENSG00000198888
```
* batch 2 val:
```
ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000198727,ENSG00000198712,ENSG00000185386,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000287440,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000015475,ENSG00000235271,ENSG00000198804,ENSG00000235568,ENSG00000198938,ENSG00000138964,ENSG00000269028,ENSG00000280560,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000239498,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000287682,ENSG00000212907,ENSG00000198888
```
* batch 2 test:
```
ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000249335,ENSG00000231846,ENSG00000198727,ENSG00000198712,ENSG00000273729,ENSG00000185386,ENSG00000100234,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000100276,ENSG00000176177,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000015475,ENSG00000198804,ENSG00000235568,ENSG00000198938,ENSG00000138964,ENSG00000269028,ENSG00000100351,ENSG00000280560,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000239498,ENSG00000253983,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000212907,ENSG00000100350,ENSG00000198888
```

## Test case for missing CNV data
In case of missing CNV data, no CNV embedding parts can be calculated, thus ignore these embeddings.
Test candicates: 
```py
('CAGGTGGAGTTGGGCC-1', 'ENSG00000203697'),
('CAGCCTAAGTTAACCA-1', 'ENSG00000203697'),
('GTAGGTGCAGCCTAAC-1', 'ENSG00000203697'),
('GCGGTTGGTAGTAAGA-1', 'ENSG00000183662'),
('TCATGCCTCATTCATC-1', 'ENSG00000183662'),
('TTGTTTGGTACAATGT-1', 'ENSG00000183662'),
('GTAGGCGAGCATGTCG-1', 'ENSG00000112232'),
('CTTAATGAGCCTCTGT-1', 'ENSG00000112232'),
('CGTTAGTAGCATTAGC-1', 'ENSG00000112232'),
```

In [14]:
embedder = Embedder(
    fasta_path=fasta_path,
    gtf_path=gtf_path,
    atac_path=overlap_path,
    cnv_path=epianeu_path,
    barcodes_to_genes={
        'CAGGTGGAGTTGGGCC-1': ['ENSG00000232450', 'ENSG00000203697'],
        'CAGCCTAAGTTAACCA-1': ['ENSG00000230477', 'ENSG00000203697'],
        'GCGGTTGGTAGTAAGA-1': ['ENSG00000183662'],
        'TCATGCCTCATTCATC-1': ['ENSG00000183662']
        },
    verbose=True
)


[Embedder]:
 +--------------+-----------+-----------+-----------------+-------+
|   Chromosome |     Start |       End | gene_id         | +3    |
|   (category) |   (int64) |   (int64) | (object)        | ...   |
|--------------+-----------+-----------+-----------------+-------|
|            1 | 113696884 | 113706884 | ENSG00000232450 | ...   |
|            1 | 223536007 | 223546007 | ENSG00000203697 | ...   |
|            2 |  75596071 |  75606071 | ENSG00000230477 | ...   |
|            3 |  68002247 |  68012247 | ENSG00000183662 | ...   |
+--------------+-----------+-----------+-----------------+-------+
Unstranded PyRanges object has 4 rows and 7 columns from 3 chromosomes.
For printing, the PyRanges was sorted on Chromosome.
3 hidden columns: Gene_Start, Gene_End, Sequence
[Embedder]:
 +--------------+-----------+-----------+-----------------+---------+
|   Chromosome |     Start |       End | gene_id         | +5782   |
|   (category) |   (int64) |   (int64) | (object)        |

[Embedder]: Computing embeddings:   0%|                                                           | 0/6 [00:00<?, ?it/s]

In [15]:
for b, g, e in embedder:
    print(b, g)
    print(e.sum(axis=1))

[Embedder]: Computing embeddings:  17%|████████▌                                          | 1/6 [00:00<00:02,  2.42it/s]

CAGGTGGAGTTGGGCC-1 ENSG00000232450
[8118 7844 7779 8018    0    0    0]


  warn('No CNV data for {} and {}'.format(barcode, gene_id))
  warn('No CNV data for {} and {}'.format(barcode, gene_id))
[Embedder]: Computing embeddings:  33%|█████████████████                                  | 2/6 [00:01<00:03,  1.33it/s]

CAGCCTAAGTTAACCA-1 ENSG00000230477
[8152 8162 8128 8094    0    0    0]


  warn('No CNV data for {} and {}'.format(barcode, gene_id))
  warn('No CNV data for {} and {}'.format(barcode, gene_id))
[Embedder]: Computing embeddings:  33%|█████████████████                                  | 2/6 [00:01<00:03,  1.00it/s]

[Embedder]: skipped 4 embeddings missing CNV data





In [16]:
import pyranges as pr

In [17]:
b1_train_gid_missing = 'ENSG00000238042,ENSG00000286365,ENSG00000242753,ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000258473,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000231993,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000176177,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000260983,ENSG00000198840,ENSG00000256287,ENSG00000272755,ENSG00000253983,ENSG00000130487,ENSG00000265975,ENSG00000198888,ENSG00000237356,ENSG00000249335,ENSG00000286164,ENSG00000248461,ENSG00000236501,ENSG00000228113,ENSG00000259995,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000287682,ENSG00000278107,ENSG00000198727,ENSG00000100336,ENSG00000233005,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000198938,ENSG00000138964,ENSG00000228135,ENSG00000227220,ENSG00000188130,ENSG00000272354,ENSG00000258081,ENSG00000099937'.split(',')

In [18]:
b1_val_gid_missing = 'ENSG00000198804,ENSG00000100311,ENSG00000269028,ENSG00000272736,ENSG00000236501,ENSG00000258081,ENSG00000100351,ENSG00000249988,ENSG00000231128,ENSG00000198938,ENSG00000253988,ENSG00000100234,ENSG00000198712,ENSG00000198886,ENSG00000227220,ENSG00000198899,ENSG00000285875,ENSG00000100100,ENSG00000280560,ENSG00000198786,ENSG00000138964,ENSG00000198727,ENSG00000254138,ENSG00000228113,ENSG00000237356,ENSG00000198888,ENSG00000256287,ENSG00000176177,ENSG00000228334,ENSG00000258216,ENSG00000253766,ENSG00000198763,ENSG00000198840,ENSG00000188511,ENSG00000100077,ENSG00000258473,ENSG00000257277,ENSG00000287523'.split(',')

In [19]:
b1_test_gid_missing = 'ENSG00000100234,ENSG00000188511,ENSG00000258216,ENSG00000198804,ENSG00000269028,ENSG00000280560,ENSG00000198763,ENSG00000100077,ENSG00000254138,ENSG00000100368,ENSG00000100311,ENSG00000198886,ENSG00000249988,ENSG00000253988,ENSG00000237838,ENSG00000128284,ENSG00000285875,ENSG00000198712,ENSG00000100100,ENSG00000198786,ENSG00000225465,ENSG00000255028,ENSG00000231128,ENSG00000198840,ENSG00000256287,ENSG00000253983,ENSG00000130487,ENSG00000198888,ENSG00000237356,ENSG00000100385,ENSG00000228113,ENSG00000100351,ENSG00000248359,ENSG00000287523,ENSG00000198727,ENSG00000280007,ENSG00000198899,ENSG00000253766,ENSG00000272736,ENSG00000128340,ENSG00000198938,ENSG00000138964,ENSG00000258081'.split(',')

In [20]:
b1_gid_missing = {
    g for gl in [b1_train_gid_missing, b1_val_gid_missing, b1_test_gid_missing]
    for g in gl
}

In [21]:
sum(map(len, [b1_train_gid_missing, b1_val_gid_missing, b1_test_gid_missing]))

145

In [22]:
len(b1_gid_missing)

68

In [23]:
gtf_pr = pr.read_gtf('../../data/reference/Homo_sapiens.GRCh38.113.gtf.gz')

KeyboardInterrupt: 

In [None]:
b1_gid_notin_gtf = list()
for gid in b1_gid_missing:
    if gtf_pr[(gtf_pr.Feature == 'gene') & (gtf_pr.gene_id == gid)].empty:
        b1_gid_notin_gtf.append(gid)

In [61]:
len(b1_gid_notin_gtf)

38

In [None]:
len(b1_gid_notin_gtf)

75

In [50]:
b2_train_gid_missing = 'ENSG00000282111,ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000239282,ENSG00000272650,ENSG00000198727,ENSG00000234688,ENSG00000198712,ENSG00000279345,ENSG00000273729,ENSG00000185386,ENSG00000100234,ENSG00000266908,ENSG00000236501,ENSG00000254028,ENSG00000100385,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000287440,ENSG00000100097,ENSG00000176177,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000133475,ENSG00000015475,ENSG00000235295,ENSG00000198804,ENSG00000235568,ENSG00000258785,ENSG00000198938,ENSG00000138964,ENSG00000283504,ENSG00000269028,ENSG00000100351,ENSG00000280560,ENSG00000184058,ENSG00000267537,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000273319,ENSG00000239498,ENSG00000253983,ENSG00000100368,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000287682,ENSG00000249269,ENSG00000225676,ENSG00000212907,ENSG00000100350,ENSG00000231010,ENSG00000198888'.split(',')

In [51]:
b2_val_gid_missing = 'ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000198727,ENSG00000198712,ENSG00000185386,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000287440,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000015475,ENSG00000235271,ENSG00000198804,ENSG00000235568,ENSG00000198938,ENSG00000138964,ENSG00000269028,ENSG00000280560,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000239498,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000287682,ENSG00000212907,ENSG00000198888'.split(',')

In [52]:
b2_test_gid_missing = 'ENSG00000054611,ENSG00000285875,ENSG00000237356,ENSG00000269900,ENSG00000249335,ENSG00000231846,ENSG00000198727,ENSG00000198712,ENSG00000273729,ENSG00000185386,ENSG00000100234,ENSG00000215067,ENSG00000198899,ENSG00000198786,ENSG00000188511,ENSG00000100276,ENSG00000176177,ENSG00000260517,ENSG00000196576,ENSG00000255028,ENSG00000015475,ENSG00000198804,ENSG00000235568,ENSG00000198938,ENSG00000138964,ENSG00000269028,ENSG00000100351,ENSG00000280560,ENSG00000100345,ENSG00000198763,ENSG00000198840,ENSG00000239498,ENSG00000253983,ENSG00000100311,ENSG00000287523,ENSG00000198886,ENSG00000212907,ENSG00000100350,ENSG00000198888'.split(',')

In [54]:
b2_gid_missing = {
    g for gl in [b2_train_gid_missing, b2_val_gid_missing, b2_test_gid_missing]
    for g in gl
}

In [55]:
sum(map(len, [b1_train_gid_missing, b1_val_gid_missing, b1_test_gid_missing]))

145

In [56]:
len(b2_gid_missing)

63

In [None]:
b2_gid_notin_gtf = list()
for gid in b2_gid_missing:
    if gtf_pr[(gtf_pr.Feature == 'gene') & (gtf_pr.gene_id == gid)].empty:
        b2_gid_notin_gtf.append(gid)

In [63]:
len(b2_gid_notin_gtf)

27

In [45]:
gtf_pr = gene_pr

In [96]:
gen_pr = gtf_pr[gtf_pr.Feature == 'gene']

In [None]:
gen_pr[['Chromosome', 'Start', 'End', 'gene_id', 'gene_version']]

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,gene_version
0,1,3069167,3438621,+,ENSG00000142611,17
1,1,5492977,5494674,+,ENSG00000260972,1
2,1,10472287,10630758,+,ENSG00000142655,13
3,1,4571480,4594016,+,ENSG00000232596,3
4,1,2425979,2505532,+,ENSG00000149527,18
...,...,...,...,...,...,...
78927,Y,9609955,9632346,-,ENSG00000309329,1
78928,Y,9831417,9846696,-,ENSG00000229208,1
78929,Y,9831077,9858547,-,ENSG00000310196,1
78930,Y,9905594,9908139,-,ENSG00000187657,6


In [98]:
gen_pr[gen_pr.gene_id.isin(b1_gid_missing)]

  df = pd.concat([plus, minus])
  df = pd.concat([plus, minus])


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_version,...,transcript_source,transcript_biotype,tag,transcript_support_level,exon_number,exon_id,exon_version,protein_id,protein_version,ccds_id
0,22,havana_tagene,gene,21712760,21714013,.,+,.,ENSG00000286365,1,...,,,,,,,,,,
1,22,ensembl_havana,gene,20774112,20787720,.,+,.,ENSG00000099937,11,...,,,,,,,,,,
2,22,ensembl_havana,gene,50545898,50551023,.,+,.,ENSG00000130487,9,...,,,,,,,,,,
3,22,ensembl_havana,gene,36913627,36940439,.,+,.,ENSG00000100368,15,...,,,,,,,,,,
4,22,ensembl_havana,gene,25564674,25729294,.,+,.,ENSG00000100077,16,...,,,,,,,,,,
5,22,ensembl_havana,gene,39901083,39973721,.,+,.,ENSG00000100351,17,...,,,,,,,,,,
6,22,ensembl_havana,gene,32801704,32863041,.,+,.,ENSG00000100234,12,...,,,,,,,,,,
7,22,ensembl_havana,gene,44172955,44219533,.,+,.,ENSG00000138964,18,...,,,,,,,,,,
8,22,ensembl_havana,gene,39743043,39893864,.,-,.,ENSG00000176177,10,...,,,,,,,,,,
9,22,ensembl_havana,gene,39223358,39244982,.,-,.,ENSG00000100311,17,...,,,,,,,,,,


In [99]:
gen_pr[gen_pr.gene_id.isin(b2_gid_missing)]

  df = pd.concat([plus, minus])
  df = pd.concat([plus, minus])


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_version,...,transcript_source,transcript_biotype,tag,transcript_support_level,exon_number,exon_id,exon_version,protein_id,protein_version,ccds_id
0,22,ensembl_havana,gene,46762616,47175699,.,+,.,ENSG00000054611,14,...,,,,,,,,,,
1,22,ensembl_havana,gene,36913627,36940439,.,+,.,ENSG00000100368,15,...,,,,,,,,,,
2,22,ensembl_havana,gene,39901083,39973721,.,+,.,ENSG00000100351,17,...,,,,,,,,,,
3,22,ensembl_havana,gene,37675635,37679802,.,+,.,ENSG00000100097,12,...,,,,,,,,,,
4,22,ensembl_havana,gene,19756702,19783593,.,+,.,ENSG00000184058,16,...,,,,,,,,,,
5,22,ensembl_havana,gene,32801704,32863041,.,+,.,ENSG00000100234,12,...,,,,,,,,,,
6,22,ensembl_havana,gene,44172955,44219533,.,+,.,ENSG00000138964,18,...,,,,,,,,,,
7,22,havana,gene,36703576,36753050,.,+,.,ENSG00000234688,4,...,,,,,,,,,,
8,22,havana,gene,18029281,18044412,.,+,.,ENSG00000235295,3,...,,,,,,,,,,
9,22,havana,gene,49805451,49807208,.,+,.,ENSG00000279345,1,...,,,,,,,,,,


In [None]:
from src.data import standard_chromosomes
standard_chromosomes.append('22')

In [None]:
gen_pr = gen_pr[gen_pr.Chromosome.isin(standard_chromosomes)][['Chromosome', 'Start', 'End', 'gene_id']].unstrand()

In [94]:
gen_pr

Unnamed: 0,Chromosome,Start,End,gene_id
0,1,3069167,3438621,ENSG00000142611
1,1,5492977,5494674,ENSG00000260972
2,1,10472287,10630758,ENSG00000142655
3,1,4571480,4594016,ENSG00000232596
4,1,2425979,2505532,ENSG00000149527
...,...,...,...,...
78682,Y,9609955,9632346,ENSG00000309329
78683,Y,9831417,9846696,ENSG00000229208
78684,Y,9831077,9858547,ENSG00000310196
78685,Y,9905594,9908139,ENSG00000187657


In [95]:
gen_pr.to_csv('../../data/reference/Homo_sapiens.GRCh38.113.gene_only.csv')