# Dataset Exploration
This notebook serves to dive deeper into the properties of our data.

In [2]:
import pandas as pd
from pathlib import Path

%load_ext autoreload
%autoreload 2

In [3]:
# Directories we will need (relative to the notebook's working directory).
git_root = Path('..')
data_root = git_root.joinpath('data')
assert data_root.exists()

# Filtered train / validation / test split tables, one set per batch.
b1_train_path = data_root.joinpath('splits', 'batch1_training_filtered.tsv')
b1_val_path = data_root.joinpath('splits', 'batch1_val_filtered.tsv')
b1_test_path = data_root.joinpath('splits', 'batch1_test_filtered.tsv')
b2_train_path = data_root.joinpath('splits', 'batch2_training_filtered.tsv')
b2_val_path = data_root.joinpath('splits', 'batch2_val_filtered.tsv')
b2_test_path = data_root.joinpath('splits', 'batch2_test_filtered.tsv')

In [4]:
# Per-batch tables from the preprocessing directory (as opposed to the split files).
b1_full_data = data_root.joinpath('preprocessing', 'classification_median_batch_1.tsv')
b2_full_data = data_root.joinpath('preprocessing', 'classification_median_batch_2.tsv')

## Barcode - gene pairs
Previously, we noticed during embedding calculation that not all barcodes have targets/labels for all genes, which is required for supervised learning.
Thus, we analyse the barcode - gene pairs in the data here.
In particular, we want to identify the minimal gene set, that is shared among all barcodes of a batch.

In [None]:
# algorithm idea: make iterative intersections.
# 1. start with a set of all unique genes
# 2. for each unique barcode, intersect with the unique gene ids associated with this barcode
# 3. return the result of all intersections

In [13]:
def find_minimal_gene_set(df: pd.DataFrame) -> set:
    """Return the gene ids that occur for every barcode in ``df``.

    The result is the intersection, over all barcodes, of the gene ids
    listed for that barcode.

    Parameters
    ----------
    df : pd.DataFrame
        Table with at least the columns ``'barcode'`` and ``'gene_id'``,
        one row per barcode - gene pair.

    Returns
    -------
    set
        Gene ids shared by all barcodes. Empty if ``df`` has no rows or if
        no gene is common to every barcode.

    Notes
    -----
    The rows are grouped once by barcode instead of filtering the whole
    frame once per barcode. Rows whose barcode is missing (NaN) do not form
    a group and are therefore ignored.
    """
    minimal_gene_set = set(df['gene_id'])
    # observed=True keeps unused categories of a categorical barcode column
    # from contributing empty groups; sort=False skips needless key sorting.
    grouped_genes = df.groupby('barcode', sort=False, observed=True)['gene_id']
    for _, genes in grouped_genes:
        minimal_gene_set &= set(genes)
        if not minimal_gene_set:
            # The intersection can only shrink, so stop once it is empty.
            break
    return minimal_gene_set

In [8]:
# Load the batch 1 table here: no visible cell assigns `b1_df` before this
# first use, so the cell would otherwise fail on a fresh kernel.
b1_df = pd.read_csv(b1_full_data, sep='\t')
uniq_gene_ids = set(b1_df['gene_id'])
# Print the count explicitly; only the last expression of a cell is displayed.
print(len(uniq_gene_ids))
list(uniq_gene_ids)[:10]

['ENSG00000186047',
 'ENSG00000188001',
 'ENSG00000233008',
 'ENSG00000154485',
 'ENSG00000196787',
 'ENSG00000287200',
 'ENSG00000224982',
 'ENSG00000130844',
 'ENSG00000152583',
 'ENSG00000053254']

In [9]:
# Distinct barcodes of batch 1 (the spelling of the name is kept because a later cell reads it).
uniq_barodes = set(b1_df['barcode'].tolist())
len(uniq_barodes)
list(uniq_barodes)[:10]

['ACCCTGTTCCAGGAAA-1',
 'TATATCCTCCTGGTCT-1',
 'TTCGGTACAAATTGCT-1',
 'GTACTGGTCATGTGGT-1',
 'CTTGCGCGTTTATGGG-1',
 'GCACTAAGTTTACGTC-1',
 'GCCTTAACATTGACAT-1',
 'TGAAGGATCATTTGCT-1',
 'CTCCTGAGTTGTGACA-1',
 'TTAAGTGTCAGGTCCA-1']

In [10]:
# Genes shared by every barcode of batch 1, computed with the helper
# `find_minimal_gene_set` rather than repeating its intersection loop inline.
minimal_gene_overlap = find_minimal_gene_set(b1_df)
len(minimal_gene_overlap)

2000

In [None]:
# Load the batch 1 table and pass it positionally: the helper's only
# parameter is named `df`, so a `b1_df=` keyword argument raises a TypeError.
b1_df = pd.read_csv(b1_full_data, sep='\t')
b1_min_gene_set = find_minimal_gene_set(b1_df)
len(b1_min_gene_set)

Investigate the minimal gene set for batch 2.

In [12]:
# Batch 2 table from the preprocessing step (tab-separated).
b2_df = pd.read_table(b2_full_data)

In [14]:
# Number of genes shared by every barcode of batch 2.
b2_min_gene_set = find_minimal_gene_set(df=b2_df)
len(b2_min_gene_set)

2000

### Minimal gene set of splits
As we see, the minimal gene overlap contains all 2000 genes we selected for our analysis.
Thus, we check whether the error arises from the split data.

#### Batch 1 train
Minimal gene overlap set for training split of batch 1.

In [15]:
# Genes present for every barcode of the batch 1 training split.
b1_train_min_gene_set = find_minimal_gene_set(pd.read_table(b1_train_path))
len(b1_train_min_gene_set)

0

#### Batch 1 validation
Minimal gene overlap set for validation split of batch 1.

In [16]:
# Genes present for every barcode of the batch 1 validation split.
b1_val_min_gene_set = find_minimal_gene_set(pd.read_table(b1_val_path))
len(b1_val_min_gene_set)

0

#### Batch 1 test
Minimal gene overlap set for test split of batch 1.

In [17]:
# Genes present for every barcode of the batch 1 test split.
b1_test_min_gene_set = find_minimal_gene_set(pd.read_table(b1_test_path))
len(b1_test_min_gene_set)

0

#### Batch 2 train

In [18]:
# Genes present for every barcode of the batch 2 training split.
b2_train_min_gene_set = find_minimal_gene_set(pd.read_table(b2_train_path))
len(b2_train_min_gene_set)

0

#### Batch 2 validation

In [19]:
# Genes present for every barcode of the batch 2 validation split.
b2_val_min_gene_set = find_minimal_gene_set(pd.read_table(b2_val_path))
len(b2_val_min_gene_set)

0

#### Batch 2 test

In [20]:
# Genes present for every barcode of the batch 2 test split.
b2_test_min_gene_set = find_minimal_gene_set(pd.read_table(b2_test_path))
len(b2_test_min_gene_set)

0

## Dataset class
This section tests the dataset class `CnvDataset` from `src/data/dataset.py`.

In [5]:
import sys
sys.path.append('..') # add the parent directory to system path
from src.data.dataset import CnvDataset

In [13]:
# Output directory of the repository.
out_root = git_root.joinpath('out')

# Input files handed to the dataset class; each one must already exist.
genome_fasta = data_root.joinpath('reference', 'GRCh38.d1.vd1.fa')
assert genome_fasta.exists()
overlap_path = data_root.joinpath('overlap_genes_peaks.tsv')
assert overlap_path.exists()
epiAneufinder_path = out_root.joinpath('epiAneufinder', 'epiAneuFinder_results.tsv')
assert epiAneufinder_path.exists()

### File format benchmark
We discussed multiple file formats to use in the backend for storing computed embeddings on the disk. Suggestions were the vanilla pytorch format `.pt`, pickle files `.pkl` and the scipy matrix format `.mtx`.
Since pytorch is using pickle in the backend for creating `.pt` files, we decided to only use the `.pt` and `.mtx` formats.
In the following we benchmark reading from these file types, as this could be a bottleneck during training. 

In [7]:
# Batch 1 validation split, read from embeddings stored in the `mtx` file format.
b1_val_path = data_root.joinpath('splits', 'batch1_val_filtered.tsv')
b1_val_dataset = CnvDataset(
    file_format='mtx',
    embedding_mode='single_gene_barcode',
    data_df=pd.read_table(b1_val_path),
    root=data_root.joinpath('embeddings', 'batch_1', 'val'),
    verbose=2,
)

Using 51 barcodes
Using 1093 genes
No embedding files for 932 data points in ../data/embeddings/batch_1/val/single_gene_barcode!
../data/embeddings/batch_1/val/single_gene_barcode/AAAGGTTAGGGTGGAT-1/ENSG00000173372.mtx


In [67]:
# Rebuild the batch 1 validation dataset in a separate root with forced recomputation.
# NOTE(review): the recorded run of this cell ended in
# "RuntimeError: No embedding files found!".
b1_val_redo = CnvDataset(
    force_recompute=True,
    verbose=2,
    cnv_path=epiAneufinder_path,
    atac_path=overlap_path,
    fasta_path=genome_fasta,
    data_df=pd.read_table(b1_val_path),
    root=data_root.joinpath('embeddings', 'batch_1', 'val_redo_2'),
)

Using 51 barcodes
Using 1093 genes
Recomputing embeddings:  True
[embed]: Iterating over custom barcode to genes mapping
[embed]: Computing 8954 Embeddings with mode: "single_gene_barcode"
[embed]: Using 51 barcodes
[embed]: Using 1093 genes


[embed]: Computing embeddings (# genes done):   0%|                                            | 0/1093 [00:00<?, ?it/s]

emb_df
 Empty DataFrame
Columns: [barcode, gene_id, embedding_path]
Index: []





RuntimeError: No embedding files found!

In [16]:
# Batch 1 validation split table (tab-separated).
b1_val_df = pd.read_table(b1_val_path)

In [21]:
# (rows, columns) of the batch 1 validation table
b1_val_df.shape

(8954, 5)

In [20]:
# Attach to every barcode - gene pair the embedding path that `b1_val_redo`
# derives for it. Note that this adds a column to `b1_val_df` in place.
b1_val_df['embedding_path'] = [
    b1_val_redo.ids_to_emb_path(barcode, gene_id)
    for barcode, gene_id in zip(b1_val_df['barcode'], b1_val_df['gene_id'])
]

In [23]:
# Split rows whose embedding path does not appear in the dataset's own table.
missing_df = b1_val_df.loc[
    ~b1_val_df['embedding_path'].isin(b1_val_redo.data_df['embedding_path'])
]

In [35]:
# number of missing rows per barcode
missing_df['barcode'].value_counts()

barcode
AATGCATGTTCACCCA-1    39
CTTTAGGCAGCACGAA-1    39
GCGCCTTGTAACAGGG-1    38
ACCCTGTTCATAGCCG-1    37
GACGTAAAGCATGTTA-1    37
TACGGTTAGCACAGCC-1    36
TACTGACAGAATCTCA-1    33
TTCTTAGGTCACACCC-1    30
TGATGAACAAGGCCAA-1    30
TGGCCTTTCTTAATGG-1    27
GATTTGCAGCCTGTTC-1    20
CGCTTAACATCACAGC-1    19
CTGGACCAGTTGGGCC-1    19
CGATTTGCACGAATCC-1    19
CTGGTTACATTTAAGC-1    19
GTAGTTTCAGGCGAGT-1    19
TGCTAGCCAATTAACC-1    19
GCTATCCTCCCTCGCA-1    19
CACTTTGTCTAACTGA-1    19
TACTAAGTCAGCAAAG-1    18
CAAGGTAAGGTCCAAT-1    18
TCGTTAAAGGTTAGCT-1    17
TCTCGCCCAGGCGATA-1    17
CCATAAGGTGGTTATG-1    17
AAAGGTTAGGGTGGAT-1    17
ATTGAAGCAAATGCCC-1    17
ACTATCCGTTTAACGG-1    17
GGGCTAACAGCATGTC-1    16
TAGTAAGCATGTTGCA-1    16
ATAGGTACAGGTTAAA-1    15
GGACCGAAGCGATAAG-1    15
TGTGGCCAGGAGGACT-1    15
TTGGCTACATAAGTTC-1    14
AAGCCTCCACGAACAG-1    13
CAGCCTAAGTGAAGTG-1    13
TCGTTATTCCTTGCAC-1    13
ATAGATGCACCTCACC-1    13
CACAAGCGTAAATTGC-1    13
ACTAACCAGTTGGGCC-1    13
ACCCGTAAGGGACCTC-

In [29]:
# table of data points held by the dataset object
b1_val_redo.data_df

Unnamed: 0,barcode,gene_id,expression_count,classification,embedding_path
0,GACGTAAAGCATGTTA-1,ENSG00000069424,0.606885,low,../data/embeddings/batch_1/val_redo/single_gen...
1,CCTTAACGTCGTAAAT-1,ENSG00000215788,0.638693,low,../data/embeddings/batch_1/val_redo/single_gen...
2,GCTATCCTCCCTCGCA-1,ENSG00000215788,0.216336,low,../data/embeddings/batch_1/val_redo/single_gen...
3,TACGGTTAGCACAGCC-1,ENSG00000215788,0.569907,low,../data/embeddings/batch_1/val_redo/single_gen...
4,GATTTGCAGCCTGTTC-1,ENSG00000171621,0.346812,low,../data/embeddings/batch_1/val_redo/single_gen...
...,...,...,...,...,...
8017,AAGCCTCCACGAACAG-1,ENSG00000287918,0.744612,low,../data/embeddings/batch_1/val_redo/single_gen...
8018,ACCCTGTTCATAGCCG-1,ENSG00000160219,0.267540,low,../data/embeddings/batch_1/val_redo/single_gen...
8019,GACGTAAAGCATGTTA-1,ENSG00000160219,0.981858,high,../data/embeddings/batch_1/val_redo/single_gen...
8020,GGTCTTGAGGAGCACG-1,ENSG00000160219,0.728388,low,../data/embeddings/batch_1/val_redo/single_gen...


Check if there are duplicate barcode - gene pairs in the val data frame.

In [30]:
# shape after dropping repeated barcode - gene pairs; compare with the full table's row count
b1_val_df[['barcode', 'gene_id']].drop_duplicates().shape

(8954, 2)

In [31]:
# full table shape, for comparison with the de-duplicated pairs
b1_val_df.shape

(8954, 5)

In [32]:
# Shape of the rows belonging to one example barcode.
b1_val_df.loc[b1_val_df['barcode'] == 'AAAGGTTAGGGTGGAT-1'].shape

(192, 5)

In [41]:
# Map every barcode of the validation table to the list of its distinct gene ids.
barcode_to_genes = dict(
    (barcode, list(set(b1_val_df.loc[b1_val_df['barcode'] == barcode, 'gene_id'])))
    for barcode in set(b1_val_df['barcode'])
)

In [42]:
# Total number of barcode - gene pairs across all barcodes.
n_embeddings = sum(len(genes) for genes in barcode_to_genes.values())
n_embeddings

8954

In [48]:
# epiAneufinder result table; unlike the other tables it is space-separated.
cnv_df = pd.read_csv(
    out_root.joinpath('epiAneufinder', 'epiAneuFinder_results.tsv'),
    sep=' ',
)

In [49]:
# display the (truncated) CNV table
cnv_df

Unnamed: 0,idx,seq,start,end,GCGCAATGTTGCGGAT-3,CTAGTGAGTCACCTAT-3,AATCATGTCGATCAGT-1,TCCTTAGTCGGGACTC-4,ACTATCCGTCTAACCT-1,GTGCACGGTCACAAAT-3,...,AGTGTGGCAATTATGC-1,AGAGAGGAGCAGGCCT-2,ATCCACCTCAACAAGG-1,GGTTGAGCATAGGCGA-1,GTACTTAAGACACCGC-1,CGTAGCGGTAGGTTGC-1,TGCTATGCAAGGACCA-1,CAGGGCTTCCAAGTGT-2,CAAGTAACAGGTTACC-2,TGATTCAAGTTCCTGC-1
0,1,chr1,800001,900000,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,0,1
1,2,chr1,900001,1000000,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,0,1
2,3,chr1,1000001,1100000,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,0,1
3,4,chr1,1100001,1200000,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,0,1
4,5,chr1,1200001,1300000,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26009,26010,chr22,50100001,50200000,1,0,0,1,1,1,...,1,1,1,0,1,1,2,1,1,1
26010,26011,chr22,50200001,50300000,1,0,0,1,1,1,...,1,1,1,0,1,1,2,1,1,1
26011,26012,chr22,50300001,50400000,1,0,0,1,1,1,...,1,1,1,0,1,1,2,1,1,1
26012,26013,chr22,50400001,50500000,1,0,0,1,1,1,...,1,1,1,0,1,1,2,1,1,1


Debug stepwise the mapping dict reshaping code:

1. check barcodes

In [50]:
# Every column after the first four is taken as a barcode
# (the displayed table starts with idx, seq, start, end).
uniq_barcodes = set(cnv_df.columns.tolist()[4:])
len(uniq_barcodes)

5778

In [54]:
# Distinct barcodes of the validation table, and how many of them the CNV table also has.
b1_val_uniq_barcodes = set(b1_val_df['barcode'].tolist())
print(len(b1_val_uniq_barcodes))
len(b1_val_uniq_barcodes.intersection(uniq_barcodes))

51


51

In [55]:
# Restrict the CNV barcodes to those that are keys of the barcode -> genes mapping.
iter_barcode_set = set(barcode_to_genes)
uniq_barcodes = uniq_barcodes.intersection(iter_barcode_set)
len(uniq_barcodes)

51

2. check genes

In [57]:
# Gene / peak overlap table (tab-separated).
atac_df = pd.read_table(data_root.joinpath('overlap_genes_peaks.tsv'))
atac_df

Unnamed: 0,Chromosome,Start_peak,End_peak,Start_gene,End_gene,gene_id
0,1,9855,10676,9121,26894,ENSG00000290825
1,1,9855,10676,10010,15670,ENSG00000223972
2,1,181027,181582,87295,183719,ENSG00000241860
3,1,181027,181582,144386,201861,ENSG00000310528
4,1,181027,181582,178931,186937,ENSG00000308415
...,...,...,...,...,...,...
92630,Y,11332807,11333567,11330329,11335595,ENSG00000258991
92631,Y,11333695,11334440,11330329,11335595,ENSG00000258991
92632,Y,11333695,11334440,11333627,11339693,ENSG00000270455
92633,Y,56855391,56856232,56853244,56857488,ENSG00000235857


In [58]:
# Distinct gene ids of the overlap table.
# From here on `uniq_gene_ids` no longer holds the batch 1 genes assigned to it earlier.
uniq_gene_ids = set(atac_df['gene_id'])
len(uniq_gene_ids)

36749

In [59]:
# Union of all genes in the barcode -> genes mapping, then the part of it
# that also occurs in the overlap table.
iter_gene_set = set().union(*barcode_to_genes.values())
uniq_gene_overlap = uniq_gene_ids.intersection(iter_gene_set)
len(uniq_gene_overlap)

910

In [60]:
# Distinct genes of the validation table, and how many of them the overlap table also lists.
b1_val_uniq_gene_ids = set(b1_val_df['gene_id'].tolist())
print(len(b1_val_uniq_gene_ids))
n_shared_genes = len(b1_val_uniq_gene_ids.intersection(uniq_gene_ids))
print(n_shared_genes)

1093
910


This means, we filter genes based on the atac peaks file.

In [61]:
# Number of overlap-table genes that are not part of the gene overlap.
len(uniq_gene_ids - uniq_gene_overlap)

35839

In [62]:
new_uniq_gene_ids = uniq_gene_overlap

# iterate over the barcodes in alphabetical order
uniq_barcodes = sorted(uniq_barcodes)

# invert the mapping: gene -> barcodes, used for iteration later
gene_to_barcodes = {gene: [] for gene in new_uniq_gene_ids}
# TODO: use boolean map to encode barcodes per gene for saving memory
for cnv_barcode in uniq_barcodes:
    for gene in barcode_to_genes[cnv_barcode]:
        # genes outside the overlap have no entry and are skipped
        if gene not in new_uniq_gene_ids:
            continue
        gene_to_barcodes[gene].append(cnv_barcode)

In [65]:
# Count the pairs that survive the inversion and compare with the count
# before filtering; the message reports both numbers on a mismatch.
new_n_embeddings = sum(map(len, gene_to_barcodes.values()))
assert new_n_embeddings == n_embeddings, 'Not same # embeddings: {} != {}'.format(
    new_n_embeddings, n_embeddings
)

AssertionError: Not some # embeddings: 8049 != 8954

In [6]:
# the repr reports the number of data points in the dataset
b1_val_dataset

<class 'src.data.dataset.CnvDataset'> with 8022 datapoints

In [7]:
# Batch 2 validation split, read from embeddings stored in the `pt` file format.
b2_val_path = data_root.joinpath('splits', 'batch2_val_filtered.tsv')
b2_val_dataset = CnvDataset(
    file_format='pt',
    embedding_mode='single_gene_barcode',
    data_df=pd.read_table(b2_val_path),
    root=data_root.joinpath('embeddings', 'batch_2'),
)

Using 35 barcodes
Using 621 genes
No embedding files for 3603 data points in ../data/embeddings/batch_2/single_gene_barcode!


In [8]:
# the repr reports the number of data points in the dataset
b2_val_dataset

<class 'src.data.dataset.CnvDataset'> with 0 datapoints

In [None]:
# Pick one embedding file per dataset (position 42) for the read benchmark.
# NOTE(review): this rebinds `b1_test_path` / `b2_test_path`, which earlier
# held the test split table paths.
b1_test_path = b1_val_dataset.data_df['embedding_path'].iat[42]
b2_test_path = b2_val_dataset.data_df['embedding_path'].iat[42]
for example_path in (b1_test_path, b2_test_path):
    print(example_path)

../data/embeddings/batch_1/val/single_gene_barcode/AAAGGTTAGGGTGGAT-1/ENSG00000117984.mtx
../data/embeddings/batch_2/single_gene_barcode/AAACCAACATTGCGGT-2/ENSG00000172985.pt


In [None]:
from torch import load as pyt_load
from scipy.io import mmread

In [None]:
%%timeit
# time reading the example batch 1 embedding file with scipy's mmread
t = mmread(b1_test_path)

3.93 ms ± 40.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# time reading the example batch 2 embedding file with torch's load
t = pyt_load(b2_test_path)

257 μs ± 646 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit
# time fetching one item through the batch 1 dataset object
t = b1_val_dataset[42]

5.16 ms ± 204 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%%timeit
# time fetching one item through the batch 2 dataset object
t = b2_val_dataset[42]

511 μs ± 25.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


This means that using the `mtx` format takes roughly `# samples * # iterations * 4650 μs` longer (5.16 ms vs. 511 μs per dataset access in the benchmark above).

Now, let's gauge the disk space required for only the embedding of specific 
barcode, gene pairs described in the dataset splits.
One embedding has roughly a size of 550Kb:
```sh
(ssb) marw@goodmosquito-0dcaa:~/cmscb8$ ll -h data/embeddings/file_type_test/
-rw-rw-r-- 1 marw student 549K Feb 27 08:03 ENSG00000090104.pt
```

With this, we can extrapolate the required space from the number of embeddings 
per split as an upper bound.
```sh
(ssb) marw@goodmosquito-0dcaa:~/cmscb8$ wc -l data/splits/*
  18636 data/splits/batch1_test_filtered.tsv
  59342 data/splits/batch1_training_filtered.tsv
   8955 data/splits/batch1_val_filtered.tsv
   7329 data/splits/batch2_test_filtered.tsv
  23497 data/splits/batch2_training_filtered.tsv
   3604 data/splits/batch2_val_filtered.tsv
wc: data/splits/old_splits: Is a directory
      0 data/splits/old_splits
 121363 total
```
For each of these files, we need to subtract 1 for the header.
After that we can calculate the expected size in Mb by `# embeddings * 550Kb / 1000`.

In [2]:
# space required for batch 1
(18636 - 1 + 59342 - 1 + 8955 - 1) * 550 / 1000

47811.5

In [3]:
# Space required for batch 2 in Mb: line counts of the three split files,
# minus one header line each, times 550 Kb per embedding.
sum(n_lines - 1 for n_lines in (7329, 23497, 3604)) * 550 / 1000

18934.85

In [5]:
# space required for all embeddings in Gb
(121363 - 6) * 550 / 1_000_000

66.74635

## Loading Batch 2
Next we try to load batch 2, which is currently saved as `.pt` file in `data/embeddings/batch_2`.

In [None]:
# Embedding root of batch 2 and its split tables
# (the three split paths are reassigned to the split files here).
b2_val_data_root = data_root.joinpath('embeddings', 'batch_2')
b2_train_path = data_root.joinpath('splits', 'batch2_training_filtered.tsv')
b2_val_path = data_root.joinpath('splits', 'batch2_val_filtered.tsv')
b2_test_path = data_root.joinpath('splits', 'batch2_test_filtered.tsv')

In [None]:
# Batch 2 validation split table (tab-separated).
b2_val_df = pd.read_table(b2_val_path)

In [None]:
# Batch 2 validation dataset on top of the `pt` embeddings, with verbose output.
b2_val_dataset = CnvDataset(
    file_format='pt',
    verbose=2,
    embedding_mode='single_gene_barcode',
    data_df=b2_val_df,
    root=b2_val_data_root,
)

Using 35 barcodes
Using 621 genes
emb_on_disk_df
                   barcode          gene_id  \
0       CCTGTATGTGGTTCTT-2  ENSG00000172578   
1       CCTGTATGTGGTTCTT-2  ENSG00000114450   
2       CCTGTATGTGGTTCTT-2  ENSG00000136297   
3       CCTGTATGTGGTTCTT-2  ENSG00000104783   
4       CCTGTATGTGGTTCTT-2  ENSG00000188676   
...                    ...              ...   
427373  GAGGACTAGTATCGCG-2  ENSG00000287493   
427374  GAGGACTAGTATCGCG-2  ENSG00000122025   
427375  GAGGACTAGTATCGCG-2  ENSG00000125851   
427376  GAGGACTAGTATCGCG-2  ENSG00000267421   
427377  GAGGACTAGTATCGCG-2  ENSG00000285592   

                                           embedding_path  
0       ../data/embeddings/batch_2/single_gene_barcode...  
1       ../data/embeddings/batch_2/single_gene_barcode...  
2       ../data/embeddings/batch_2/single_gene_barcode...  
3       ../data/embeddings/batch_2/single_gene_barcode...  
4       ../data/embeddings/batch_2/single_gene_barcode...  
...                       

In [None]:
# the repr reports the number of data points in the dataset
b2_val_dataset

<class 'src.data.dataset.CnvDataset'> with 3160 datapoints

In [None]:
# the repr reports the number of data points in the dataset
b1_val_dataset

<class 'src.data.dataset.CnvDataset'> with 8022 datapoints

In [None]:
%%time
# time fetching the first 3,100 items of the batch 1 dataset one by one
t_list = [b1_val_dataset[i] for i in range(3_100)]

In [None]:
%%time
# time fetching the first 3,100 items of the batch 2 dataset one by one
t_list = [b2_val_dataset[i] for i in range(3_100)]

In [None]:
# display the (truncated) batch 2 validation table
b2_val_df

Unnamed: 0,barcode,gene_id,expression_count,classification
0,AAACCAACATTGCGGT-2,ENSG00000162512,1.024897,high
1,AAACCAACATTGCGGT-2,ENSG00000186094,1.850096,high
2,AAACCAACATTGCGGT-2,ENSG00000231252,1.024897,high
3,AAACCAACATTGCGGT-2,ENSG00000183023,1.024897,high
4,AAACCAACATTGCGGT-2,ENSG00000115355,1.024897,high
...,...,...,...,...
3598,TTTGTGGCATGAATAG-2,ENSG00000130254,1.790029,high
3599,TTTGTGGCATGAATAG-2,ENSG00000080573,1.790029,high
3600,TTTGTGGCATGAATAG-2,ENSG00000167615,1.790029,high
3601,TTTGTGGCATGAATAG-2,ENSG00000225377,1.790029,high


In [None]:
# First ten (barcode, gene_id) pairs of the batch 2 validation table.
list(zip(b2_val_df['barcode'], b2_val_df['gene_id']))[:10]

[('AAACCAACATTGCGGT-2', 'ENSG00000162512'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000186094'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000231252'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000183023'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000115355'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000172005'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000172985'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000152127'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000157827'),
 ('AAACCAACATTGCGGT-2', 'ENSG00000236283')]

In [None]:
# Overwrite the path column with the paths that `b1_val_dataset` derives
# for every barcode - gene pair (in-place change of `b1_val_df`).
b1_val_df['embedding_path'] = [
    b1_val_dataset.ids_to_emb_path(barcode, gene_id)
    for barcode, gene_id in zip(b1_val_df['barcode'], b1_val_df['gene_id'])
]

In [None]:
# Rows whose embedding file does not exist on disk.
b1_val_df[[not path.exists() for path in b1_val_df['embedding_path']]]

Unnamed: 0,barcode,gene_id,expression_count,classification,embedding_path
0,AAAGGTTAGGGTGGAT-1,ENSG00000173372,0.407756,low,../data/embeddings/batch_1/val/single_gene_bar...
1,AAAGGTTAGGGTGGAT-1,ENSG00000226476,1.103188,high,../data/embeddings/batch_1/val/single_gene_bar...
89,AAAGGTTAGGGTGGAT-1,ENSG00000287523,0.920410,high,../data/embeddings/batch_1/val/single_gene_bar...
97,AAAGGTTAGGGTGGAT-1,ENSG00000038945,0.407756,low,../data/embeddings/batch_1/val/single_gene_bar...
146,AAAGGTTAGGGTGGAT-1,ENSG00000258081,0.920410,high,../data/embeddings/batch_1/val/single_gene_bar...
...,...,...,...,...,...
8949,TTGGCTACATAAGTTC-1,ENSG00000198938,2.065108,high,../data/embeddings/batch_1/val/single_gene_bar...
8950,TTGGCTACATAAGTTC-1,ENSG00000198840,1.721116,high,../data/embeddings/batch_1/val/single_gene_bar...
8951,TTGGCTACATAAGTTC-1,ENSG00000198886,2.611877,high,../data/embeddings/batch_1/val/single_gene_bar...
8952,TTGGCTACATAAGTTC-1,ENSG00000198786,1.907831,high,../data/embeddings/batch_1/val/single_gene_bar...


In [None]:
# Does the embedding file of the row labelled 0 exist?
b1_val_df.loc[0, 'embedding_path'].exists()

False

In [None]:
# Is at least one of the candidate paths present on disk?
# NOTE(review): `b2_val_paths` is not assigned in any visible cell - confirm where it comes from.
any(path.exists() for path in b2_val_paths)

False

In [None]:
# Take the first candidate path, swap its extension for `.pt` and check the result.
# NOTE(review): `b2_val_paths` is not assigned in any visible cell - confirm where it comes from.
p_dir = b2_val_paths[0].parent
p_name = f"{b2_val_paths[0].name.split('.')[0]}.pt"
p = p_dir.joinpath(p_name)
print(p)
p.exists()

../data/embeddings/batch_1/val/single_gene_barcode/AAACCAACATTGCGGT-2/ENSG00000162512.pt


False