In [1]:
import numpy as np
import h5py
import os
import re
import pandas as pd
from tqdm import tqdm
from ipywidgets import interact
import ipywidgets as widgets

import torch

In [17]:
# Define a mapping from one-hot encoded arrays to nucleotide symbols
def decode_one_hot(one_hot_sequence, start_pos=None, window_size=None):
    nucleotide_map = ['A', 'C', 'G', 'T']

    if start_pos is not None and window_size is not None:
        one_hot_sequence = one_hot_sequence[start_pos:start_pos + window_size]

    decoded_sequence = ''.join(nucleotide_map[np.argmax(base)] for base in one_hot_sequence)
    return decoded_sequence



def get_files_by_size(directory, file_size=None):
    """
    Get a list of files in a directory that match the given size.

    :param directory: The directory to search in.
    :param file_size: The size to filter files by (in bytes).
    :return: List of files that match the given size.
    """
    # List to store the files with the given size
    matching_files = []
    all_files = []

    # Iterate over the files in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        
        # Check if it's a file and get its size
        if os.path.isfile(filepath):
            size = os.path.getsize(filepath)
            all_files.append((filepath, size))
            # If the file size matches, add it to the list
            if file_size is not None and size == file_size:
                matching_files.append(filepath)

    return all_files, matching_files


def take_sequence_hash(seq):
    nucleotide_counts = tuple(seq.sum(0).astype(int))
    return hash(nucleotide_counts)

In [14]:
RIGHT_FILESIZE = 12583040

MOUSE_DIR = "/grand/TFXcan/imlab/data/enformer_training_data/larger_window/mouse"
file_sizes, good_files = get_files_by_size(MOUSE_DIR, RIGHT_FILESIZE)
# good_files = set(good_files)

In [2]:
MOUSE_OLD_HDF5 = "/grand/TFXcan/imlab/data/enformer_training_data/basenji_data_h5/delete/mouse_dataset.hdf5"
# MOUSE_OLD_HDF5 = "/grand/TFXcan/imlab/data/enformer_training_data/basenji_data_h5/delete/delete_no_groups/train_mouse_dataset.hdf5"

In [3]:
hash_seq_list = []
nucleotide_counts = []

with h5py.File(MOUSE_OLD_HDF5, "r") as old_hdf:
    original_targets = old_hdf['sequence']
    for i in tqdm(range(len(original_targets))):
        seq = original_targets[i]                
        _hash = take_sequence_hash(seq)
        nucleotide_counts.append( seq.sum(0).astype(int) )
        hash_seq_list.append(_hash)

KeyError: "Unable to open object (object 'sequence' doesn't exist)"

In [21]:
hash_seq_set = set(hash_seq_list)

In [100]:
@interact
def check_if_hash_exists(i=widgets.IntSlider(min=0,max=20000)):
    seq = 
    return take_sequence_hash(seq) in seq_list

interactive(children=(IntSlider(value=0, description='i', max=20000), Output()), _dom_classes=('widget-interac…

In [106]:
hash_exists = [take_sequence_hash(np.load(file)[131072:-131072]) in hash_seq_set for file in tqdm(good_files[:1000])]


  0%|                                                                                                                                                                                     | 0/1000 [00:00<?, ?it/s][A
  0%|▌                                                                                                                                                                            | 3/1000 [00:00<00:47, 20.91it/s][A
  1%|█                                                                                                                                                                            | 6/1000 [00:00<01:19, 12.57it/s][A
  1%|█▍                                                                                                                                                                           | 8/1000 [00:00<01:18, 12.58it/s][A
  1%|█▋                                                                                                                                    

In [60]:
# [hash(make_seq_hashable(seq)) for seq in some_sequences]
[ take_sequence_hash(seq) for seq in some_sequences ]

[7574279344516428760,
 -6834450307252527843,
 3963172501341738105,
 2647729470064277247,
 6441987939110553890,
 -62933402501961128,
 8299908931160087141,
 2471088745688136644,
 1167875606036686394,
 4589044634838691870,
 -6941526654509227729,
 943459576720639891,
 1612096440763670181,
 1650271106063072131,
 4875382045157906193,
 8581750594978365613,
 -3816918543525168295,
 -3591124916119430379,
 -4658059302722799406,
 -8216078498327602062,
 -4289154267789523136,
 -7342058213172736030,
 -7271916529618691568,
 289241419266744339,
 -7976142340491518585,
 -8840207838700835108,
 3807037374449910194,
 5928076758559553419,
 1577321436514535261,
 -6645337218227834248,
 -6483343684032290691,
 1985218834199124935,
 3945887768398188284,
 -3919846094699169908,
 4132159156550018134,
 -3815938296083212886,
 2765743574990542283,
 5353628608761248535,
 4454029152618829633,
 -7702325817056986813,
 -4206881520841708236,
 -3647894651257435063,
 44342806594808787,
 6195045268489979726,
 -55147912530391690

In [124]:
files_df = pd.DataFrame(
    [re.sub(".npy|.*chr", "", file).split("_") + [file] for file in good_files],
    columns=["chromosome", "start", "end", "file"]
)

files_df.chromosome = files_df.chromosome.astype(int)
files_df.start = files_df.start.astype(int)
files_df.end = files_df.end.astype(int)

In [128]:
file_sizes_dict = dict(file_sizes)

In [134]:
files_df = files_df.sort_values(["chromosome", "start"])
files_df['filesize'] = files_df.file.apply(lambda x: file_sizes_dict[x])

In [165]:
counts = np.load(files_df.file[5])[131072:-131072].sum(0).astype(int)

In [168]:
os.path.getsize(files_df.file[5])

12583040

In [170]:
np.load(files_df.file[5]).shape

(393216, 4)

In [167]:
counts

array([24215, 16980, 19852, 24796])

In [166]:
min(((np.array(nucleotide_counts) - counts)**2).sum(1))

511603493

In [4]:
kk = torch.load("/grand/TFXcan/imlab/data/enformer_training_data/delete/basenji_data_pt/mouse/train-1-0.pt")

In [12]:
hashes = [ hash(tuple(x)) for x in kk['sequence'].sum(1).numpy().astype(int) ]

In [23]:
[ h in hash_seq_set for h in hashes ]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,