In [3]:
# Load IPython's autoreload extension and select mode 2, which reloads all
# imported modules before each cell executes, so edits to local modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from genome import get_chromosome_lengths, parse_gff, get_chromosome_valid_genes
from utils import get_gene_embeddings, get_normalized_gene_expression
import json
import numpy as np
import os
from tqdm import tqdm

In [6]:
# Reference-genome inputs shared by the cells below.
# chrom_lengths is iterated later as (chromosome, length) pairs and each length
# is compared against the length of that chromosome's embedding array.
chrom_lengths = get_chromosome_lengths("../data/genome/fasta_file.fsa")
# cds_coords is later indexed by gene name to obtain that gene's coordinates.
# NOTE(review): the exact record layout is defined by genome.parse_gff — confirm there.
cds_coords = parse_gff("../data/genome/gff_file.gff")

In [7]:
# Load the condition -> samples mapping. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open("../data/samples.json") as samples_file:
    condition_samples = json.load(samples_file)

# Sorted condition names (the keys of the mapping).
conditions = sorted(condition_samples)
# Every sample of every condition, flattened into a single sorted list.
samples = sorted(s for c_s in condition_samples.values() for s in c_s)

FileNotFoundError: [Errno 2] No such file or directory: '../data/samples.json'

In [8]:
def _load_strand_archives(sample):
    """Open one sample's `sense_bp1` / `antisense_bp1` .npz archives, keyed by strand sign."""
    return {
        "+": np.load(f"../data/waern_2013/{sample}.sense_bp1.npz"),
        "-": np.load(f"../data/waern_2013/{sample}.antisense_bp1.npz"),
    }


# sample name -> {"+": sense archive, "-": antisense archive}
sample_expression = {name: _load_strand_archives(name) for name in samples}

NameError: name 'samples' is not defined

In [6]:
# Valid genes grouped per chromosome: later cells index this by chromosome name
# and iterate it with .items() as (chromosome, genes).
# NOTE(review): the validity criteria live in genome.get_chromosome_valid_genes — confirm there.
chr_valid_genes = get_chromosome_valid_genes(cds_coords, chrom_lengths)

In [7]:
# Build one compressed archive per chromosome holding the gene embeddings (X)
# and the normalized gene expression (Y) of that chromosome's valid genes.
#
# NOTE: the non-skipped path reads `condition_samples` and `sample_expression`,
# so the sample-loading cells must have run successfully on this kernel.

# The output directory may not exist on a fresh checkout.
os.makedirs("../data/prepared", exist_ok=True)

for chromosome, length in chrom_lengths.items():
    print(f"Processing {chromosome}")

    target_file = f"../data/prepared/{chromosome}.npz"

    # Existence of the target is the resume marker, so the target must only
    # ever appear once it is complete (see the atomic write below).
    if os.path.exists(target_file):
        print(f"Skipping {chromosome} because it already exists")
        continue

    chromosome_embedding = np.load(f"../data/embeddings/{chromosome}.npy").astype(np.float16)

    # The embedding must have exactly one entry per position of the chromosome.
    assert len(chromosome_embedding) == length, f"Length of {chromosome} embedding is {len(chromosome_embedding)} but should be {length}"

    valid_genes = chr_valid_genes[chromosome]
    valid_cds_coords = [cds_coords[gene] for gene in valid_genes]

    gene_embeddings = get_gene_embeddings(valid_cds_coords, chromosome_embedding)
    # Release the full-chromosome array before computing expression to lower peak memory.
    del chromosome_embedding
    gene_expression = get_normalized_gene_expression(valid_cds_coords, condition_samples, sample_expression)

    # Write to a temporary file and rename it into place: an interrupted run
    # would otherwise leave a truncated archive that the exists-check above
    # skips on every later run. The temporary name ends in ".npz" because
    # np.savez_compressed appends that extension when it is missing.
    tmp_file = f"{target_file}.tmp.npz"
    try:
        np.savez_compressed(tmp_file, X=gene_embeddings, Y=gene_expression)
        os.replace(tmp_file, target_file)
    finally:
        # Only present if the write or rename did not complete.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

Processing chrI
Skipping chrI because it already exists
Processing chrII
Skipping chrII because it already exists
Processing chrIII
Skipping chrIII because it already exists
Processing chrIV
Skipping chrIV because it already exists
Processing chrV
Skipping chrV because it already exists
Processing chrVI
Skipping chrVI because it already exists
Processing chrVII
Skipping chrVII because it already exists
Processing chrVIII
Skipping chrVIII because it already exists
Processing chrIX
Skipping chrIX because it already exists
Processing chrX
Skipping chrX because it already exists
Processing chrXI
Skipping chrXI because it already exists
Processing chrXII
Skipping chrXII because it already exists
Processing chrXIII
Skipping chrXIII because it already exists
Processing chrXIV
Skipping chrXIV because it already exists
Processing chrXV
Skipping chrXV because it already exists
Processing chrXVI
Skipping chrXVI because it already exists
Processing chrM
Skipping chrM because it already exists


In [8]:
# Split each prepared chromosome archive into one compressed archive per gene.

# The output directory may not exist on a fresh checkout.
os.makedirs("../data/genes", exist_ok=True)

# The loop variable is named `chromosome` rather than `chr` so the builtin
# chr() is not shadowed for the rest of the kernel session.
for chromosome, genes in chr_valid_genes.items():
    print(f"Processing {chromosome}")

    # Close the archive as soon as both arrays have been read into memory.
    with np.load(f"../data/prepared/{chromosome}.npz") as arr:
        X = arr["X"]
        Y = arr["Y"]

    # Row i of X and Y must correspond to genes[i].
    assert X.shape[0] == len(genes), f"X.shape[0] = {X.shape[0]} != {len(genes)}"
    assert Y.shape[0] == len(genes), f"Y.shape[0] = {Y.shape[0]} != {len(genes)}"

    for i, gene in enumerate(tqdm(genes)):
        X_gene = X[i]
        Y_gene = Y[i]

        target_file = f"../data/genes/{gene}.npz"
        # Existence of the target is the resume marker; skip finished genes.
        if os.path.exists(target_file):
            continue

        # Write to a temporary file and rename it into place so an interrupted
        # run never leaves a truncated archive that later runs would skip.
        # The temporary name ends in ".npz" because np.savez_compressed
        # appends that extension when it is missing.
        tmp_file = f"{target_file}.tmp.npz"
        try:
            np.savez_compressed(tmp_file, X=X_gene, Y=Y_gene)
            os.replace(tmp_file, target_file)
        finally:
            # Only present if the write or rename did not complete.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

Processing chrI


100%|██████████| 118/118 [00:00<00:00, 96402.00it/s]


Skipping YAL068W-A because it already exists
Skipping YAL068C because it already exists
Skipping YAL067W-A because it already exists
Skipping YAL067C because it already exists
Skipping YAL066W because it already exists
Skipping YAL065C because it already exists
Skipping YAL064W-B because it already exists
Skipping YAL064C-A because it already exists
Skipping YAL064W because it already exists
Skipping YAL063C-A because it already exists
Skipping YAL063C because it already exists
Skipping YAL062W because it already exists
Skipping YAL061W because it already exists
Skipping YAL060W because it already exists
Skipping YAL059C-A because it already exists
Skipping YAL059W because it already exists
Skipping YAL058W because it already exists
Skipping YAL056C-A because it already exists
Skipping YAL056W because it already exists
Skipping YAL055W because it already exists
Skipping YAL054C because it already exists
Skipping YAL053W because it already exists
Skipping YAL051W because it already exis

100%|██████████| 462/462 [02:37<00:00,  2.93it/s]


Processing chrIII


100%|██████████| 185/185 [01:02<00:00,  2.95it/s]


Processing chrIV


100%|██████████| 855/855 [05:32<00:00,  2.57it/s]


Processing chrV


100%|██████████| 328/328 [02:09<00:00,  2.53it/s]


Processing chrVI


100%|██████████| 141/141 [00:59<00:00,  2.38it/s]


Processing chrVII


100%|██████████| 595/595 [03:41<00:00,  2.69it/s]


Processing chrVIII


100%|██████████| 327/327 [01:48<00:00,  3.01it/s]


Processing chrIX


100%|██████████| 244/244 [01:27<00:00,  2.80it/s]


Processing chrX


100%|██████████| 404/404 [02:33<00:00,  2.64it/s]


Processing chrXI


100%|██████████| 348/348 [02:12<00:00,  2.63it/s]


Processing chrXII


100%|██████████| 587/587 [03:46<00:00,  2.59it/s]


Processing chrXIII


100%|██████████| 515/515 [03:16<00:00,  2.63it/s]


Processing chrXIV


100%|██████████| 441/441 [02:41<00:00,  2.73it/s]


Processing chrXV


100%|██████████| 607/607 [04:00<00:00,  2.52it/s]


Processing chrXVI


100%|██████████| 520/520 [03:20<00:00,  2.60it/s]


Processing chrM


100%|██████████| 28/28 [00:09<00:00,  2.87it/s]
