In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: re-import modules automatically before each cell is executed.
%autoreload 2

In [2]:
from genome import get_chromosome_lengths, parse_gff, get_chromosome_valid_genes
from utils import get_gene_embeddings, get_normalized_gene_expression
import json
import numpy as np
import os

In [3]:
# Input files for the reference genome: a FASTA file and its GFF annotation.
fasta_path = "../data/genome/fasta_file.fsa"
gff_path = "../data/genome/gff_file.gff"

# Per-chromosome lengths; iterated later as (chromosome, length) pairs.
chrom_lengths = get_chromosome_lengths(fasta_path)

# Parsed GFF records; later indexed by gene identifier.
# NOTE(review): presumably CDS coordinates per gene, judging by the name — confirm in genome.parse_gff.
cds_coords = parse_gff(gff_path)

In [4]:
# Load the condition -> samples mapping. The context manager guarantees the
# file handle is closed (the previous bare `open(...)` leaked it), and the
# explicit encoding avoids depending on the platform's default locale.
with open("../data/samples.json", encoding="utf-8") as samples_file:
    condition_samples = json.load(samples_file)

# Sorted so that downstream iteration order is deterministic.
conditions = sorted(condition_samples)

# Flatten every condition's sample collection into one sorted list.
samples = sorted(s for c_s in condition_samples.values() for s in c_s)

In [5]:
def _load_strand_tracks(sample):
    """Open the two per-strand expression archives belonging to one sample.

    Returns a dict keyed by strand symbol: "+" maps to the sample's
    ``sense`` archive and "-" to its ``antisense`` archive, each as returned
    by ``np.load``.
    """
    return {
        "+": np.load(f"../data/waern_2013/{sample}.sense_bp1.npz"),
        "-": np.load(f"../data/waern_2013/{sample}.antisense_bp1.npz"),
    }


# sample name -> {"+": sense archive, "-": antisense archive}
sample_expression = {name: _load_strand_tracks(name) for name in samples}

In [6]:
# Build one training archive per chromosome under ../data/prepared/, holding
# the gene embeddings (X) and the normalized gene expression (Y) for the
# chromosome's valid genes. Chromosomes whose archive already exists are
# skipped, so the cell can be re-run to resume after an interruption.
chr_valid_genes = get_chromosome_valid_genes(cds_coords, chrom_lengths)

# Make sure the output directory exists before the first write.
os.makedirs("../data/prepared", exist_ok=True)

for chromosome, length in chrom_lengths.items():
    print(f"Processing {chromosome}")

    target_file = f"../data/prepared/{chromosome}.npz"
    # Results are first written here and only renamed to `target_file` once
    # complete. The name must end in ".npz", otherwise np.savez_compressed
    # would append the extension and the rename below would miss the file.
    partial_file = f"../data/prepared/{chromosome}.partial.npz"

    if os.path.exists(target_file):
        print(f"Skipping {chromosome} because it already exists")
        continue

    # Cast to float16 right after loading; the array is released again as
    # soon as the gene embeddings have been extracted from it.
    chromosome_embedding = np.load(f"../data/embeddings/{chromosome}.npy").astype(np.float16)

    # Sanity check: the embedding must have one entry along its first axis
    # per position counted in the chromosome length.
    assert len(chromosome_embedding) == length, f"Length of {chromosome} embedding is {len(chromosome_embedding)} but should be {length}"

    valid_genes = chr_valid_genes[chromosome]
    valid_cds_coords = [cds_coords[gene] for gene in valid_genes]

    gene_embeddings = get_gene_embeddings(valid_cds_coords, chromosome_embedding)
    del chromosome_embedding
    gene_expression = get_normalized_gene_expression(valid_cds_coords, condition_samples, sample_expression)

    # Write-then-rename so that an interrupted run never leaves a truncated
    # archive at `target_file`, which the existence check above would
    # otherwise treat as finished work on every later run. A stale partial
    # file from an earlier crash is simply overwritten.
    np.savez_compressed(partial_file, X=gene_embeddings, Y=gene_expression)
    os.replace(partial_file, target_file)

Processing chrI
Skipping chrI because it already exists
Processing chrII
Skipping chrII because it already exists
Processing chrIII
Skipping chrIII because it already exists
Processing chrIV


100%|██████████| 855/855 [00:00<00:00, 1125.35it/s]
100%|██████████| 855/855 [12:46<00:00,  1.12it/s]


Processing chrV


100%|██████████| 328/328 [00:00<00:00, 1105.35it/s]
100%|██████████| 328/328 [02:01<00:00,  2.70it/s]


Processing chrVI


100%|██████████| 141/141 [00:00<00:00, 947.74it/s]
100%|██████████| 141/141 [00:24<00:00,  5.75it/s]


Processing chrVII


100%|██████████| 595/595 [00:00<00:00, 1033.41it/s]
100%|██████████| 595/595 [06:57<00:00,  1.42it/s]


Processing chrVIII


100%|██████████| 327/327 [00:00<00:00, 976.05it/s] 
100%|██████████| 327/327 [01:55<00:00,  2.82it/s]


Processing chrIX


100%|██████████| 244/244 [00:00<00:00, 830.32it/s]
100%|██████████| 244/244 [01:07<00:00,  3.61it/s]


Processing chrX


100%|██████████| 404/404 [00:00<00:00, 945.13it/s]
100%|██████████| 404/404 [03:13<00:00,  2.09it/s]


Processing chrXI


100%|██████████| 348/348 [00:00<00:00, 545.21it/s] 
100%|██████████| 348/348 [02:27<00:00,  2.35it/s]


Processing chrXII


100%|██████████| 587/587 [00:00<00:00, 1018.70it/s]
100%|██████████| 587/587 [06:41<00:00,  1.46it/s]


Processing chrXIII


100%|██████████| 515/515 [00:00<00:00, 594.17it/s] 
100%|██████████| 515/515 [05:08<00:00,  1.67it/s]


Processing chrXIV


100%|██████████| 441/441 [00:00<00:00, 813.35it/s]
100%|██████████| 441/441 [03:43<00:00,  1.98it/s]


Processing chrXV


100%|██████████| 607/607 [00:00<00:00, 965.80it/s] 
100%|██████████| 607/607 [06:54<00:00,  1.46it/s]


Processing chrXVI


100%|██████████| 520/520 [00:00<00:00, 1020.34it/s]
100%|██████████| 520/520 [05:14<00:00,  1.65it/s]


Processing chrM


100%|██████████| 28/28 [00:00<00:00, 988.72it/s]
100%|██████████| 28/28 [00:02<00:00, 10.14it/s]


In [7]:
# Total number of valid genes, summed over every chromosome.
sum_genes = sum(map(len, chr_valid_genes.values()))

print(sum_genes)

6705
