In [18]:
from pathlib import Path
from matplotlib import pyplot as plt
import pickle

from dimelo import parse_bam, load_processed

# Base input and output directories (relative to the notebook's working directory)
test_data_dir = Path('./data')
output_dir = test_data_dir / 'test_targets'

# parents=True: create any missing parent (./data) as well, so directory
# creation itself never fails with FileNotFoundError on a fresh checkout.
# exist_ok=True keeps the cell idempotent on re-run.
output_dir.mkdir(parents=True, exist_ok=True)

# Single genomic region passed as `regions=` to the one-region pileup/extract
# runs and to the read-vector loading cells.
region = 'chr14:17376348-17378348'

# Input files shipped under the test data directory
ctcf_bam_file = test_data_dir / 'ctcf_demo.sorted.bam'
ctcf_guppy_bam_file = test_data_dir / 'winnowmap_guppy_merge_subset.updated.bam'
ctcf_target_regions = test_data_dir / 'ctcf_demo_peak.bed'
ctcf_off_target_regions = test_data_dir / 'ctcf_demo_not_peak.bed'

# Input files expected under ./output
# NOTE(review): presumably produced/downloaded by an earlier step — confirm where these come from.
ref_genome_file = Path('./output/chm13.draft_v1.0.fasta')
ctcf_bam_file_updated = Path('./output/ctcf_demo.updated.bam')

## Generate Pileups

In [11]:
# Input/output arguments that are identical for both pileup runs.
_pileup_io = dict(
    input_file=ctcf_bam_file_updated,
    ref_genome=ref_genome_file,
    output_directory=output_dir,
)

# Run 1: pileup over the on-target and off-target BED files, window_size=1000.
# The results get their own names so the second run below does not overwrite them.
# thresh=190: per the tool's log output, this is interpreted on a 0-255 scale
# (190/255) before being sent to modkit.
merged_pileup_file, merged_pileup_regions = parse_bam.pileup(
    output_name='megalodon_merged_regions',
    regions=[ctcf_target_regions,ctcf_off_target_regions],
    motifs=['A,0','CG,0'],
    thresh=190,
    window_size=1000,
    **_pileup_io,
)

# Run 2: pileup over the single region string, with no windowing (window_size=None).
# `pileup_file` / `pileup_regions` end up bound to this run, as they did before.
pileup_file, pileup_regions = parse_bam.pileup(
    output_name='megalodon_one_region',
    regions=region,
    motifs=['A,0','CG,0'],
    thresh=190,
    window_size=None,
    **_pileup_io,
)

No specified number of cores requested. 8 available on machine, allocating all.
Modification threshold of 190 assumed to be for range 0-255. 190/255=0.7450980392156863 will be sent to modkit.


          | Preprocessing   0% | 00:00

          | Processing ctcf_demo.updated.bam   0% | 00:00<?

          |    0%

No specified number of cores requested. 8 available on machine, allocating all.
Modification threshold of 190 assumed to be for range 0-255. 190/255=0.7450980392156863 will be sent to modkit.


          | Preprocessing   0% | 00:00

          | Processing ctcf_demo.updated.bam   0% | 00:00<?

          |    0%

## Generate Read Extractions

In [10]:
def _extract_one_region(output_name, thresh):
    """Run parse_bam.extract on `region` for the A and CG motifs.

    Only the output name and the threshold differ between the two runs in
    this cell; every other argument is the same and is filled in here.
    Returns whatever parse_bam.extract returns (unpacked by the callers
    below into a file and a regions value).
    """
    return parse_bam.extract(
        input_file=ctcf_bam_file_updated,
        output_name=output_name,
        ref_genome=ref_genome_file,
        output_directory=output_dir,
        regions=region,
        motifs=['A,0','CG,0'],
        thresh=thresh,
    )

# Thresholded extraction (thresh=190).
extract_file, extract_regions = _extract_one_region('megalodon_one_region', 190)

# Extraction without a threshold; per the tool's log output, raw probabilities
# are saved in this case. `extract_regions` is rebound by this second call.
extract_file_no_thresh, extract_regions = _extract_one_region(
    'megalodon_one_region_no_threshold', None
)

No specified number of cores requested. 8 available on machine, allocating all.
Modification threshold of 190 assumed to be for range 0-255. 190/255=0.7450980392156863 will be sent to modkit.


          | Preprocessing   0% | 00:00

          | Processing ctcf_demo.updated.bam   0% | 00:00<?

          |    0%

          | Transferring 24 from reads.A,0.txt into reads.combined_basemods.h5, new size 24   0% | 00:00<?

          | Preprocessing   0% | 00:00

          | Processing ctcf_demo.updated.bam   0% | 00:00<?

          |    0%

          | Transferring 24 from reads.CG,0.txt into reads.combined_basemods.h5, new size 48   0% | 00:00<?

No specified number of cores requested. 8 available on machine, allocating all.
No valid base modification threshold provided. Raw probs will be saved.


          | Preprocessing   0% | 00:00

          | Processing ctcf_demo.updated.bam   0% | 00:00<?

          |    0%

          | Transferring 24 from reads.A,0.txt into reads.combined_basemods.h5, new size 24   0% | 00:00<?

          | Preprocessing   0% | 00:00

          | Processing ctcf_demo.updated.bam   0% | 00:00<?

          |    0%

          | Transferring 24 from reads.CG,0.txt into reads.combined_basemods.h5, new size 48   0% | 00:00<?

## Save Read Data

In [22]:
# Load the per-read vectors for `region` from the thresholded extraction.
read_data_list, datasets, _ = load_processed.read_vectors_from_hdf5(
    file=extract_file, # binarized modification calls
    regions=region,
    motifs=['CG,0','A,0'],
    sort_by = ['chromosome','read_start','read_name']
)

# Take the first read directly instead of looping and breaking immediately.
# Fail loudly if nothing was loaded: an empty dict would otherwise be pickled
# silently and look like a valid test target.
read_data = next(iter(read_data_list), None)
if read_data is None:
    raise ValueError(f'No reads loaded for {region} from {extract_file}')

# Key each field of the first read by its dataset name; a name's position in
# `datasets` is the index of the matching field in the read record.
read_data_dict = {dataset: read_data[idx] for idx, dataset in enumerate(datasets)}

# Save (not print) the first read's data as a pickle in the output directory.
with open(output_dir / f'first_read_{region.replace(":","-")}_binarized_190.pkl', 'wb') as f:
    pickle.dump(read_data_dict, f)

In [23]:
# Load the per-read vectors for `region` from the no-threshold extraction.
read_data_list, datasets, _ = load_processed.read_vectors_from_hdf5(
    file=extract_file_no_thresh, # raw modification probabilities
    regions=region,
    motifs=['CG,0','A,0'],
    sort_by = ['chromosome','read_start','read_name']
)

# Take the first read directly instead of looping and breaking immediately.
# Fail loudly if nothing was loaded: an empty dict would otherwise be pickled
# silently and look like a valid test target.
read_data = next(iter(read_data_list), None)
if read_data is None:
    raise ValueError(f'No reads loaded for {region} from {extract_file_no_thresh}')

# Key each field of the first read by its dataset name; a name's position in
# `datasets` is the index of the matching field in the read record.
read_data_dict = {dataset: read_data[idx] for idx, dataset in enumerate(datasets)}

# Save (not print) the first read's data as a pickle in the output directory.
with open(output_dir / f'first_read_{region.replace(":","-")}_probs.pkl', 'wb') as f:
    pickle.dump(read_data_dict, f)