In [3]:
import glob
import os
import itertools
from collections import defaultdict

import pandas as pd
import numpy as np
from doit.tools import register_doit_as_IPython_magic
register_doit_as_IPython_magic()

from oudelaar_tiled_capture_2019 import config, doit_tasks
# Whatever config.datasets() returns for this project.
datasets = config.datasets()
# doit's dependency database lives in the project's base folder.
doit_db = os.path.join(config.base_folder, '.doit.db')
# Command-line options that point doit at that database using the sqlite3 backend.
doit_opts = '--db-file {} --backend sqlite3'.format(doit_db)

In [4]:
def bam_sample_name(bam_path):
    """Return the name of the directory that directly contains bam_path."""

    parent_dir, _ = os.path.split(bam_path)
    return os.path.basename(parent_dir)

def bam_build(bam_path):
    """Return the second dot-separated field of the BAM file name.

    For a file called ``filtered.mm9.sorted.bam`` this is ``mm9``.
    """

    filename = os.path.basename(bam_path)
    fields = filename.split('.')
    return fields[1]

def swap_ext(input_filename, new_ext):
    """Return input_filename with its final extension replaced by new_ext.

    new_ext is appended as given, so it should include its leading dot.
    """

    stem = os.path.splitext(input_filename)[0]
    return stem + new_ext

In [5]:
# Path template for the processed ATAC counts table; the '{}' placeholder is
# filled with the genome build name when the tables are loaded.
final_peaks_table_path = config.in_data_processed('atac/fetal_liver_atac_counts.{}.table')

In [6]:
# Load the tab-separated counts table for each genome build, keyed by build name.
final_peaks_table = {build: pd.read_csv(final_peaks_table_path.format(build), sep='\t')
                     for build in ('mm9', 'mm10')}

In [7]:
# Per-build scaling factors: 1e6 divided by the column sum of every column from
# the fifth onwards, giving one factor per column label.
# NOTE(review): assumes the first four columns are peak annotation and the rest
# are per-sample counts named after the merged BAM directories — confirm.
scaling_factors = {build: (1e6 / final_peaks_table[build].iloc[:,4:].sum())
                     for build in ('mm9', 'mm10')}

In [8]:
# Path of the chrom.sizes annotation file for each genome build, keyed by build name.
genome_file = {build: config.in_data_raw('annotation/{}.chrom.sizes'.format(build))
                     for build in ('mm9', 'mm10')}

In [9]:
# Every BAM matching <sample directory>/filtered.<anything>.sorted.bam under the
# merged technical replicates folder. glob returns these in arbitrary order.
merged_bams = glob.glob(
    config.in_data_intermediate(
        'atac/merged_technical_reps/*/filtered.*.sorted.bam'))

In [10]:
# Directory that receives the bedgraph and bigwig files built by the tasks below.
bedgraph_dir = config.in_data_intermediate('hub/atac')

def bedgraph_path_from_bam(bam):
    """Return the bedgraph path for a BAM: <bedgraph_dir>/<sample>.<build>.bg."""

    bedgraph_name = '{}.{}.bg'.format(bam_sample_name(bam), bam_build(bam))
    return os.path.join(bedgraph_dir, bedgraph_name)

def bigwig_path_from_bam(bam):
    """Return the BAM's bedgraph path with its last extension replaced by '.bw'."""

    bedgraph_path = bedgraph_path_from_bam(bam)
    return swap_ext(bedgraph_path, '.bw')

In [11]:
def task_generate_bedgraphs():
    """Yield one doit task per merged BAM that writes a scaled, sorted bedgraph.

    Each task runs ``bedtools genomecov`` on the BAM with the scaling factor
    looked up for the BAM's build and sample, pipes the result through
    ``sort`` and writes it to the path given by bedgraph_path_from_bam.
    """
    for bam_path in merged_bams:
        sample = bam_sample_name(bam_path)
        genome_build = bam_build(bam_path)

        # Only the build-specific values are filled in here; the %(...)s
        # placeholders are left in the string for doit to substitute.
        command = (
            'bedtools genomecov -bg -scale {scaling_factor:.5f} '
            '-g {genome_file} -ibam %(dependencies)s '
            '| sort -k1,1 -k2,2n > %(targets)s'
        ).format(
            scaling_factor=scaling_factors[genome_build][sample],
            genome_file=genome_file[genome_build])

        yield {
            'name': '{}-{}'.format(sample, genome_build),
            'file_dep': [bam_path],
            'targets': [bedgraph_path_from_bam(bam_path)],
            'actions': [command],
        }

In [12]:
def sample_details_from_path(filepath):
    """Split a ``<sample>_<expt>_<libraries>.<build>.<ext>`` file name into parts.

    Returns a tuple ``(sample_name, expt_ids, library_ids, build)``. The sample
    name may itself contain underscores; only the last two underscore-separated
    fields are treated as experiment and library ids.
    """
    sample_id, build, ext = os.path.basename(filepath).split('.')
    tokens = sample_id.split('_')
    library_ids = tokens[-1]
    expt_ids = tokens[-2]
    sample_name = '_'.join(tokens[:-2])
    return sample_name, expt_ids, library_ids, build
    

def linked_bw_filename(bw_path):
    """Return ``<sample>_<expt>.<build>.bw`` for bw_path, dropping the library ids."""
    details = sample_details_from_path(bw_path)
    sample_name, expt_ids, build = details[0], details[1], details[3]
    return '{}_{}.{}.bw'.format(sample_name, expt_ids, build)

In [13]:
def get_merged_exptid(expt_list):
    """Combine experiment ids into a single id.

    A single-element list is returned as its only element. Otherwise the
    first four characters of every id are dropped, the remainders are parsed
    as integers, sorted, and joined with '-' after an ``expts_`` prefix.
    """
    if len(expt_list) == 1:
        return expt_list[0]

    numbers = [int(expt_id[4:]) for expt_id in expt_list]
    numbers.sort()
    return 'expts_{}'.format('-'.join(map(str, numbers)))

In [14]:
def task_generate_bigwigs():
    """Yield one doit task per (sample, build) that converts bedgraphs to a bigWig.

    Bedgraph targets from task_generate_bedgraphs are grouped by build and
    sample name. A sample with several bedgraphs gets them combined with
    ``bedtools unionbedg`` and averaged per interval by awk before the
    conversion; a sample with one bedgraph is converted directly.
    """
    # build -> sample name -> list of bedgraph paths
    grouped = {
        'mm9': defaultdict(list),
        'mm10': defaultdict(list)
    }

    for bedgraph_task in task_generate_bedgraphs():
        bg_path = bedgraph_task['targets'][0]
        details = sample_details_from_path(bg_path)
        grouped[details[3]][details[0]].append(bg_path)

    for build, by_sample in grouped.items():
        chrom_sizes = genome_file[build]

        for sample_name, sample_bedgraphs in by_sample.items():
            merged_expt_id = get_merged_exptid(
                [sample_details_from_path(bg)[1] for bg in sample_bedgraphs])
            bw_name = '{}_{}.{}.bw'.format(sample_name, merged_expt_id, build)

            if len(sample_bedgraphs) > 1:
                # Average the replicate columns (field 4 onwards) into a
                # temporary bedgraph, convert it, then remove it.
                actions = [
                    "bedtools unionbedg -i %(dependencies)s | "
                    "awk '{sum = 0; for (i = 4; i <= NF; i++) sum += $i; sum /= (NF - 3); "
                    "print $1\"\t\" $2\"\t\" $3\"\t\" sum}' > %(targets)s.temp.bg",

                    'bedGraphToBigWig '
                    '%(targets)s.temp.bg {genome_file} %(targets)s'.format(
                        genome_file=chrom_sizes),

                    'rm %(targets)s.temp.bg'
                ]
            else:
                actions = [
                    'bedGraphToBigWig '
                    '%(dependencies)s {genome_file} %(targets)s'.format(
                        genome_file=chrom_sizes)
                ]

            yield {
                'name': '{}-{}'.format(sample_name, build),
                'file_dep': sample_bedgraphs,
                'targets': [os.path.join(bedgraph_dir, bw_name)],
                'actions': actions,
            }
            
            

In [15]:
def task_link_bigwigs():
    """Yield one doit task per final bigWig that symlinks it into the public folder.

    The link is created at ``$PUBLIC_DIR/bigwigs/<bigwig file name>``.

    Raises:
        KeyError: if the PUBLIC_DIR environment variable is not set (raised
            while the tasks are being generated).
    """
    for t in task_generate_bigwigs():

        final_bw = t['targets'][0]
        bw_fname = os.path.basename(final_bw)

        link_path = os.path.join(
            os.environ['PUBLIC_DIR'], 'bigwigs', bw_fname)

        yield {
            'name': bw_fname,
            'file_dep': [final_bw],
            'targets': [link_path],
            # -f replaces a link left by an earlier run. Plain `ln -s` fails
            # with "File exists" when doit re-runs this task after the bigWig
            # has been regenerated.
            'actions': ['ln -sf %(dependencies)s %(targets)s']
        }

In [20]:
# Colour string for each sample, substituted into the `color` line of bigwig_text.
# NOTE(review): S0_CD71_low and S0_CD71_medium share the same colour — confirm
# that this is intended.
bigwig_cols = {
    'HSPC': '127,205,187',
    'LMPP': '65,182,196',
    'MPP2': '29,145,192',
    'MPP3': '34,94,168',
    'Lin': '2,129,138',
    'S0_CD71_low': '252,141,89',
    'S0_CD71_medium': '252,141,89',
    'S1': '239,101,72',
    'S2': '215,48,31',
    'S3': '179,0,0',
}

# Template for one sample's track stanza. Filled with sample, tissue,
# long_sample, link_path and color by make_bigwig_hub_file.
bigwig_text = """
    track {sample}
    parent {tissue}-ATAC-seq on
    shortLabel {sample}
    longLabel {long_sample}
    autoScale off
    viewLimits 0:10
    maxHeightPixels 100:100:8
    type bigWig
    bigDataUrl http://userweb.molbiol.ox.ac.uk{link_path}
    color {color}
        
"""

# Template for the per-tissue parent track that the sample stanzas refer to
# through their `parent` line. Filled with tissue and long_tissue.
composite_text = """
track {tissue}-ATAC-seq
compositeTrack on
type bigWig
shortLabel {tissue}-ATAC-seq
longLabel Primary mouse {long_tissue} ATAC-seq

"""

# Samples that belong to each tissue's composite track; samples not listed
# under a tissue are skipped when that tissue's stanzas are written.
tissue_samples = {
    'FL': ['Lin', 'S0_CD71_low', 'S0_CD71_medium', 'S1', 'S2', 'S3'],
    'BM': ['HSPC', 'LMPP', 'MPP2', 'MPP3']
}

def make_bigwig_hub_file(dependencies, targets):
    """Write the track definitions for the linked bigWigs to a single file.

    For each tissue (FL, then BM) the composite-track header is written,
    followed by one stanza per dependency, in sorted order, whose sample
    belongs to that tissue.

    Args:
        dependencies: paths of the linked bigWig files.
        targets: a sequence holding exactly one output path.
    """
    (hub_path,) = targets

    tissues = (('FL', 'fetal liver'), ('BM', 'bone marrow'))

    with open(hub_path, 'w') as hub_file:
        for tissue, long_tissue in tissues:
            header = composite_text.format(
                tissue=tissue, long_tissue=long_tissue)
            print(header, file=hub_file)

            for bw_link in sorted(dependencies):
                # File name up to the first dot, e.g. <sample>_expt<...>.
                long_sample = os.path.basename(bw_link).split('.')[0]
                # Everything before the first '_expt' is the sample name.
                sample = long_sample.split('_expt')[0]

                if sample in tissue_samples[tissue]:
                    stanza = bigwig_text.format(
                        tissue=tissue,
                        sample=sample,
                        long_sample=long_sample,
                        link_path=bw_link,
                        color=bigwig_cols[sample])
                    print(stanza, file=hub_file)
            
def task_bigwig_hub_file():
    """Yield one doit task per genome build that writes that build's track file.

    Each task depends on the linked bigWigs whose file name ends in
    ``.<build>.bw`` and calls make_bigwig_hub_file to write
    ``hub/tracks_file/atac-bigwig.<build>.txt``.
    """
    linked_bigwigs = [t['targets'][0] for t in task_link_bigwigs()]

    for build in 'mm9', 'mm10':

        # Match the build on the file name suffix only. A substring test on
        # the whole path would also pick up files whose directory or sample
        # name happens to contain the build string.
        build_suffix = '.{}.bw'.format(build)
        build_bigwigs = [bw_path for bw_path in linked_bigwigs
                         if os.path.basename(bw_path).endswith(build_suffix)]

        yield {
            'name': 'Making hub ({})'.format(build),
            'targets': [config.in_data_processed(
                'hub/tracks_file/atac-bigwig.{build}.txt'.format(build=build))],
            'file_dep': build_bigwigs,
            'actions': [make_bigwig_hub_file],
        }

In [21]:
# Run the task_* generators defined above through doit's IPython magic.
# NOTE(review): -n 10 presumably asks doit for 10 parallel processes, and the
# doit_opts string built in the first cell is not passed here — confirm both.
%doit -n 10

-- generate_bedgraphs:S2_expt13_RBAT37-RBAT38-mm9
-- generate_bedgraphs:S2_expt13_RBAT37-RBAT38-mm10
-- generate_bedgraphs:S0_CD71_medium_expt10_RBAT44-RBAT45-mm10
-- generate_bedgraphs:S0_CD71_medium_expt10_RBAT44-RBAT45-mm9
-- generate_bedgraphs:LMPP_expt93_RBAT95-mm9
-- generate_bedgraphs:LMPP_expt93_RBAT95-mm10
-- generate_bedgraphs:S0_CD71_low_expt35_RBAT52-mm10
-- generate_bedgraphs:S0_CD71_low_expt35_RBAT52-mm9
-- generate_bedgraphs:S1_expt10_RBAT46-RBAT47-mm10
-- generate_bedgraphs:S1_expt10_RBAT46-RBAT47-mm9
-- generate_bedgraphs:Lin_expt35_RBAT61-RBAT62-mm10
-- generate_bedgraphs:Lin_expt35_RBAT61-RBAT62-mm9
-- generate_bedgraphs:HSPC_expt93_RBAT94-mm9
-- generate_bedgraphs:HSPC_expt93_RBAT94-mm10
-- generate_bedgraphs:S3_expt13_RBAT39-RBAT40-mm9
-- generate_bedgraphs:S3_expt13_RBAT39-RBAT40-mm10
-- generate_bedgraphs:MPP3_expt93_RBAT97-mm9
-- generate_bedgraphs:MPP3_expt93_RBAT97-mm10
-- generate_bedgraphs:S3_expt35_RBAT59-RBAT60-mm9
-- generate_bedgraphs:S3_expt35_RBAT59-RB