In [1]:
#!/usr/bin/env python
# coding: utf-8


import argparse
from joblib import Parallel, delayed
import numpy as np
import os
import sys

sys.path.append("/dors/capra_lab/users/fongsl/tools/py_/")
sys.path.append("/dors/capra_lab/users/fongsl/tools/genome/")

import config_readwrite as crw
import chr_functions
import split_filename

In [None]:
"""
Output: shuffled .bed file(s), written to SHUF_PATH as shuf-<sample_id>-<iter>.bed

Input: .bed file.

Functions: 
- bedtools shuffle. 

Notes
- length match
- chromosome match


"""

###
#   arguments
###

arg_parser = argparse.ArgumentParser(description= "compute sequence identity between two species")

arg_parser.add_argument("b","--bedfile", help='.bed file in species 1 coordinates')
arg_parser.add_argument("i","--iters", help='number of shuffles')
arg_parser.add_argument("g","--genome_build", help='genome_build')
arg_parser.add_argument("inc","--include", help='regions to include in shuffle')


args = arg_parser.parse_args()

TEST_BED = args.bedfile
ITERS = args.iters
BUILD = args.genome_build
INCLUDE = args.include

In [29]:
# Hard-coded inputs for interactive (notebook) runs. These reassign the same
# module-level names that the argparse cell fills from the command line.
# .bed file whose regions are shuffled.
TEST_BED = "/data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/global_cis_trans/cis+trans-defined_regions/cis_HH-vs-MH.regions.bed"
# main() runs ITERS parallel shuffles, then one more shuffle labelled ITERS,
# so 0 yields a single output file with suffix "-0".
ITERS = 0
# Key into the per-build path table in loadConstants().
BUILD = "hg38"
# Passed to `bedtools shuffle -incl` when not None (see shuffle()).
INCLUDE = "/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/bkgd_sharedAcc_regions/code-0000_merged.bed"

In [None]:
# split_filename is a project helper not visible here; presumably it returns
# (directory, file name, sample id) for the input path -- TODO confirm.
PATH, FILENAME, SAMPLE_ID = split_filename.split_filename(TEST_BED)
# Directory that receives the shuffled .bed files.
SHUF_PATH = "/data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/bkgd_sharedAcc_regions"

# makedirs(exist_ok=True) creates missing parent directories and does not fail
# if the directory already exists (or is created concurrently by another job),
# unlike the check-then-mkdir pattern.
os.makedirs(SHUF_PATH, exist_ok=True)

In [43]:
###
#   functions
###


def loadConstants(build):
    """Return the (blacklist, chrom sizes) file paths registered for a genome build.

    Parameters
    ----------
    build : str
        Genome build key; one of 'hg19', 'hg38', 'mm10'.

    Returns
    -------
    tuple of (str, str)
        Path to the blacklist .bed file and path to the chromosome sizes file.

    Raises
    ------
    KeyError
        If `build` is not one of the registered keys.
    """
    blacklist_by_build = {
        'hg19': "/dors/capra_lab/users/fongsl/data/hg19_blacklist_gap_ensemblexon.bed",
        'hg38': "/dors/capra_lab/users/fongsl/data/hg38_blacklist_gap_ensemblexon.bed",
        'mm10': "/dors/capra_lab/users/bentonml/data/dna/mm10/mm10_blacklist_gap.bed",
    }
    sizes_by_build = {
        'hg19': "/dors/capra_lab/data/dna/human/hg19/hg19_trim.chrom.sizes",
        'hg38': "/dors/capra_lab/data/dna/human/hg38/hg38_trim.chrom.sizes",
        'mm10': "/dors/capra_lab/data/dna/mouse/mm10/mm10_trim.chrom.sizes",
    }

    # Look up the blacklist first so an unknown build raises KeyError(build).
    excl_bed = blacklist_by_build[build]
    chrom_sizes = sizes_by_build[build]
    return excl_bed, chrom_sizes


def shuffle(test_bed, shuf_path, sample_id, iter_, build, include):
    """Run one `bedtools shuffle` of `test_bed` and write the result to a .bed file.

    The command is run as an argument list (no shell), so paths containing
    spaces or shell metacharacters are passed through verbatim, and a failing
    bedtools run raises instead of silently leaving an empty/partial file.

    Parameters
    ----------
    test_bed : str
        Path to the .bed file to shuffle.
    shuf_path : str
        Directory the output file is written into.
    sample_id : str
        Label used in the output file name.
    iter_ : int
        Iteration label used in the output file name.
    build : str
        Genome build key understood by loadConstants().
    include : str or None
        If not None, path passed to `-incl`; `test_bed` itself is then passed
        to `-excl`. If None, the build's blacklist file is passed to `-excl`
        and `-maxTries 5000` is added.

    Returns
    -------
    str
        Path of the written file, `<shuf_path>/shuf-<sample_id>-<iter_>.bed`.

    Raises
    ------
    subprocess.CalledProcessError
        If bedtools exits with a non-zero status.
    FileNotFoundError
        If the `bedtools` executable cannot be found.
    """
    import subprocess  # local import: only this function needs it

    out = os.path.join(shuf_path, f"shuf-{sample_id}-{iter_}.bed")  # write file

    BLACKLIST, CHROM_SZ = loadConstants(build)

    # Options shared by both modes.
    cmd = ["bedtools", "shuffle", "-i", str(test_bed), "-g", str(CHROM_SZ),
           "-chrom", "-noOverlapping"]

    if include is not None:
        # -maxTries is intentionally not set in this mode (as in the original command).
        cmd += ["-incl", str(include), "-excl", str(test_bed)]
    else:
        cmd += ["-excl", str(BLACKLIST), "-maxTries", "5000"]

    # Redirect bedtools' stdout into the output file (replaces the shell `> out`).
    with open(out, "w") as handle:
        subprocess.run(cmd, stdout=handle, check=True)

    return out

In [44]:
###
#   Main
###


def main(argv):
    """Run ITERS shuffles in parallel threads, then one more labelled ITERS.

    All inputs are read from module-level settings (TEST_BED, SHUF_PATH,
    SAMPLE_ID, ITERS, BUILD, INCLUDE). Output files are labelled 0..ITERS-1 by
    the parallel loop and ITERS by the final call, so ITERS + 1 files are
    written in total (ITERS = 0 yields a single file labelled 0).

    Parameters
    ----------
    argv : list of str
        Unused; kept so existing callers that pass sys.argv[1:] still work.
    """

    #num_cores = multiprocessing.cpu_count()
    num_cores = 16
    print("number of cores", num_cores)

    # ITERS is a string when it comes straight from the command line; coerce it
    # once so it can be used as a loop count.
    n_iters = int(ITERS)

    # run parallel jobs
    Parallel(n_jobs=num_cores, verbose=100, prefer="threads")(
        delayed(shuffle)(TEST_BED, SHUF_PATH, SAMPLE_ID, i, BUILD, INCLUDE)
        for i in range(n_iters)
    )

    # One additional shuffle labelled with the iteration count itself.
    shuffle(TEST_BED, SHUF_PATH, SAMPLE_ID, n_iters, BUILD, INCLUDE)

if __name__ == "__main__":
    main(sys.argv[1:])

number of cores 16
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   0 out of   0 | elapsed:    0.0s finished


## The `-chrom` option is not working as expected: per-chromosome counts in the shuffled file do not match the input, especially for chr1, chr2, and chr3

In [45]:
%%bash
# Count regions per chromosome (column 1) in the input .bed file.
cut -f 1 /data/hodges_lab/ATAC-STARR_B-cells/results/results_human-evolution/global_cis_trans/cis+trans-defined_regions/cis_HH-vs-MH.regions.bed | sort | uniq -c


   1723 chr1
    734 chr10
    865 chr11
    900 chr12
    322 chr13
    651 chr14
    547 chr15
    659 chr16
    827 chr17
    231 chr18
    892 chr19
   1209 chr2
    421 chr20
    162 chr21
    332 chr22
   1017 chr3
    565 chr4
    871 chr5
    909 chr6
    770 chr7
    609 chr8
    616 chr9


In [46]:
%%bash
# Same per-chromosome count for the shuffled output labelled 0, for comparison with the input counts.
cut -f 1 /data/hodges_lab/ATAC-STARR_B-cells/data/hansen-fong/bkgd_sharedAcc_regions/shuf-cis_HH-vs-MH.regions-0.bed | sort | uniq -c

    750 chr1
    758 chr10
    777 chr11
    731 chr12
    600 chr13
    703 chr14
    719 chr15
    760 chr16
    730 chr17
    599 chr18
    763 chr19
    763 chr2
    701 chr20
    527 chr21
    689 chr22
    795 chr3
    696 chr4
    758 chr5
    750 chr6
    784 chr7
    718 chr8
    761 chr9
