This is the integration test for BPReveal. It trains an OSKN (Oct4, Sox2, Klf4, Nanog) model and runs the full suite of analyses on it.

# Setup

In [None]:
import os
os.environ["PATH"] = os.environ["PATH"] + ":/n/apps/CentOS7/bin/"
import bpreveal
print(bpreveal.__version__)
import bpreveal.utils as utils
from bpreveal.tools.slurm import configSlurm, jobsNonGpu, jobsGpu, jobsLocal, writeDependencyScript
import json
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10,8]
plt.rcParams['figure.dpi'] = 150
import numpy as np
import pybedtools
import pysam
import pyBigWig
import h5py
import bpreveal.plotting as bprplots
import bpreveal.motifUtils as motifUtils
import bpreveal.colors as bprcolors
import tqdm

In [None]:
# --- Where the code, the input data, and the outputs live ---
BASE_DIRECTORY = "/n/projects/cm2363/public-bpreveal/5.2.0-rc2/repo"
BPREVEAL_VERSION = "5.2.0"
WORKING_DIRECTORY = f"{BASE_DIRECTORY}/test/acceptance/oskn-5.2.0"
DATA_DIRECTORY = "/n/projects/cm2363/bpreveal/demoData/oskn"
SCRIPTS_DIR = "/n/projects/cm2363/manuscript-bpreveal/src"
SRC_DIR = f"{BASE_DIRECTORY}/src"
# Despite the name, this is a path to the environment directory.
CONDA_ENV_NAME = "/n/projects/cm2363/public-bpreveal/5.2.0-rc2/env"
GENOME_FASTA = "/n/data1/genomes/indexes/mm10/mm10.fa"

# Shared Slurm configuration handed to every jobsGpu/jobsNonGpu call below.
SLURM_CONFIG = configSlurm(
    ["/home/cm2363/.zshrc"],
    CONDA_ENV_NAME,
    WORKING_DIRECTORY,
    maxJobs=64)

# --- Transcription factors and chromosome splits ---
TF_NAMES = ["oct4", "sox2", "klf4", "nanog"]
TEST_CHROMS = [f"chr{num}" for num in (1, 8, 9)]
VAL_CHROMS = [f"chr{num}" for num in (2, 3, 4)]
TRAIN_CHROMS = [f"chr{num}" for num in
                (5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)]

# --- Run-wide knobs ---
NUM_THREADS_MAJOR = 70
NUM_THREADS_MINOR = 20
# When True, constructCommand wraps each tool in "coverage run".
COLLECT_COVERAGE = True
NUM_EPOCHS = 200
LOG_LEVEL = "INFO"

# --- Genomic window used for the PISA analysis (padded by 1 kb on each side) ---
windowChrom = "chr1"
windowStart = 180924752 - 1000
windowEnd = 180925152 + 1000
windowLen = windowEnd - windowStart

# Four background probabilities: 0.29, 0.21, 0.21, 0.29.
# NOTE(review): presumably A, C, G, T frequencies - confirm the base order.
bgProbs = [(1 - 0.42) / 2, 0.21, 0.21, (1 - 0.42) / 2]

# Maps "<tf>_<model>_<readout>" to {"pos": [[index, shortName], ...]}, where
# shortName is the first letter of the TF, the first letter of the readout,
# and the index (for example "op0").
patternsToScan = {}
for tf in TF_NAMES:
    for model in ["solo", "residual", "combined", "transformation"]:
        for mode in ["profile", "counts"]:
            patNames = [[idx, f"{tf[0]}{mode[0]}{idx}"] for idx in range(5)]
            patternsToScan[f"{tf}_{model}_{mode}"] = {"pos": patNames}


In [None]:
def constructCommand(executable, coverage=COLLECT_COVERAGE, shortProgName=None):
    """Return the shell prefix that launches ``executable``.

    The returned string always ends with a space so that arguments can be
    appended directly.

    :param executable: Name of the program to run.
    :param coverage: If true, wrap the program in ``coverage run`` and resolve
        its path with ``which``. Defaults to the value COLLECT_COVERAGE had
        when this function was defined.
    :param shortProgName: Accepted for compatibility; it does not affect the
        returned string.
    :return: The command prefix.
    """
    launcher = f"{executable} "
    if not coverage:
        return launcher
    return f"coverage run $(which {launcher}) "

In [None]:
# Create every output directory the later cells write into.
# os.makedirs is used instead of shelling out to mkdir so that a failure
# (bad permissions, a file in the way) raises here instead of only printing
# a message and surfacing later as a missing-directory error.
for subdir in ["input", "bed", "json", "logs", "models",
               "modisco", "pred", "shap", "slurm", "scan"]:
    os.makedirs(f"{WORKING_DIRECTORY}/{subdir}", exist_ok=True)



In [None]:
# Write the coverage configuration file into the slurm directory.
# The file is assembled one line per entry; indented entries are
# continuation values of the option on the preceding line.
coverageRcLines = [
    "[run]",
    "branch = True",
    "concurrency = multiprocessing",
    "omit = /tmp/*",
    "disable_warnings =",
    "    module-not-measured",
    "    module-not-imported",
    "parallel = true",
    f"source = {SRC_DIR}",
    "    bpreveal",
    "",
    "[report]",
    "exclude_also =",
    "    assert",
]
with open(f"{WORKING_DIRECTORY}/slurm/.coveragerc", "w") as fp:
    fp.write("\n".join(coverageRcLines) + "\n")


In [None]:
# Start building a list of jobs to run with dependencies.
# Every entry appended in the cells below is a two-element list:
# [value returned by jobsGpu/jobsNonGpu, [the same kind of value for each job it depends on]].
jobSpecs = []

# Length calc

In [None]:
# Model geometry. The output length and the two kernel sizes are passed to the
# lengthCalc command-line tool, and the first line it prints is parsed as the
# integer INPUT_LENGTH.
OUTPUT_LENGTH=1000
CONV1_SIZE=7
PROFILE_CONV_SIZE=7
# IPython shell capture: input_length_str holds the lines printed by lengthCalc.
input_length_str = !lengthCalc --output-len {OUTPUT_LENGTH} \
                               --n-dil-layers 9 \
                               --conv1-kernel-size {CONV1_SIZE} \
                               --profile-kernel-size {PROFILE_CONV_SIZE}
INPUT_LENGTH=int(input_length_str[0])
print(INPUT_LENGTH)
# Number of extra input bases relative to the output, plus one.
RECEPTIVE_FIELD=INPUT_LENGTH - OUTPUT_LENGTH + 1
print(RECEPTIVE_FIELD)
# Half of the input/output length difference; writeRegion subtracts this from
# a region start to find where the input sequence begins.
BUFFER = (INPUT_LENGTH - OUTPUT_LENGTH) // 2
print(BUFFER)
# Jitter value written into the prepareBed and prepareTrainingData configs.
MAX_JITTER = 100

In [None]:
# Also run lengthCalc as a job of its own (with kernel sizes of 25), so that
# it goes through constructCommand like every other tool. It has no dependencies.
# NOTE(review): the numeric arguments to jobsNonGpu are presumably cores and
# memory, followed by the time limit - confirm against bpreveal.tools.slurm.
checkLengthCmd = (constructCommand("lengthCalc")
                  + f"--output-len {OUTPUT_LENGTH:d}  --n-dil-layers 9 "
                  "--conv1-kernel-size 25 --profile-kernel-size 25")
slurmNameCheckLength = jobsNonGpu(
    SLURM_CONFIG, [checkLengthCmd], "checkLengthCalc", 1, 1, "0:01:00")
jobSpecs.append([slurmNameCheckLength, []])

# Prepare bed

In [None]:
# One [positive, negative] pair of bigwig paths per transcription factor,
# in TF_NAMES order.
bigwigFileNames = [
    [f"{DATA_DIRECTORY}/{tfName}/counts.{strand}.bw" for strand in ("pos", "neg")]
    for tfName in TF_NAMES]
# Peak regions: the summit bed of every TF, followed by the per-TF files
# from the peaks-bak directory.
summitBedFnames = (
    [f"{DATA_DIRECTORY}/{tfName}/idr-optimal-set.summit.bed" for tfName in TF_NAMES]
    + [f"{DATA_DIRECTORY}/peaks-bak/{tfName}.bed" for tfName in TF_NAMES])
# One head specification per TF for the peak prepareBed run.
headSpec = [
    {"bigwig-names": strandPair, "max-quantile": 1, "min-counts": 1}
    for strandPair in bigwigFileNames]


In [None]:
# Configuration for the prepareBed run over the peak regions.
# Its outputs are written with the prefix <working dir>/bed/peak.
prepareBedPeaksConfig = {
    "heads": headSpec,
    "splits": {
        "test-chroms": TEST_CHROMS,
        "val-chroms": VAL_CHROMS,
        "train-chroms": TRAIN_CHROMS,
        "regions": summitBedFnames,
    },
    "genome": GENOME_FASTA,
    "output-length": OUTPUT_LENGTH,
    "input-length": INPUT_LENGTH,
    "max-jitter": MAX_JITTER,
    "output-prefix": f"{WORKING_DIRECTORY}/bed/peak",
    "resize-mode": "center",
    "remove-overlaps": True,
    "overlap-max-distance": 100,
    "num-threads": NUM_THREADS_MAJOR,
    "verbosity": LOG_LEVEL,
}

with open(f"{WORKING_DIRECTORY}/json/prepareBedPeaks.json", "w") as fp:
    json.dump(prepareBedPeaksConfig, fp, indent=4)


In [None]:
# Job that runs prepareBed on the peak configuration; it has no dependencies.
prepareBedPeaksCmd = (constructCommand("prepareBed")
                      + f" {WORKING_DIRECTORY}/json/prepareBedPeaks.json")
slurmNamePrepareBedPeaks = jobsNonGpu(
    SLURM_CONFIG, [prepareBedPeaksCmd],
    "prepareBedPeaks", NUM_THREADS_MAJOR, 50, "1:00:00")
jobSpecs.append([slurmNamePrepareBedPeaks, []])


# Tile Genome

In [None]:
# Argument template for tileGenome. Positional fields: genome fasta, output
# length, input length, output bed, blacklist arguments, chromosome arguments.
backgroundBase = ("--genome {0:s} --output-length {1:d} --input-length {2:d} "
                  "--chrom-edge-boundary 100000 --spacing 10000 --output-bed {3:s} "
                  "{4:s} {5:s}")
# Blacklist both the accepted and the rejected peak beds written by the
# prepareBedPeaks job.
blacklistArgs = (f"--blacklist {WORKING_DIRECTORY}/bed/peak_all.bed "
                 f"--blacklist {WORKING_DIRECTORY}/bed/peak_reject.bed")

# Restrict the tiling to the chromosomes used in the three splits.
chromArgs = " ".join(
    f"--allow-chrom {chrom}"
    for chrom in TRAIN_CHROMS + TEST_CHROMS + VAL_CHROMS)

cmdGenBackground = constructCommand("tileGenome") + backgroundBase.format(
    GENOME_FASTA,
    OUTPUT_LENGTH,
    INPUT_LENGTH,
    f"{WORKING_DIRECTORY}/bed/tiling_all.bed",
    blacklistArgs,
    chromArgs)

# The blacklist files come from prepareBedPeaks, so that job must finish first.
slurmNameGenBackground = jobsNonGpu(
    SLURM_CONFIG, [cmdGenBackground], "genBackground", 1, 10, "1:00:00")
jobSpecs.append([slurmNameGenBackground, [slurmNamePrepareBedPeaks]])

In [None]:
# [positive, negative] bigwig paths from the patchcap directory, used for the
# bias (solo) model.
biasBigwigFnames = [f"{DATA_DIRECTORY}/patchcap/counts.{strand}.bw"
                    for strand in ("pos", "neg")]

In [None]:
# Head specifications for the non-peak regions: one per TF bigwig pair with
# quantile limits of 0.01 to 0.6, then one for the patchcap bigwigs with
# limits of 0.1 to 0.95.
biasHeadSpec = [
    {"bigwig-names": strandPair, "max-quantile": 0.6, "min-quantile": 0.01}
    for strandPair in bigwigFileNames]
biasHeadSpec.append({
    "bigwig-names": biasBigwigFnames,
    "max-quantile": 0.95,
    "min-quantile": 0.1})

# Configuration for the prepareBed run over the tiled (non-peak) genome.
prepareBedNonPeaksConfig = {
    "heads": biasHeadSpec,
    "splits": {
        "test-chroms": TEST_CHROMS,
        "val-chroms": VAL_CHROMS,
        "train-chroms": TRAIN_CHROMS,
        "regions": [f"{WORKING_DIRECTORY}/bed/tiling_all.bed"],
    },
    "genome": GENOME_FASTA,
    "output-length": OUTPUT_LENGTH,
    "input-length": INPUT_LENGTH,
    "max-jitter": MAX_JITTER,
    "output-prefix": f"{WORKING_DIRECTORY}/bed/nonpeak",
    "remove-overlaps": False,
    "resize-mode": "center",
    "num-threads": NUM_THREADS_MAJOR,
    "verbosity": LOG_LEVEL,
}

with open(f"{WORKING_DIRECTORY}/json/prepareBedNonPeaks.json", "w") as fp:
    json.dump(prepareBedNonPeaksConfig, fp)

In [None]:
# Job that runs prepareBed on the non-peak configuration. It reads
# tiling_all.bed, so it waits for the genBackground job.
prepareBedNonPeaksCmd = (constructCommand("prepareBed")
                         + f"{WORKING_DIRECTORY}/json/prepareBedNonPeaks.json")
slurmNamePrepareBedNonPeaks = jobsNonGpu(
    SLURM_CONFIG, [prepareBedNonPeaksCmd],
    "prepareBedNonPeaks", NUM_THREADS_MAJOR, 50, "1:00:00")
jobSpecs.append([slurmNamePrepareBedNonPeaks, [slurmNameGenBackground]])

# Building the training dataset

In [None]:
# Write one prepareTrainingData configuration per (split, dataset) pair and
# collect the file names so a single job can run all four.
configFnames = []
for split in ["train", "val"]:
    for dataset in ["peak", "nonpeak"]:
        # Every dataset gets one head per TF. Peak heads read that TF's own
        # bigwigs; non-peak heads all read the patchcap bigwigs.
        heads = [
            {"revcomp-task-order": "auto",
             "bigwig-files": (bigwigFileNames[tfId] if dataset == "peak"
                              else biasBigwigFnames)}
            for tfId in range(len(TF_NAMES))]
        config = {
            "genome": GENOME_FASTA,
            "input-length": INPUT_LENGTH,
            "output-length": OUTPUT_LENGTH,
            "max-jitter": MAX_JITTER,
            "regions": f"{WORKING_DIRECTORY}/bed/{dataset}_{split}.bed",
            "output-h5": f"{WORKING_DIRECTORY}/input/{dataset}_{split}.h5",
            "reverse-complement": True,
            "heads": heads,
            "verbosity": LOG_LEVEL,
        }
        configFname = f"{WORKING_DIRECTORY}/json/prepareInput{dataset}_{split}.json"
        with open(configFname, "w") as fp:
            json.dump(config, fp, indent=2)
        configFnames.append(configFname)

# prepareBedNonPeaks is the last of the bed-producing jobs in the dependency
# chain, so waiting on it is enough.
slurmNamePrepareTrainingData = jobsNonGpu(
    SLURM_CONFIG,
    [constructCommand("prepareTrainingData") + fname for fname in configFnames],
    "prepareTrainingData", 2, 20, "1:00:00")
jobSpecs.append([slurmNamePrepareTrainingData, [slurmNamePrepareBedNonPeaks]])
        



# Training the bias model

In [None]:

# One two-task head per transcription factor for the bias (solo) model.
heads = [
    {"num-tasks": 2,
     "profile-loss-weight": 1,
     "head-name": f"patchcap_{tfName}",
     "counts-loss-weight": 10,
     "counts-loss-frac-target": 0.1}
    for tfName in TF_NAMES]

# Training configuration for the solo model, trained on the non-peak data.
biasTrainConfig = {
    "settings": {
        "output-prefix": f"{WORKING_DIRECTORY}/models/solo",
        "epochs": NUM_EPOCHS,
        # Use the same constant the training data was prepared with, rather
        # than a second copy of the literal that could drift out of sync.
        "max-jitter": MAX_JITTER,
        "early-stopping-patience": 20,
        "batch-size": 128,
        "learning-rate": 0.004,
        "learning-rate-plateau-patience": 5,
        "architecture": {
            "architecture-name": "bpnet",
            "input-length": INPUT_LENGTH,
            "output-length": OUTPUT_LENGTH,
            "model-name": "patchcap",
            "model-args": "",
            "filters": 16,
            "layers": 9,
            "input-filter-width": CONV1_SIZE,
            "output-filter-width": PROFILE_CONV_SIZE,
        },
    },
    "train-data": f"{WORKING_DIRECTORY}/input/nonpeak_train.h5",
    "val-data": f"{WORKING_DIRECTORY}/input/nonpeak_val.h5",
    "heads": heads,
    "verbosity": LOG_LEVEL,
}

with open(f"{WORKING_DIRECTORY}/json/trainBias.json", "w") as fp:
    json.dump(biasTrainConfig, fp, indent=4)
    

In [None]:
# GPU job that trains the solo model once the training data has been prepared.
trainSoloCmd = (constructCommand("trainSoloModel")
                + f"{WORKING_DIRECTORY}/json/trainBias.json")
slurmNameTrainSoloModel = jobsGpu(
    SLURM_CONFIG, [trainSoloCmd], "trainSolo", 10, 30, "10:00:00")
jobSpecs.append([slurmNameTrainSoloModel, [slurmNamePrepareTrainingData]])


# makePredictions

In [None]:
# Prediction configuration for the solo model over all peak regions.
biasPredictConfig = {
    "settings": {
        "output-h5": f"{WORKING_DIRECTORY}/pred/patchcap.h5",
        "batch-size": 128,
        "heads": len(TF_NAMES),
        "architecture": {
            "model-file": f"{WORKING_DIRECTORY}/models/solo.keras",
            "input-length": INPUT_LENGTH,
            "output-length": OUTPUT_LENGTH,
        },
    },
    "genome": GENOME_FASTA,
    "bed-file": f"{WORKING_DIRECTORY}/bed/peak_all.bed",
    "num-threads": 4,
    "verbosity": LOG_LEVEL,
}

with open(f"{WORKING_DIRECTORY}/json/predictBias.json", "w") as fp:
    json.dump(biasPredictConfig, fp)

In [None]:
# Non-GPU job that makes predictions with the trained solo model.
predictSoloCmd = (constructCommand("makePredictions")
                  + f"{WORKING_DIRECTORY}/json/predictBias.json")
slurmNamePredictSolo = jobsNonGpu(
    SLURM_CONFIG, [predictSoloCmd], "predictSolo", 10, 50, "10:00:00")
jobSpecs.append([slurmNamePredictSolo, [slurmNameTrainSoloModel]])

# PredictToBigwig

In [None]:

# Argument template for predictToBigwig on the solo predictions.
bwCmdBase = ("--h5 {wd:s}/pred/patchcap.h5 "
             "--bw {wd:s}/pred/{outf:s}.bw "
             "--head-id {hid:d} --task-id {tid:d} --mode profile "
             "--threads {nt:d}")
# One command per (head, strand): the head index follows TF_NAMES order, and
# task 0 is written as "positive", task 1 as "negative".
bwCmds = [
    constructCommand("predictToBigwig") + bwCmdBase.format(
        wd=WORKING_DIRECTORY,
        outf=f"{tfname}_solo_{strand}",
        hid=headid,
        tid=tid,
        nt=NUM_THREADS_MINOR)
    for headid, tfname in enumerate(TF_NAMES)
    for tid, strand in enumerate(["positive", "negative"])]

slurmNamePredToBigwigSolo = jobsNonGpu(
    SLURM_CONFIG, bwCmds,
    "predToBigwigSolo", NUM_THREADS_MINOR, 20, "1:00:00")

jobSpecs.append([slurmNamePredToBigwigSolo, [slurmNamePredictSolo]])


# makeLossPlots

In [None]:
# Plot the solo model's training history (solo.history.json) to solo.png.
cmdLossPlots = (
    constructCommand("makeLossPlots")
    + f" --json {WORKING_DIRECTORY}/models/solo.history.json"
    f" --output {WORKING_DIRECTORY}/models/solo.png")
slurmNameLossPlotsSolo = jobsNonGpu(
    SLURM_CONFIG, [cmdLossPlots], "lossPlots", 1, 20, "1:00:00")
jobSpecs.append([slurmNameLossPlotsSolo, [slurmNameTrainSoloModel]])

# Training the transformation model

In [None]:
# One two-task head per transcription factor for the transformation model.
heads = [
    {"num-tasks": 2,
     "profile-loss-weight": 1,
     "head-name": f"patchcap_{tfName}",
     "counts-loss-weight": 100,
     "counts-loss-frac-target": 0.1}
    for tfName in TF_NAMES]

# Training configuration for the transformation model: it starts from the
# trained solo model and is fitted on the peak data.
transformationTrainConfig = {
    "settings": {
        "output-prefix": f"{WORKING_DIRECTORY}/models/transformation",
        "epochs": NUM_EPOCHS,
        "early-stopping-patience": 4,
        "batch-size": 128,
        "learning-rate": 0.04,
        "learning-rate-plateau-patience": 2,
        "solo-model-file": f"{WORKING_DIRECTORY}/models/solo.keras",
        "input-length": INPUT_LENGTH,
        "output-length": OUTPUT_LENGTH,
        # Use the same constant the training data was prepared with, rather
        # than a second copy of the literal that could drift out of sync.
        "max-jitter": MAX_JITTER,
        "profile-architecture": {
            "name": "simple",
            "types": ["linear", "sigmoid"],
        },
        "counts-architecture": {
            "name": "simple",
            "types": ["linear", "sigmoid"],
        },
    },
    "train-data": f"{WORKING_DIRECTORY}/input/peak_train.h5",
    "val-data": f"{WORKING_DIRECTORY}/input/peak_val.h5",
    "heads": heads,
    "verbosity": LOG_LEVEL,
}
with open(f"{WORKING_DIRECTORY}/json/trainTransformation.json", "w") as fp:
    json.dump(transformationTrainConfig, fp)

In [None]:
# GPU job that trains the transformation model; it needs the solo model file.
trainTransformationCmd = (constructCommand("trainTransformationModel")
                          + f" {WORKING_DIRECTORY}/json/trainTransformation.json")
slurmNameTrainTransformation = jobsGpu(
    SLURM_CONFIG, [trainTransformationCmd],
    "trainTransformation", 10, 60, "10:00:00")
jobSpecs.append([slurmNameTrainTransformation, [slurmNameTrainSoloModel]])

# Transformation prediction

In [None]:
# Prediction configuration for the transformation model over all peak regions.
transformPredictConfig = {
    "settings": {
        "output-h5": f"{WORKING_DIRECTORY}/pred/transformation.h5",
        "batch-size": 128,
        "heads": len(TF_NAMES),
        "architecture": {
            "model-file": f"{WORKING_DIRECTORY}/models/transformation.keras",
            "input-length": INPUT_LENGTH,
            "output-length": OUTPUT_LENGTH,
        },
    },
    "genome": GENOME_FASTA,
    "bed-file": f"{WORKING_DIRECTORY}/bed/peak_all.bed",
    "num-threads": 1,
    "verbosity": LOG_LEVEL,
}

with open(f"{WORKING_DIRECTORY}/json/predictTransformation.json", "w") as fp:
    json.dump(transformPredictConfig, fp)


In [None]:
# Job that makes predictions with the trained transformation model.
predictTransformationCmd = (
    constructCommand("makePredictions")
    + f" {WORKING_DIRECTORY}/json/predictTransformation.json")
slurmNamePredictTransformation = jobsNonGpu(
    SLURM_CONFIG, [predictTransformationCmd],
    "predictTransformation", 2, 50, "10:00:00")
jobSpecs.append([slurmNamePredictTransformation, [slurmNameTrainTransformation]])

# Full predictToBigwig command template; here the launcher prefix is already
# part of the template.
bwCmdBase = constructCommand("predictToBigwig") + (
    "--h5 {wd:s}/pred/transformation.h5 "
    "--bw {wd:s}/pred/{outf:s}.bw "
    "--head-id {hid:d} --task-id {tid:d} --mode profile "
    "--threads {nt:d}")

# One command per (head, strand), in TF_NAMES order.
bwCmds = [
    bwCmdBase.format(
        wd=WORKING_DIRECTORY,
        outf=f"{tfname}_transformation_{strand}",
        hid=headid,
        tid=tid,
        nt=NUM_THREADS_MINOR)
    for headid, tfname in enumerate(TF_NAMES)
    for tid, strand in enumerate(["positive", "negative"])]

slurmNamePredToBigwigTransformation = jobsNonGpu(
    SLURM_CONFIG, bwCmds,
    "predToBigwigTransformation", NUM_THREADS_MINOR, 20, "1:00:00")

jobSpecs.append([slurmNamePredToBigwigTransformation, [slurmNamePredictTransformation]])


# Training the combined model

In [None]:
# One two-task head per transcription factor for the combined model.
heads = [
    {"num-tasks": 2,
     "profile-loss-weight": 1,
     "head-name": f"combined_{tfName}",
     "counts-loss-weight": 100,
     "counts-loss-frac-target": 0.1,
     # Just to test the system, there's no reason to do this in a real model.
     "use-bias-counts": i == 0}
    for i, tfName in enumerate(TF_NAMES)]

# Training configuration for the combined model, built on top of the trained
# transformation model and fitted on the peak data.
combinedTrainConfig = {
    "settings": {
        "output-prefix": f"{WORKING_DIRECTORY}/models/joint",
        "epochs": NUM_EPOCHS,
        "early-stopping-patience": 13,
        "batch-size": 128,
        "learning-rate": 0.004,
        "learning-rate-plateau-patience": 5,
        # Use the same constant the training data was prepared with, rather
        # than a second copy of the literal that could drift out of sync.
        "max-jitter": MAX_JITTER,
        "transformation-model": {
            "transformation-model-file":
                f"{WORKING_DIRECTORY}/models/transformation.keras",
        },
        "architecture": {
            "architecture-name": "bpnet",
            "input-length": INPUT_LENGTH,
            "output-length": OUTPUT_LENGTH,
            "model-name": "joint",
            "model-args": "",
            "filters": 64,
            "layers": 9,
            # INPUT_LENGTH was computed by lengthCalc from these two kernel
            # sizes, so take them from the same constants.
            "input-filter-width": CONV1_SIZE,
            "output-filter-width": PROFILE_CONV_SIZE,
        },
    },
    "train-data": f"{WORKING_DIRECTORY}/input/peak_train.h5",
    "val-data": f"{WORKING_DIRECTORY}/input/peak_val.h5",
    "heads": heads,
    # I need some debug output to test showTrainingProgress.
    "verbosity": "DEBUG",
}

with open(f"{WORKING_DIRECTORY}/json/trainCombined.json", "w") as fp:
    json.dump(combinedTrainConfig, fp)

In [None]:

# GPU job that trains the combined model; it needs the transformation model.
trainCombinedCmd = (constructCommand("trainCombinedModel")
                    + f"{WORKING_DIRECTORY}/json/trainCombined.json")
slurmNameTrainCombined = jobsGpu(
    SLURM_CONFIG, [trainCombinedCmd], "trainCombined", 10, 60, "10:00:00")
jobSpecs.append([slurmNameTrainCombined, [slurmNameTrainTransformation]])

# Predict combined

In [None]:
import copy

# Prediction configuration for the combined model over all peak regions.
combinedPredictConfig = {
    "settings": {
        "output-h5": f"{WORKING_DIRECTORY}/pred/combined.h5",
        "batch-size": 128,
        "heads": len(TF_NAMES),
        "architecture": {
            "model-file": f"{WORKING_DIRECTORY}/models/joint_combined.keras",
            "input-length": INPUT_LENGTH,
            "output-length": OUTPUT_LENGTH,
        },
    },
    "genome": GENOME_FASTA,
    "bed-file": f"{WORKING_DIRECTORY}/bed/peak_all.bed",
    "num-threads": 2,
    "verbosity": LOG_LEVEL,
}
with open(f"{WORKING_DIRECTORY}/json/predictCombined.json", "w") as fp:
    json.dump(combinedPredictConfig, fp)

# For the residual model, I just need to change a few terms.
# Work on a deep copy: a plain assignment would alias the nested dictionaries,
# so these edits would also rewrite combinedPredictConfig in memory.
residualPredictConfig = copy.deepcopy(combinedPredictConfig)
residualPredictConfig["settings"]["output-h5"] = (
    f"{WORKING_DIRECTORY}/pred/residual.h5")
residualPredictConfig["settings"]["architecture"]["model-file"] = (
    f"{WORKING_DIRECTORY}/models/joint_residual.keras")
with open(f"{WORKING_DIRECTORY}/json/predictResidual.json", "w") as fp:
    json.dump(residualPredictConfig, fp)

In [None]:
# GPU job that makes predictions with the combined model, then the residual model.
predictCombinedCmds = [
    constructCommand("makePredictions")
    + f" {WORKING_DIRECTORY}/json/predict{variant}.json"
    for variant in ("Combined", "Residual")]
slurmNamePredictCombined = jobsGpu(
    SLURM_CONFIG, predictCombinedCmds, "predictCombined", 1, 50, "10:00:00")
jobSpecs.append([slurmNamePredictCombined, [slurmNameTrainCombined]])

# Argument template for predictToBigwig; {inf} selects which prediction file
# (residual.h5 or combined.h5) is read.
bwCmdBase = ("--h5 {wd:s}/pred/{inf:s}.h5 "
             "--bw {wd:s}/pred/{outf:s}.bw "
             "--head-id {hid:d} --task-id {tid:d} --mode profile "
             "--threads {nt:d}")
# One command per (model type, head, strand); residual commands come first.
bwCmds = [
    constructCommand("predictToBigwig") + bwCmdBase.format(
        wd=WORKING_DIRECTORY,
        inf=modelType,
        outf=f"{tfname}_{modelType}_{strand}",
        hid=headid,
        tid=tid,
        nt=NUM_THREADS_MINOR)
    for modelType in ["residual", "combined"]
    for headid, tfname in enumerate(TF_NAMES)
    for tid, strand in enumerate(["positive", "negative"])]

slurmNamePredToBigwigCombined = jobsNonGpu(
    SLURM_CONFIG, bwCmds,
    "predToBigwigCombined", NUM_THREADS_MINOR, 20, "1:00:00")

jobSpecs.append([slurmNamePredToBigwigCombined, [slurmNamePredictCombined]])

# Deriving flat importance scores

In [None]:
def makeInterpretJson(tfNum, model, modelName):
    """Build the interpretFlat configuration for one head of one model.

    :param tfNum: Index into TF_NAMES; used as the head id and to name the
        output files.
    :param model: Base name of the model file under ``models/`` (without the
        ``.keras`` extension).
    :param modelName: Label used in the names of the output h5 files.
    :return: The configuration as a dictionary ready to be dumped to JSON.
    """
    shapPrefix = f"{WORKING_DIRECTORY}/shap/{TF_NAMES[tfNum]}_{modelName}"
    interpretConfig = {
        "genome": GENOME_FASTA,
        "bed-file": f"{WORKING_DIRECTORY}/bed/peak_test.bed",
        "model-file": f"{WORKING_DIRECTORY}/models/{model}.keras",
        "input-length": INPUT_LENGTH,
        "output-length": OUTPUT_LENGTH,
        "heads": len(TF_NAMES),
        "head-id": tfNum,
        "profile-task-ids": [0, 1],
        "profile-h5": f"{shapPrefix}_profile.h5",
        "counts-h5": f"{shapPrefix}_counts.h5",
        "num-shuffles": 20,
        # Just to exercise the code path.
        "kmer-size": tfNum + 1,
        "verbosity": LOG_LEVEL,
    }
    return interpretConfig
# Write one interpretFlat configuration per (TF, model) pair and queue a
# command for each. Each pair is (model file base name, output label).
flatModels = [["joint_combined", "combined"],
              ["joint_residual", "residual"],
              ["transformation", "transformation"],
              ["solo", "solo"]]
cmds = []
for tfNum, tfLabel in enumerate(TF_NAMES):
    for model, modelName in flatModels:
        fname = f"{WORKING_DIRECTORY}/json/shap_{tfLabel}_{model}.json"
        with open(fname, "w") as fp:
            json.dump(makeInterpretJson(tfNum, model, modelName), fp)
        cmds.append(constructCommand("interpretFlat") + fname)
# trainCombined is the last training job in the dependency chain, so every
# model file exists once it has finished.
slurmNameInterpretFlat = jobsGpu(
    SLURM_CONFIG, cmds, "interpretFlat", 5, 50, "10:00:00")
jobSpecs.append([slurmNameInterpretFlat, [slurmNameTrainCombined]])

In [None]:
# Argument template for shapToBigwig: each importance-score h5 is converted
# to a bigwig with the same base name.
shapBwCmdBase = ("--h5 {wd:s}/shap/{tf:s}_{model:s}_{readout:s}.h5 "
                 "--bw {wd:s}/shap/{tf:s}_{model:s}_{readout:s}.bw ")
# One command per (TF, model, readout) combination.
shapBwCmds = [
    constructCommand("shapToBigwig") + shapBwCmdBase.format(
        wd=WORKING_DIRECTORY,
        tf=tfname,
        readout=readout,
        model=modelName)
    for tfname in TF_NAMES
    for modelName in ["combined", "residual", "transformation", "solo"]
    for readout in ["profile", "counts"]]

slurmNameShapToBigwig = jobsNonGpu(
    SLURM_CONFIG, shapBwCmds, "shapToBigwig", 2, 20, "1:00:00")
jobSpecs.append([slurmNameShapToBigwig, [slurmNameInterpretFlat]])


# ShapToNumpy

In [None]:

# Argument template for shapToNumpy: each importance-score h5 yields a
# seqs_*.npz and a scores_*.npz file, which the modisco step reads.
shapToNumpyCmdBase = (
    "--h5 {wd:s}/shap/{tf:s}_{model:s}_{readout:s}.h5 "
    "--seqs {wd:s}/shap/seqs_{tf:s}_{model:s}_{readout:s}.npz "
    "--scores {wd:s}/shap/scores_{tf:s}_{model:s}_{readout:s}.npz ")
# One command per (TF, model, readout) combination.
shapToNumpyCmds = [
    constructCommand("shapToNumpy") + shapToNumpyCmdBase.format(
        wd=WORKING_DIRECTORY,
        tf=tfname,
        readout=readout,
        model=modelName)
    for tfname in TF_NAMES
    for modelName in ["combined", "residual", "transformation", "solo"]
    for readout in ["profile", "counts"]]

slurmNameShapToNumpy = jobsNonGpu(
    SLURM_CONFIG, shapToNumpyCmds, "shapToNumpy", 2, 20, "1:00:00")
jobSpecs.append([slurmNameShapToNumpy, [slurmNameInterpretFlat]])

# Modisco

In [None]:
# Two-line shell snippet per run: create the output directory, then call
# "modisco motifs" on the matching seqs/scores files. modisco is launched
# without the coverage wrapper.
modiscoCmdBase = (
    "mkdir -p {wd:s}/modisco/{tf:s}_{model:s}_{readout:s}\n"
    + constructCommand("modisco", coverage=False) + " motifs "
    + "-s {wd:s}/shap/seqs_{tf:s}_{model:s}_{readout:s}.npz "
    + "-a {wd:s}/shap/scores_{tf:s}_{model:s}_{readout:s}.npz "
    + "-n 10000 "
    + "-w 1000 "
    + "-o {wd:s}/modisco/{tf:s}_{model:s}_{readout:s}/modisco.h5 ")
# One snippet per (TF, model, readout) combination.
modiscoCmds = [
    modiscoCmdBase.format(
        wd=WORKING_DIRECTORY,
        tf=tfname,
        readout=readout,
        model=modelName)
    for tfname in TF_NAMES
    for modelName in ["combined", "residual", "transformation", "solo"]
    for readout in ["profile", "counts"]]
slurmNameModisco = jobsNonGpu(
    SLURM_CONFIG, modiscoCmds, "modisco", NUM_THREADS_MINOR, 40, "10:00:00")
jobSpecs.append([slurmNameModisco, [slurmNameShapToNumpy]])

In [None]:
# Shell snippet per run: "modisco report" against the JASPAR motif file, then
# annotateModiscoHtml from SCRIPTS_DIR reads motifs.html and its output is
# redirected to motifs_names.html.
reportCmdBase = (
    constructCommand("modisco", coverage=False) + " report "
    + "-i {wd:s}/modisco/{tf:s}_{model:s}_{readout:s}/modisco.h5 "
    + "-o {wd:s}/modisco/{tf:s}_{model:s}_{readout:s}/ "
    + "-n 2 "
    + "-m /n/data1/JASPAR/2022/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt "
    + "\n\n{sd:s}/annotateModiscoHtml --vertebrate"
    + " {wd:s}/modisco/{tf:s}_{model:s}_{readout:s}/motifs.html "
    + " > {wd:s}/modisco/{tf:s}_{model:s}_{readout:s}/motifs_names.html")

# One snippet per (TF, model, readout) combination.
reportCmds = [
    reportCmdBase.format(
        wd=WORKING_DIRECTORY,
        sd=SCRIPTS_DIR,
        tf=tfname,
        readout=readout,
        model=modelName)
    for tfname in TF_NAMES
    for modelName in ["combined", "residual", "transformation", "solo"]
    for readout in ["profile", "counts"]]

slurmNameModiscoReport = jobsNonGpu(
    SLURM_CONFIG, reportCmds, "modiscoReport", 5, 5, "1:00:00")
jobSpecs.append([slurmNameModiscoReport, [slurmNameModisco]])

# Making a PISA plot

In [None]:

def writeRegion(genome, outFp, regionStart):
    """Append one FASTA record for the model input window around regionStart.

    The fetched window begins BUFFER bases before regionStart and is
    INPUT_LENGTH bases long, always on windowChrom. The record is named
    with the integer regionStart and the sequence is upper-cased.

    :param genome: Object providing fetch(chrom, start, end); here a pysam.FastaFile.
    :param outFp: Writable text file that receives the record.
    :param regionStart: Integer coordinate used for the record name.
    """
    fetchStart = regionStart - BUFFER
    fetchEnd = fetchStart + INPUT_LENGTH
    bases = genome.fetch(windowChrom, fetchStart, fetchEnd)
    outFp.write(f">{regionStart:d}\n")
    outFp.write(bases.upper() + "\n")

# Write one FASTA record for every base position in [windowStart, windowEnd).
# This file is the "fasta-file" input of the PISA configs.
with open(f"{WORKING_DIRECTORY}/shap/pisa_regions.fa", "w") as fp, \
     pysam.FastaFile(GENOME_FASTA) as genome:
    for regionStart in range(windowStart, windowEnd):
        writeRegion(genome, fp, regionStart)


In [None]:

# Write one PISA config JSON per (TF head, strand, model) and collect the
# matching interpretPisa commands.
cmds = []
for tfid in range(len(TF_NAMES)):
    for strand in (0, 1):
        for model, modelName in (("joint_residual", "residual"),
                                 ("joint_combined", "combined"),
                                 ("transformation", "transformation"),
                                 ("solo", "solo")):
            task_name = "_".join([TF_NAMES[tfid], ("positive", "negative")[strand]])
            pisa_config = {
                "model-file": WORKING_DIRECTORY + "/models/" + model + ".keras",
                "fasta-file": f"{WORKING_DIRECTORY}/shap/pisa_regions.fa",
                "num-shuffles": 20,
                "head-id": tfid,
                "task-id": strand,
                "output-h5": WORKING_DIRECTORY + "/shap/pisa_" + modelName + "_" + task_name + ".h5",
                "input-length": INPUT_LENGTH,
                "output-length": OUTPUT_LENGTH,
                # Tying these to the strand index just exercises both possibilities.
                "kmer-size": strand + 1,
                "num-threads": strand + 1,
                "make-predictions": True,
                "correct-receptive-field": True,
                "verbosity": LOG_LEVEL,
            }
            jsonFname = WORKING_DIRECTORY + "/json/pisa_" + modelName + "_" + task_name + ".json"
            with open(jsonFname, "w") as fp:
                json.dump(pisa_config, fp)
            cmds.append(constructCommand("interpretPisa") + " " + jsonFname)

slurmNameInterpretPisa = jobsGpu(
    SLURM_CONFIG, cmds, "interpretPisa", 5, 20, "10:00:00")
jobSpecs.append([slurmNameInterpretPisa, [slurmNameTrainCombined]])

In [None]:
# When the PISA data are available, we can make the pisa plot and graph.

# Metrics

In [None]:
# Template comparing a measured bigwig against the combined model's prediction
# for the same TF and strand, over the regions in peak_all.bed.
metricsCmdBase = constructCommand("metrics") + (
    " --reference {ddir:s}/{tf:s}/counts.{strand:s}.bw "
    "--pred {wd:s}/pred/{tf:s}_combined_{longstrand:s}.bw "
    "--regions {wd:s}/bed/peak_all.bed "
    "--threads {nt:d} --apply-abs --skip-zeroes"
)
cmds = []
for tfName in TF_NAMES:
    # Prediction files spell the strand out; the data files abbreviate it.
    for lstr, sstr in (("positive", "pos"), ("negative", "neg")):
        metricsCmd = metricsCmdBase.format(
            wd=WORKING_DIRECTORY,
            ddir=DATA_DIRECTORY,
            tf=tfName,
            strand=sstr,
            longstrand=lstr,
            nt=NUM_THREADS_MINOR,
        )
        cmds.append(metricsCmd)
slurmNameMetrics = jobsNonGpu(
    SLURM_CONFIG, cmds, "metrics", NUM_THREADS_MINOR, 20, "1:00:00")
jobSpecs.append([slurmNameMetrics, [slurmNamePredToBigwigCombined]])

# showModel

In [None]:
# Run showModel on the joint combined model and write a PNG next to it.
# The job is listed as depending on the combined training job.
slurmNameShowModel = jobsNonGpu(
    SLURM_CONFIG,
    [constructCommand("showModel")
     + f" --model {WORKING_DIRECTORY}/models/joint_combined.keras "
     + f"--png {WORKING_DIRECTORY}/models/joint_combined.png"],
    "checkShowModel", 1, 10, "0:05:00")
jobSpecs.append([slurmNameShowModel, [slurmNameTrainCombined]])

# checkJson

In [None]:
# Run checkJson twice on the same config: once with no schema argument and
# once with the schema named explicitly via -s. The job has no prerequisites.
slurmNameCheckJson = jobsNonGpu(
    SLURM_CONFIG,
    [constructCommand("checkJson")
     + f" {WORKING_DIRECTORY}/json/prepareBedNonPeaks.json",
     constructCommand("checkJson")
     + f" -s prepareBed {WORKING_DIRECTORY}/json/prepareBedNonPeaks.json"],
    "checkCheckJson", 1, 1, "0:05:00")
jobSpecs.append([slurmNameCheckJson, []])

# Easy functions

In [None]:
# Generate a tiny standalone script that calls utils.easyPredict and
# utils.easyInterpretFlat on a random sequence, then submit it as a GPU job.
testEasyPath = WORKING_DIRECTORY + "/slurm/testEasy.py"
with open(testEasyPath, "w") as fp:
    prog = ("#!/usr/bin/env python3\nfrom bpreveal import utils\nimport random\n"
            "utils.setVerbosity('WARNING')\n"
            "import os\nos.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'\n"
            "seq=''.join(random.choices('ACGT', k={isize:d}))\n"
            "utils.easyPredict(seq, '{wd:s}/models/joint_combined.keras')\n"
            "utils.easyInterpretFlat(seq, '{wd:s}/models/joint_residual.keras', "
            "{nh:d}, 0, [0,1])\n").format(wd=WORKING_DIRECTORY, isize=INPUT_LENGTH,
                                         nh = len(TF_NAMES))
    fp.write(prog)
# When COLLECT_COVERAGE is False the job command is just the script path, so
# the shebang is only honoured if the file is executable. open() does not set
# the execute bits, so add them explicitly.
os.chmod(testEasyPath, os.stat(testEasyPath).st_mode | 0o111)
slurmNameCheckEasy = jobsGpu(SLURM_CONFIG,
    [("coverage run " if COLLECT_COVERAGE else "") + testEasyPath],
            "checkEasy", 3, 20, "0:15:00")
jobSpecs.append([slurmNameCheckEasy, [slurmNameTrainCombined]])

In [None]:
# Same idea as the GPU "easy" check, but prediction only and submitted as a
# non-GPU job.
testEasyCpuPath = WORKING_DIRECTORY + "/slurm/testEasyCpu.py"
with open(testEasyCpuPath, "w") as fp:
    prog = ("#!/usr/bin/env python3\nfrom bpreveal import utils\nimport random\n"
            "utils.setVerbosity('WARNING')\n"
            "import os\nos.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'\n"
            "seq=''.join(random.choices('ACGT', k={isize:d}))\n"
            "utils.easyPredict(seq, '{wd:s}/models/joint_combined.keras')\n"
           ).format(wd=WORKING_DIRECTORY, isize=INPUT_LENGTH)
    fp.write(prog)
# When COLLECT_COVERAGE is False the job command is just the script path, so
# the shebang is only honoured if the file is executable. open() does not set
# the execute bits, so add them explicitly.
os.chmod(testEasyCpuPath, os.stat(testEasyCpuPath).st_mode | 0o111)
slurmNameCheckEasyCpu = jobsNonGpu(SLURM_CONFIG,
    [("coverage run " if COLLECT_COVERAGE else "") + testEasyCpuPath],
            "checkEasyCpu", 3, 20, "0:15:00")
jobSpecs.append([slurmNameCheckEasyCpu, [slurmNameTrainCombined]])

# Motif scanning

In [None]:

cmdsScan = []
cmdsPostproc = []
# Each scan job runs the cutoffs script and then the scanner, as two
# successive shell commands.
SCAN_BASE = "".join([
    constructCommand("motifSeqletCutoffs"), " {cutoffFname:s}\n    ",
    constructCommand("motifScan"), " {scanFname:s}\n    ",
])
# Each postprocessing job adds quantiles to the seqlet and scan tables, then
# writes two BED files: one keyed on contrib_match and one filtered on the
# quantile columns. The last command spells out "$(which ...)" by hand instead
# of going through constructCommand.
POSTPROC_BASE = "".join([
    constructCommand("motifAddQuantiles"),
    " --seqlet-tsv {seqletTmpTsv:s} --scan-tsv {scanTmpTsv:s} ",
    "--seqlet-out {seqletTsv:s} --scan-out {scanTsv:s}\n    ",
    constructCommand("bestMotifsOnly"),
    " --metric contrib_match --in-tsv {scanTsv:s} --out-bed {scanBed:s}\n    ",
    ("coverage run " if COLLECT_COVERAGE else ""),
    "$(which bestMotifsOnly) --metric contrib_match_quantile --in-tsv {scanTsv:s} ",
    "--out-bed {scanBedFilt:s} --filter 'contrib_match_quantile > 0.5 or (seq_match_quantile > 0.5 and contrib_magnitude_quantile > 0.5)' \n    ",
])


doneCombined = False
for pat, curPats in patternsToScan.items():
    # Translate the [index, shortName] pairs into the pattern spec layout.
    patternSpec = []
    for mcName in curPats:
        patternSpec.append({
            "metacluster-name": mcName + "_patterns",
            "pattern-names": [f"pattern_{x[0]:d}" for x in curPats[mcName]],
            "short-names": [x[1] for x in curPats[mcName]],
        })

    # NOTE(review): the "Tmp" paths are identical to the final paths, so the
    # quantile step reads and writes the same files - confirm this is intended.
    seqletTsv = f"{WORKING_DIRECTORY}/modisco/{pat}/seqlets_{pat}.tsv"
    seqletTmpTsv = f"{WORKING_DIRECTORY}/modisco/{pat}/seqlets_{pat}.tsv"

    seqletBed = f"{WORKING_DIRECTORY}/modisco/{pat}/seqlets_{pat}.bed"
    hitsTsv = f"{WORKING_DIRECTORY}/scan/{pat}.tsv"
    hitsTmpTsv = f"{WORKING_DIRECTORY}/scan/{pat}.tsv"
    hitsBed = f"{WORKING_DIRECTORY}/scan/{pat}.bed"
    hitsBedFilt = f"{WORKING_DIRECTORY}/scan/{pat}_filt.bed"

    cutoffConfigDict = {
        "seqlets-tsv": seqletTmpTsv,
        "modisco-h5": f"{WORKING_DIRECTORY}/modisco/{pat}/modisco.h5",
        "modisco-contrib-h5": f"{WORKING_DIRECTORY}/shap/{pat}.h5",
        "patterns": patternSpec,
        "seq-match-quantile": 0.2,
        "contrib-match-quantile": 0.2,
        "contrib-magnitude-quantile": 0.2,
        "trim-threshold": 0.3,
        "trim-padding": 1,
        "background-probs": bgProbs,
        "modisco-window": 1000,
        "quantile-json": f"{WORKING_DIRECTORY}/scan/{pat}_motifs.json",
        "verbosity": LOG_LEVEL,
    }

    scanConfigDict = {
        "scan-settings": {
            "scan-contrib-h5": f"{WORKING_DIRECTORY}/shap/{pat}.h5",
            "hits-tsv": hitsTmpTsv,
            "num-threads": NUM_THREADS_MAJOR,
        },
    }
    if doneCombined:
        # Every later scan just points at the quantile JSON from the cutoffs step.
        scanConfigDict["seqlet-cutoff-json"] = f"{WORKING_DIRECTORY}/scan/{pat}_motifs.json"
    else:
        # Exactly one scan embeds the cutoff settings in the scan config.
        # (The standalone cutoffs script still runs for it; the work is
        # simply repeated during scanning.)
        scanConfigDict["seqlet-cutoff-settings"] = cutoffConfigDict
        doneCombined = True
    scanConfigDict["verbosity"] = LOG_LEVEL

    scanFname = f"{WORKING_DIRECTORY}/json/scan_{pat}.json"
    cutoffFname = f"{WORKING_DIRECTORY}/json/cutoffs_{pat}.json"
    cmdStrScan = SCAN_BASE.format(scanFname=scanFname, cutoffFname=cutoffFname)
    cmdStrPostproc = POSTPROC_BASE.format(
        seqletTmpTsv=seqletTmpTsv, scanTmpTsv=hitsTmpTsv,
        seqletTsv=seqletTsv, scanTsv=hitsTsv,
        scanBed=hitsBed, scanBedFilt=hitsBedFilt)
    cmdsScan.append(cmdStrScan)
    cmdsPostproc.append(cmdStrPostproc)
    with open(scanFname, "w") as fp:
        json.dump(scanConfigDict, fp, indent=4)
    with open(cutoffFname, "w") as fp:
        json.dump(cutoffConfigDict, fp, indent=4)
slurmNameScan = jobsNonGpu(
    SLURM_CONFIG, cmdsScan, "motifScan", NUM_THREADS_MAJOR, 10, "10:00:00")
slurmNameScanPostproc = jobsNonGpu(
    SLURM_CONFIG, cmdsPostproc, "motifScanPostproc", 3, 50, "1:00:00")

In [None]:
# Scanning is listed after modisco; postprocessing is listed after scanning.
jobSpecs.extend([
    [slurmNameScan, [slurmNameModisco]],
    [slurmNameScanPostproc, [slurmNameScan]],
])

# Generating figures with PISA

In [None]:
# Make an interactive test json.
# The genome path and window origin come from the shared notebook constants
# (GENOME_FASTA, windowStart, windowChrom) that were also used to generate the
# PISA regions, so this config cannot drift away from the data it displays.
interactiveConfig = {
    "graph-configs": [],
    "plot-configs": [
        {
            "pisa": {"h5-name": WORKING_DIRECTORY + "/shap/pisa_residual_nanog_negative.h5"},
             "coordinates": {
                "genome-fasta": GENOME_FASTA,
                "midpoint-offset": 1110,
                "input-slice-width": 1000,
                "output-slice-width": 1000,
                "genome-window-start": windowStart,
                "genome-window-chrom": windowChrom
            },
            "importance": {
                "bigwig-name": WORKING_DIRECTORY + "/shap/nanog_residual_counts.bw",
                "show-sequence": True
            },
            "predictions": {
                "bigwig-name": WORKING_DIRECTORY + "/pred/nanog_residual_negative.bw"
            },
            "annotations": {
                "bed-name": WORKING_DIRECTORY + "/scan/nanog_residual_counts.bed"
            },
            "figure": {
                "left": 0.1,
                "bottom": 0.1,
                "width": 0.85,
                "height": 0.85,
                "color-span": 0.4,
                "diagonal-mode": "on",
                "miniature": False
            }
        }
    ],
    "width": 7,
    "height": 6,
    "output-gui": True
}
jsonInteractivePlot = f"{WORKING_DIRECTORY}/json/pisaInteractive.json"
with open(jsonInteractivePlot, "w") as fp:
    json.dump(interactiveConfig, fp)
            

In [None]:
# Now I'll make plots but using the interpreter features. 
# The figure configs below are written as a Python lambda expression rather
# than as plain JSON (presumably evaluated by the config interpreter in
# makePisaFigure - confirm against the BPReveal docs).
#
# baseConfig is the header of the outer lambda. It is the only part that goes
# through str.format(); wd, tf, strand and mode become default arguments.
baseConfig = """
(lambda workdir="{wd:s}", 
        tf="{tf:s}",
        strand="{strand:s}",
        mode="{mode:s}":"""
# restConfig is the body. It is appended verbatim (never passed to
# str.format(), which is why its literal braces are not doubled). The inner
# lambda defines one small per-model factory for each config section and then
# builds the four-panel layout (solo, transformation, combined, residual).
# "graph-configs" is empty when mode == "plot" and "plot-configs" is empty
# when mode == "graph".
restConfig = """
    (lambda pisaConfig = lambda model: {"h5-name": workdir + "/shap/pisa_" + model + "_" + tf + "_" + strand + ".h5"},
            coordsConfig = lambda model: {
                "genome-fasta": "/n/data1/genomes/indexes/mm10/mm10.fa",
                "midpoint-offset": 1150,
                "input-slice-width": 300,
                "output-slice-width": 500,
                "genome-window-start": 180923752,
                "genome-window-chrom": "chr1"},
            importanceConfig= lambda model: {
                "bigwig-name": workdir + "/shap/" + tf + "_" + model + "_profile.bw",
                "show-sequence": True},
            predictionsConfig=lambda model: {
                "bigwig-name": workdir + "/pred/" + tf + "_" + model + "_" + strand + ".bw"},
            annotationsConfig=lambda model: {
                "bed-name": workdir + "/scan/" + tf + "_" + model + "_profile_filt.bed"},
            figureConfig=lambda model:{
                "left": {"combined": 0.1, "residual": 0.6, "solo": 0.1, "transformation": 0.6}[model],
                "bottom": {"combined": 0.1, "residual": 0.1, "solo": 0.6, "transformation": 0.6}[model],
                "width": 0.35,
                "height": 0.35,
                "color-span": 0.5,
                "miniature": True}:
        {
            "graph-configs": [] if mode == "plot" else [{
                    "pisa": pisaConfig(model),
                    "coordinates":  coordsConfig(model),
                    "importance":  importanceConfig(model),
                    "predictions": predictionsConfig(model),
                    "annotations": annotationsConfig(model),
                    "figure": figureConfig(model),
                    "min-value": 0.1
                }
                for model in ["solo", "transformation", "combined", "residual"]
                ],
            "plot-configs": [] if mode == "graph" else [{
                    "pisa": pisaConfig(model),
                    "coordinates":  coordsConfig(model),
                    "importance":  importanceConfig(model),
                    "predictions": predictionsConfig(model),
                    "annotations": annotationsConfig(model),
                    "figure": figureConfig(model)
                }
                for model in ["solo", "transformation", "combined", "residual"]
                ],
            "width": 7,
            "height": 6,
            "output-png": workdir + "/shap/pisa_" + tf + "_" + strand + "_" + mode + ".png"})
    ())()
"""

def getInterpConfig(tf, strand, mode):
    """Write the makePisaFigure config for one TF/strand/mode and return its command.

    The config text is the formatted baseConfig header followed by the
    verbatim restConfig body, written to
    <WORKING_DIRECTORY>/json/makeFigure_<tf>_<strand>_<mode>.json.

    :param tf: TF name substituted into the header and the file name.
    :param strand: Strand name substituted into the header and the file name.
    :param mode: Mode string substituted into the header and the file name.
    :return: A one-element list holding the makePisaFigure command line.
    """
    configText = baseConfig.format(
        wd=WORKING_DIRECTORY, tf=tf, strand=strand, mode=mode) + restConfig
    jsonPlotFname = WORKING_DIRECTORY + "/json/makeFigure_" + tf + "_" + strand + "_" + mode + ".json"
    with open(jsonPlotFname, "w") as outFp:
        outFp.write(configText)
    return [constructCommand("makePisaFigure") + " " + jsonPlotFname]
# One figure per TF, strand and mode ("plot" or "graph").
plotCommands = []
for tf in TF_NAMES:
    for strand in ("positive", "negative"):
        for mode in ("plot", "graph"):
            plotCommands += getInterpConfig(tf, strand, mode)

# The figures read PISA data, importance bigwigs, prediction bigwigs and
# filtered scan BEDs, so every job producing those is listed as a prerequisite.
slurmNameMakeInterpPlot = jobsNonGpu(
    SLURM_CONFIG, plotCommands, "pisaInterpPlots", 1, 20, "10:00")
jobSpecs.append([
    slurmNameMakeInterpPlot,
    [slurmNameInterpretPisa,
     slurmNameInterpretFlat,
     slurmNamePredToBigwigCombined,
     slurmNamePredToBigwigSolo,
     slurmNamePredToBigwigTransformation,
     slurmNameScanPostproc],
])

# Interpret from fasta

In [None]:
def writeRegion(genome, outFp, regionStart):
    """Append one FASTA record for the input window belonging to regionStart.

    This redefines the earlier helper of the same name. Here the window
    begins a fixed 1046 bases before regionStart and is INPUT_LENGTH bases
    long, always on windowChrom. The record is named with the integer
    regionStart and the sequence is upper-cased.

    :param genome: Object providing fetch(chrom, start, end); here a pysam.FastaFile.
    :param outFp: Writable text file that receives the record.
    :param regionStart: Integer coordinate used for the record name.
    """
    fetchStart = regionStart - 1046
    fetchEnd = fetchStart + INPUT_LENGTH
    bases = genome.fetch(windowChrom, fetchStart, fetchEnd)
    outFp.write(f">{regionStart:d}\n")
    outFp.write(bases.upper() + "\n")

# Write matching FASTA and BED files: one 1000 bp region every 1000 bp from
# windowStart up to (but excluding) windowEnd + 1000.
# The BED chromosome is taken from windowChrom, the same chromosome
# writeRegion fetches from, so the coordinates always describe the sequences.
with open(WORKING_DIRECTORY + "/shap/interp_regions.fa", "w") as fafp, \
     open(WORKING_DIRECTORY + "/bed/interp.bed", "w") as bedfp:
    with pysam.FastaFile(GENOME_FASTA) as genome:
        for regionStart in range(windowStart, windowEnd + 1000, 1000):
            writeRegion(genome, fafp, regionStart)
            bedfp.write("{0:s}\t{1:d}\t{2:d}\n".format(
                windowChrom, regionStart, regionStart + 1000))

In [None]:
def makeInterpretFastaJson(tfNum, model):
    """Build the interpretFlat config that reads sequences from a FASTA file.

    :param tfNum: Index into TF_NAMES; used as the head id and in the output names.
    :param model: Model file stem under <WORKING_DIRECTORY>/models/.
    :return: The configuration as a dict, ready for json.dump.
    """
    outStem = WORKING_DIRECTORY + "/shap/" + model + "_" + TF_NAMES[tfNum] + "_fasta"
    coordinates = {
        "bed-file": f"{WORKING_DIRECTORY}/bed/interp.bed",
        "genome": GENOME_FASTA,
    }
    return {
        "fasta-file": f"{WORKING_DIRECTORY}/shap/interp_regions.fa",
        "coordinates": coordinates,
        "model-file": WORKING_DIRECTORY + "/models/" + model + ".keras",
        "input-length": INPUT_LENGTH,
        "output-length": OUTPUT_LENGTH,
        "heads": len(TF_NAMES),
        "head-id": tfNum,
        "profile-task-ids": [0, 1],
        "profile-h5": outStem + "_profile.h5",
        "counts-h5": outStem + "_counts.h5",
        "num-shuffles": 20,
        "verbosity": LOG_LEVEL,
    }

# Only the first TF head is interpreted from FASTA, once for each model.
cmds = []
for model in ("joint_residual", "joint_combined", "transformation", "solo"):
    fname = WORKING_DIRECTORY + "/json/shap_fasta_" + model + "_" + TF_NAMES[0] + ".json"
    with open(fname, "w") as fp:
        json.dump(makeInterpretFastaJson(0, model), fp)
    cmds.append(constructCommand("interpretFlat") + " " + fname)

slurmNameInterpretFastaFlat = jobsGpu(
    SLURM_CONFIG, cmds, "interpretFastaFlat", 5, 50, "1:00:00")
jobSpecs.append([slurmNameInterpretFastaFlat, [slurmNameTrainCombined]])

In [None]:

# Convert the FASTA-derived profile importance scores to bigwigs.
# The leading space before --h5 matches every other constructCommand call
# site, so the first flag can never fuse with the program name.
shapBwCmdBase = constructCommand("shapToBigwig") +\
          " --h5 {wd:s}/shap/{model:s}_oct4_fasta_profile.h5 " +\
          "--bw {wd:s}/shap/{model:s}_oct4_fasta_profile.bw "
shapBwCmds = []
for model in ["joint_residual", "solo"]:
    cmd = shapBwCmdBase.format(wd=WORKING_DIRECTORY, model=model)
    shapBwCmds.append(cmd)

# Conversion can only start once the FASTA interpretation job has finished.
slurmNameShapFastaToBigwig = jobsNonGpu(SLURM_CONFIG, shapBwCmds, 
           "shapFastaToBigwig", 2, 20, "1:00:00")
jobSpecs.append([slurmNameShapFastaToBigwig, [slurmNameInterpretFastaFlat]])

## Predict from fasta

In [None]:
def makePredictFastaJson(tfNum):
    """Build the makePredictions config that reads sequences from a FASTA file.

    :param tfNum: Accepted for symmetry with the other config builders; the
        returned configuration does not depend on it.
    :return: The configuration as a dict, ready for json.dump.
    """
    architecture = {
        "model-file": f"{WORKING_DIRECTORY}/models/joint_combined.keras",
        "input-length": INPUT_LENGTH,
        "output-length": OUTPUT_LENGTH,
    }
    settings = {
        "output-h5": f"{WORKING_DIRECTORY}/pred/from_fasta.h5",
        "batch-size": 8,
        "heads": len(TF_NAMES),
        "architecture": architecture,
    }
    coordinates = {
        "bed-file": f"{WORKING_DIRECTORY}/bed/interp.bed",
        "genome": GENOME_FASTA,
    }
    return {
        "settings": settings,
        "fasta-file": f"{WORKING_DIRECTORY}/shap/interp_regions.fa",
        "coordinates": coordinates,
        "num-threads": 2,
        "verbosity": LOG_LEVEL,
    }

# A single prediction run from FASTA; the config file is named after the first TF.
fname = f"{WORKING_DIRECTORY}/json/predict_fasta_{TF_NAMES[0]}.json"
with open(fname, "w") as fp:
    json.dump(makePredictFastaJson(0), fp)
cmds = [constructCommand("makePredictions") + " " + fname]

slurmNamePredictFasta = jobsGpu(
    SLURM_CONFIG, cmds, "predictFasta", 5, 50, "1:00:00")
jobSpecs.append([slurmNamePredictFasta, [slurmNameTrainCombined]])

## Run the GA

In [None]:
# Run the genetic-algorithm demo script against the joint residual model.
# The command is assembled from separate pieces so that source indentation
# does not end up inside the command string, which is what the previous
# backslash continuations within one string literal did.
cmdsRunGa = []
cmdsRunGa.append(
    ("coverage run " if COLLECT_COVERAGE else "")
    + f"{BASE_DIRECTORY}/doc/demos/runGa.py "
    + f"--start 34066036 --input-len {INPUT_LENGTH} "
    + f"--chrom chr1 --model {WORKING_DIRECTORY}/models/joint_residual.keras "
    + f"--genome {GENOME_FASTA} "
    + f"--output {WORKING_DIRECTORY}/logs/gaOutput.json")
slurmNameRunGa = jobsGpu(SLURM_CONFIG, cmdsRunGa, "runGa", 5, 50, "0:10:00")
jobSpecs.append([slurmNameRunGa, [slurmNameTrainCombined]])

# Test the ISM mode for the interpretation tool.

In [None]:
ismScriptExe = f"{BASE_DIRECTORY}/doc/demos/testIsm.py"
def makeInterpretIsmJson(model, modelName, tfNum=0):
    """Build the configuration for the ISM interpretation demo script.

    The head index used to be read from a global ``tfNum`` that this function
    never received, so the result depended on whatever an earlier cell had
    left behind (or raised NameError). It is now an explicit argument whose
    default, 0, matches the TF_NAMES[0] used to name the config file.

    :param model: Model file stem under <WORKING_DIRECTORY>/models/.
    :param modelName: Short model name used in the output file names.
    :param tfNum: Index into TF_NAMES; used as the head id and in the output names.
    :return: The configuration as a dict, ready for json.dump.
    """
    return {
        "genome" : GENOME_FASTA,
        "bed-file" : WORKING_DIRECTORY + "/bed/interp.bed",
        "model-file" : f"{WORKING_DIRECTORY}/models/{model}.keras", 
        "input-length" : INPUT_LENGTH,
        "output-length" : OUTPUT_LENGTH,
        "heads" : len(TF_NAMES),
        "head-id": tfNum,
        "profile-task-ids" : [0,1],
        "profile-h5" : f"{WORKING_DIRECTORY}/shap/ism_{TF_NAMES[tfNum]}_{modelName}_profile.h5",
        "counts-h5" : f"{WORKING_DIRECTORY}/shap/ism_{TF_NAMES[tfNum]}_{modelName}_counts.h5",
        "kmer-size" : 11,
        "verbosity" : LOG_LEVEL}
# Write the ISM config for the first TF on the residual model and queue the
# demo script. The space before the file name matches every other
# constructCommand call site, so the path can never fuse with the program name.
cmds = []
fname = f"{WORKING_DIRECTORY}/json/ism_{TF_NAMES[0]}_residual.json"
cmds.append(constructCommand(ismScriptExe, shortProgName="interpretISM") + " {0:s}".format(fname))
with open(fname, "w") as fp:
    json.dump(makeInterpretIsmJson("joint_residual", "residual"), fp)
slurmNameInterpretIsm = jobsGpu(SLURM_CONFIG, cmds,
        "interpretIsm", 5, 50, "10:00:00")
jobSpecs.append([slurmNameInterpretIsm, [slurmNameTrainCombined]])

# Write entire script.

In [None]:
def getScriptName(longName):
    """Return the last path component of longName minus its final six characters.

    :param longName: A "/"-separated path string.
    :return: The file name with the last six characters removed.
    """
    fileName = longName.rsplit('/', 1)[-1]
    return fileName[:-6]
# Print a two-column table: prerequisite jobs, then the job itself.
# When a job's only prerequisite is the job printed on the previous line,
# the prerequisite column is abbreviated to "".
lastChild = ""
for e in jobSpecs:
    child = getScriptName(e[0])
    parents = [getScriptName(x) for x in e[1]]
    if parents == [lastChild]:
        parents = '""'
    print(f"{str(parents):25s}    {child:20s}")
    lastChild = child
    

In [None]:
writeDependencyScript(SLURM_CONFIG, jobSpecs, "acceptance",
                      cancelScript=f"{WORKING_DIRECTORY}/slurm/cancel.zsh")

# Build the script to run after all jobs finish. It replays the training logs
# through showTrainingProgress twice (the full logs, then only the first 200
# lines). With coverage collection enabled, both invocations are wrapped in
# "coverage run" and the coverage data are combined into an HTML report.
showProgress = ("coverage run " if COLLECT_COVERAGE else "") + "$(which showTrainingProgress)"
scriptLines = [
    "#!/usr/bin/env zsh",
    "source /home/cm2363/.zshrc",
    f"{SLURM_CONFIG['condaString']}",
    f"cat {WORKING_DIRECTORY}/logs/trainCombined* | {showProgress} --exit-delay 0;",
    f"head -n 200 {WORKING_DIRECTORY}/logs/trainCombined* | {showProgress} --exit-delay 1;",
]
if COLLECT_COVERAGE:
    scriptLines.extend(["coverage combine --append --keep", "coverage html"])
# Every line after the shebang is indented four spaces, and the script ends
# with a newline followed by four spaces.
finalScript = "\n    ".join(scriptLines) + "\n    "
with open(f"{WORKING_DIRECTORY}/slurm/atFinish.zsh", "w") as fp:
    fp.write(finalScript)

In [None]:
# Deliberate ZeroDivisionError: it halts a "run all cells" pass at this point.
# The cells below read the outputs of the submitted jobs.
1/0 # STOP HERE.

# Analysis

In [None]:
#Let's also take a quick look at the generated bigwigs. 

def plotBws(bwNames, titles, chrom, start, stop):
    """Draw one bar-plot track per bigwig, stacked vertically in the current figure.

    Missing values (NaN) in a bigwig are drawn as zero. Only the bottom
    track keeps its x tick labels.

    :param bwNames: Paths of the bigwig files, one per track, top to bottom.
    :param titles: Y-axis label for each track, in the same order as bwNames.
    :param chrom: Chromosome to read from every bigwig.
    :param start: First coordinate to plot.
    :param stop: End coordinate; range(start, stop) supplies the x positions.
    """
    numTracks = len(bwNames)
    for i, bwName in enumerate(bwNames):
        # The (rows, cols, index) form works for any number of tracks; the
        # three-digit integer shorthand only works for up to nine.
        plt.subplot(numTracks, 1, i + 1)
        bw = pyBigWig.open(bwName)
        try:
            bwVals = np.nan_to_num(bw.values(chrom, start, stop))
        finally:
            # Release the file handle even if the read fails.
            bw.close()
        plt.bar(range(start, stop), bwVals, width=1)
        plt.ylabel(titles[i])
        if i < numTracks - 1:
            plt.xticks([])

In [None]:
# plotBws([DATA_DIRECTORY + "/patchcap/counts.pos.bw",
#          WORKING_DIRECTORY + "/pred/patchcap_positive.bw",
#          DATA_DIRECTORY + "/patchcap/counts.neg.bw", 
#          WORKING_DIRECTORY + "/pred/patchcap_negative.bw"],
#         ["exptl_pos", "pred_pos", "exptl_neg", "pred_neg"], "chr1", 34076750, 34077750)

In [None]:
# plotBws([DATA_DIRECTORY + "/patchcap/counts.pos.bw",
#          WORKING_DIRECTORY + "/pred/transform_positive.bw",
#          DATA_DIRECTORY + "/nanog/counts.pos.bw"],
#         ["pc_pos", "transform_pos", "exptl_pos"], "chr1", 34076750, 34077750)

In [None]:
def plotTfBigwigs(tfName, exptName, startPos = 34066036, span=1000, chrom="chr1"):
    """Plot measured and predicted coverage for one TF, on both strands.

    Tracks, top to bottom: measured positive, predicted positive,
    measured negative, predicted negative.

    :param tfName: TF whose data and predictions are shown.
    :param exptName: Model name that forms part of the prediction file names.
    :param startPos: First coordinate of the window.
    :param span: Width of the window.
    :param chrom: Chromosome of the window.
    """
    dataStem = f"{DATA_DIRECTORY}/{tfName}/counts."
    predStem = f"{WORKING_DIRECTORY}/pred/{tfName}_{exptName}_"
    bwNames = [
        dataStem + "pos.bw",
        predStem + "positive.bw",
        dataStem + "neg.bw",
        predStem + "negative.bw",
    ]
    trackTitles = ["exptl_pos", "pred_pos", "exptl_neg", "pred_neg"]
    plotBws(bwNames, trackTitles, chrom, startPos, startPos + span)

In [None]:
# Oct4, combined model: measured vs. predicted over a 400 bp window.
plotTfBigwigs("oct4", "combined", startPos=180924752, span=400)

In [None]:
# Nanog, combined model: measured vs. predicted over a 400 bp window.
plotTfBigwigs("nanog", "combined", startPos=180924752, span=400)

In [None]:
# Oct4, residual model: measured vs. predicted over a 400 bp window.
plotTfBigwigs("oct4", "residual", startPos=180924752, span=400)

In [None]:
def plotShapBigwigs(tfName, exptName, startPos = 34066036, span=1000, chrom="chr1"):
    """Plot predictions and importance scores for one TF and one model.

    Tracks, top to bottom: predicted positive strand, predicted negative
    strand, profile importance, counts importance. All four come from the
    model named by exptName. The importance tracks used to be hard-wired to
    the residual model, so any other exptName mixed two models in one figure.

    :param tfName: TF whose tracks are shown.
    :param exptName: Model name that forms part of every file name.
    :param startPos: First coordinate of the window.
    :param span: Width of the window.
    :param chrom: Chromosome of the window.
    """
    plotBws([WORKING_DIRECTORY + "/pred/" + tfName + "_" + exptName + "_positive.bw",
             WORKING_DIRECTORY + "/pred/" + tfName + "_" + exptName + "_negative.bw",
             WORKING_DIRECTORY + "/shap/" + tfName + "_" + exptName + "_profile.bw",
             WORKING_DIRECTORY + "/shap/" + tfName + "_" + exptName + "_counts.bw"],
            ["pred_pos", "pred_neg", "profile", "counts"], chrom, startPos, startPos+span)

In [None]:
# Nanog, residual model: predictions and importance over a 400 bp window.
plotShapBigwigs("nanog", "residual", startPos=180924752, span=400)

In [None]:
# Oct4, residual model: predictions and importance over a 400 bp window.
plotShapBigwigs("oct4", "residual", startPos=180924752, span=400)

In [None]:
#I'll generate all of those figures and save them.
# Take the third entry of patternsToScan, its first metacluster, and that
# metacluster's first [index, shortName] motif.
runName, run = list(patternsToScan.items())[2]
clusterName, cluster = list(run.items())[0]
motif = cluster[0]
pat = motifUtils.Pattern(f"{clusterName}_patterns", f"pattern_{motif[0]:d}", motif[1])
# Load the CWM and the seqlets for that pattern from the run's modisco output.
with h5py.File(f"{WORKING_DIRECTORY}/modisco/{runName}/modisco.h5", "r") as fp:
    pat.loadCwm(fp, 0.3, 3, bgProbs)
    pat.loadSeqlets(fp)
fig = plt.figure()
# The seqlets' contribMatch values are passed as the sort key.
bprplots.plotModiscoPattern(
    pat, fig, sortKey=[seqlet.contribMatch for seqlet in pat.seqlets])



## PISA

In [None]:
# One figure holding a PISA plot (upper half) and a PISA graph (lower half)
# for Nanog, positive strand, residual model.
fig = plt.figure()

# The section dicts below are shared by the plot config and the graph config.
pisaSection = {"h5-name": f"{WORKING_DIRECTORY}/shap/pisa_residual_nanog_positive.h5"}

coordinatesSection = {
    "genome-fasta": GENOME_FASTA,
    "midpoint-offset": 1150,
    "input-slice-width": 200,
    "output-slice-width": 300,
    "genome-window-start": windowStart,
    "genome-window-chrom": windowChrom,
}

predictionSection = {
    "bigwig-name": f"{WORKING_DIRECTORY}/pred/nanog_residual_positive.bw",
    "show-sequence": False,
    "color": {"tol": 0},
}

importanceSection = {
    "bigwig-name": f"{WORKING_DIRECTORY}/shap/nanog_residual_profile.bw",
    "show-sequence": True,
    "color": bprcolors.dnaWong,
}

annotationSection = {
    "bed-name": f"{WORKING_DIRECTORY}/scan/nanog_residual_profile.bed",
    "custom": [],
}

# Layout of the graph panel (bottom of the figure).
figureSectionGraph = {
    "left": 0.1,
    "bottom": 0.05,
    "width": 0.9,
    "height": 0.4,
    "annotation-height": 0.15,
    "tick-font-size": 6,
    "label-font-size": 8,
    "color-span": 0.5,
}

# The plot panel has the same geometry shifted upward, plus the plot-only options.
figureSectionPlot = {
    **figureSectionGraph,
    "bottom": 0.55,
    "grid-mode": "on",
    "diagonal-mode": "on",
    "miniature": False,
}

plotConfig = {
    "pisa": pisaSection,
    "coordinates": coordinatesSection,
    "importance": importanceSection,
    "predictions": predictionSection,
    "annotations": annotationSection,
    "figure": figureSectionPlot,
}

# The graph config reuses the same section objects, swaps in the graph
# layout and adds the graph-only options.
graphConfig = {
    **plotConfig,
    "figure": figureSectionGraph,
    "min-value": 0.1,
    "use-annotation-colors": True,
}
rPlot = bprplots.plotPisa(plotConfig, fig)
rGraph = bprplots.plotPisaGraph(graphConfig, fig);

In [None]:
# Re-draw both panels from the configs returned by the previous calls, after
# setting a line width on the returned graph config.
fig = plt.figure()
rGraph["config"]["figure"].update({"line-width": 2})
rPlot2 = bprplots.plotPisa(rPlot["config"], fig)
rGraph2 = bprplots.plotPisaGraph(rGraph["config"], fig);

In [None]:
# Draw once more, this time from the configs returned by the second round.
fig = plt.figure()
bprplots.plotPisa(rPlot2["config"], fig)
bprplots.plotPisaGraph(rGraph2["config"], fig);

In [None]:
# Sections for a four-panel comparison of the Nanog positive-strand PISA
# plots across the combined, residual, transformation and solo models.
fig = plt.figure(figsize=(10, 8))
fig.add_axes([0, 0, 1, 1])

# PISA data, one file per model.
pisaSectionCombined = {"h5-name": f"{WORKING_DIRECTORY}/shap/pisa_combined_nanog_positive.h5"}
pisaSectionResidual = {"h5-name": f"{WORKING_DIRECTORY}/shap/pisa_residual_nanog_positive.h5"}
pisaSectionTransformation = {"h5-name": f"{WORKING_DIRECTORY}/shap/pisa_transformation_nanog_positive.h5"}
pisaSectionSolo = {"h5-name": f"{WORKING_DIRECTORY}/shap/pisa_solo_nanog_positive.h5"}

# All four panels share the same coordinates.
coordinatesSection = {
    "genome-fasta": GENOME_FASTA,
    "midpoint-offset": 1150,
    "input-slice-width": 300,
    "output-slice-width": 500,
    "genome-window-start": windowStart,
    "genome-window-chrom": windowChrom,
}

# Predicted profiles, one bigwig per model.
predictionSectionCombined = {"bigwig-name": f"{WORKING_DIRECTORY}/pred/nanog_combined_positive.bw"}
predictionSectionResidual = {"bigwig-name": f"{WORKING_DIRECTORY}/pred/nanog_residual_positive.bw"}
predictionSectionTransformation = {"bigwig-name": f"{WORKING_DIRECTORY}/pred/nanog_transformation_positive.bw"}
predictionSectionSolo = {"bigwig-name": f"{WORKING_DIRECTORY}/pred/nanog_solo_positive.bw"}

# Profile importance tracks, one bigwig per model, with the sequence shown.
importanceSectionSolo = {
    "bigwig-name": f"{WORKING_DIRECTORY}/shap/nanog_solo_profile.bw",
    "show-sequence": True,
}
importanceSectionTransformation = {
    "bigwig-name": f"{WORKING_DIRECTORY}/shap/nanog_transformation_profile.bw",
    "show-sequence": True,
}
importanceSectionCombined = {
    "bigwig-name": f"{WORKING_DIRECTORY}/shap/nanog_combined_profile.bw",
    "show-sequence": True,
}
importanceSectionResidual = {
    "bigwig-name": f"{WORKING_DIRECTORY}/shap/nanog_residual_profile.bw",
    "show-sequence": True,
}

# Every panel is annotated with the residual model's profile scan hits.
annotationSection = {"bed-name": f"{WORKING_DIRECTORY}/scan/nanog_residual_profile.bed"}
def getFig(offX, offY):
    """Return the "figure" section for one miniature panel.

    :param offX: Horizontal offset added to the base left position of 0.1.
    :param offY: Vertical offset added to the base bottom position of 0.1.
    :return: A new dict with the panel geometry and display options.
    """
    return {
        "left": 0.1 + offX,
        "bottom": 0.1 + offY,
        "width": 0.35,
        "height": 0.35,
        "color-span": 0.5,
        "miniature": True,
    }

def _panelConfig(pisa, importance, predictions, offX, offY):
    """Assemble one panel's plot config around the shared coordinates and annotations."""
    return {
        "pisa": pisa,
        "coordinates": coordinatesSection,
        "importance": importance,
        "predictions": predictions,
        "annotations": annotationSection,
        "figure": getFig(offX, offY),
    }

# Combined model: zero offsets.
plotConfigCombined = _panelConfig(
    pisaSectionCombined, importanceSectionCombined, predictionSectionCombined, 0, 0)
bprplots.plotPisa(plotConfigCombined, fig);

# Residual model: horizontal offset only.
plotConfigResidual = _panelConfig(
    pisaSectionResidual, importanceSectionResidual, predictionSectionResidual, 0.5, 0)
bprplots.plotPisa(plotConfigResidual, fig);

# Transformation model: both offsets.
plotConfigTransformation = _panelConfig(
    pisaSectionTransformation, importanceSectionTransformation,
    predictionSectionTransformation, 0.5, 0.5)
bprplots.plotPisa(plotConfigTransformation, fig);

# Solo model: vertical offset only.
plotConfigSolo = _panelConfig(
    pisaSectionSolo, importanceSectionSolo, predictionSectionSolo, 0, 0.5)
bprplots.plotPisa(plotConfigSolo, fig);


In [None]:
import json
from bpreveal import gaOptimize
# Load the JSON written by the runGa job and pull out the two profile arrays.
with open(WORKING_DIRECTORY + "/logs/gaOutput.json", "r") as fp:
    j = json.load(fp)
origProf = np.array(j["origProfile"])
prof = np.array(j["profile"])

In [None]:
# One (values, letter, colour) tuple per head: index 0 of the last axis goes
# into ptr and index 1 into ntr. The heads are labelled o/s/k/n and coloured r/g/b/k.
ptr = [(prof[i, :, 0], letter, color)
       for i, (letter, color) in enumerate(zip("oskn", "rgbk"))]
ntr = [(prof[i, :, 1], letter, color)
       for i, (letter, color) in enumerate(zip("oskn", "rgbk"))]
ax = plt.axes()
cors = gaOptimize.stringToCorruptorList(j["corruptors"])
print(cors)
# Shift each corruptor position by the GA start coordinate and back by half
# the difference between the input and output lengths.
cors_fix = [(x[0] + 34066036 - (INPUT_LENGTH - OUTPUT_LENGTH) // 2, x[1])
            for x in cors]
gaOptimize.plotTraces(ptr, ntr, range(34066036, 34067036), [], cors_fix, ax)

In [None]:
# 4 x 2 grid: the left column shows the original profile (green), the right
# column the GA result (red). Index 1 of the last axis is drawn negated,
# below index 0.
for i in range(4):
    plt.subplot(4, 2, 2 * i + 1)
    plt.plot(origProf[i, :, 0], "g-")
    plt.plot(-origProf[i, :, 1], "g-")
    plt.subplot(4, 2, 2 * i + 2)
    plt.plot(prof[i, :, 0], "r-")
    plt.plot(-prof[i, :, 1], "r-")

        

In [None]:
# Cache of reduced "shap" arrays, keyed by file name.
pisaVals = {}
def loadVals(fname):
    """Return a reduced copy of the "shap" dataset stored in fname.

    The dataset is summed over axis 2, the absolute value is taken, and the
    result is summed over axis 0. Reduced arrays are cached in pisaVals.
    Callers always get a fresh copy, so they may modify it in place.

    :param fname: Path of the HDF5 file to read.
    :return: A new numpy array holding the reduced values.
    """
    if fname not in pisaVals:
        with h5py.File(fname, "r") as fp:
            collapsed = np.sum(fp["shap"], axis=2)
            pisaVals[fname] = np.sum(np.abs(collapsed), axis=0)
    return np.array(pisaVals[fname])

In [None]:
def addPlot(model, strand, color, label):
    """Add one curve to the current axes: the loadVals arrays summed over all TFs.

    The x axis is centred on zero and the y axis is logarithmic.

    :param model: Model name used in the PISA file names.
    :param strand: Strand name used in the PISA file names.
    :param color: Line colour.
    :param label: Legend label, or None for no legend entry.
    """
    # Start from oct4, then add every other TF in TF_NAMES order.
    total = loadVals(f"{WORKING_DIRECTORY}/shap/pisa_{model}_oct4_{strand}.h5")
    for otherTf in TF_NAMES:
        if otherTf == "oct4":
            continue
        total += loadVals(f"{WORKING_DIRECTORY}/shap/pisa_{model}_{otherTf}_{strand}.h5")
    numPoints = total.shape[0]
    xvals = np.arange(-numPoints // 2, numPoints // 2)
    plt.semilogy(xvals, total, color=color, label=label)
def makeAll():
    """Draw the combined, solo and residual curves for both strands.

    The positive strand gets the saturated colour and the legend label; the
    negative strand gets the paler colour and no label.
    """
    curveSpecs = (
        ("combined", "positive", "#FF0000", "Combined"),
        ("combined", "negative", "#FF9999", None),
        ("solo", "positive", "#00FF00", "Solo"),
        ("solo", "negative", "#99FF99", None),
        ("residual", "positive", "#0000FF", "Residual"),
        ("residual", "negative", "#9999FF", None),
    )
    for modelName, strandName, lineColor, legendLabel in curveSpecs:
        addPlot(modelName, strandName, lineColor, legendLabel)
# Two views of the same curves: the lower panel zooms in on +/-20 bp and the
# upper panel shows the full +/-1000 bp range.
for panelCode, xLimits, yLimits in ((212, (-20, 20), (50, 5000)),
                                    (211, (-1000, 1000), (0.1, 5000))):
    plt.subplot(panelCode)
    makeAll()
    plt.legend()
    plt.xlim(*xLimits)
    plt.ylim(*yLimits)
    plt.grid()
