In [1]:
import os
# Expose only GPU index 3 to CUDA. This is set in the first cell, before
# keras/tensorflow are imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import numpy as np
import keras
import tensorflow
from matplotlib import pyplot as plt
import matplotlib as mpl
import pandas as pd
import pybedtools as pbt
import os
from collections import defaultdict
%matplotlib inline

Using TensorFlow backend.


In [3]:
### Global variables
# Project root; the data and model paths built by the helper functions below
# are all formed by appending to this prefix.
ROOT = "/users/kcochran/projects/domain_adaptation/"

# TF identifiers as they appear in data/model paths, plus the names printed in
# the LaTeX tables. print_table maps one list onto the other by position, so
# the two lists must stay in the same order.
tfs = ["CTCF", "CEBPA", "Hnf4a", "RXRA"]
tfs_latex_names = ["CTCF", "CEBPα", "HNF4α", "RXRα"]

# Training-species keys; each one selects a "<species>_trained" model folder.
all_trainspecies = ["mm10", "hg38"]
# NOTE(review): presumably display names for the species keys; not used in the
# cells visible here.
model_names_dict = {"mm10" : "Mouse", "hg38" : "Human"}

import random
random.seed(1234) 

# Passed as `skip` when predicting on unbound windows; None keeps every window.
SKIP = None
# Default `model_type` for get_model_file ("earlystop", "best", or anything
# else for the "_15E_end" checkpoint).
MODEL_TYPE = "best"

# Repeat classes for which overlap statistics and tables are produced; each
# name selects a "chr2_<type>_intersect.bed" file.
REPEAT_TYPES = ["DNA", "LINE", "Low_complexity", "LTR", "RC", "Retroposon",
                "RNA", "rRNA", "Satellite", "scRNA", "Simple_repeat", "SINE",
                "snRNA", "srpRNA", "tRNA", "Unknown"]

In [4]:
# needed to load DA models

from flipGradientTF import GradientReversal

def custom_loss(y_true, y_pred):
    """Binary cross-entropy restricted to entries whose label is not -1.

    Entries where ``y_true == -1`` are dropped from both tensors before the
    loss is computed, so they contribute nothing to it.
    """
    keep = tensorflow.not_equal(y_true, -1)
    kept_pred = tensorflow.boolean_mask(y_pred, keep)
    kept_true = tensorflow.boolean_mask(y_true, keep)
    return keras.losses.binary_crossentropy(kept_true, kept_pred)

In [5]:
### File and Model Loading


def get_test_bed_file(species):
    """Return the path of the chr2 test-set BED file for ``species``.

    NOTE(review): a later cell defines ``get_test_bed_file(species, tf)``;
    once that cell has run, it replaces this one-argument version.
    """
    # awk -v OFS="\t" '{ print $1, $2, $3 }' [get_test_bed_seq_file(species)]
    return "".join([ROOT, "data/", species, "/chr2.bed"])


def get_alu_intersect_file_chr2():
    """Return the path of the hg38 chr2 windows that intersect Alu elements."""
    # The file is produced offline with:
    # awk '$1 == "chr2"' [get_alu_file()] > rmsk_alus_chr2.bed
    # bedtools intersect -a [get_test_bed_file(species)] -b rmsk_alus_chr2.bed -u -sorted > chr2_alus_intersect.bed
    alu_bed_path = ROOT + "data/hg38/chr2_alus_intersect.bed"
    return alu_bed_path


def get_repeat_intersect_file_chr2(repeat_type):
    """Return the path of the hg38 chr2 windows intersecting ``repeat_type`` repeats."""
    # see make_repeat_files.sh
    return "".join([ROOT, "data/hg38/repeats/chr2_", repeat_type, "_intersect.bed"])


def get_model_file(tf, train_species, run = 1, model_type = MODEL_TYPE):
    """Return the most recently changed saved-model file for one training run.

    Parameters
    ----------
    tf : str
        TF name; used as a path component under ``models/``.
    train_species : str
        Training-species key; selects the ``<train_species>_trained`` folder.
    run : int or int-like
        Run number; it is inserted into the file suffix via ``str(run)``.
    model_type : str
        "earlystop" or "best" select the matching suffix; any other value
        selects the "_15E_end" checkpoint.

    Returns
    -------
    str
        Full path of the matching file with the largest ``os.path.getctime``.

    Raises
    ------
    ValueError
        If ``run`` cannot be interpreted as an integer, or if no file in the
        model directory ends with the expected suffix.
    """
    # Fail early with a clear message rather than printing and continuing on
    # to an unrelated error further down.
    try:
        int(run)
    except (TypeError, ValueError):
        raise ValueError("You need to pass in a run number (got %r)." % (run,))

    model_file_prefix = ROOT + "/".join(["models", tf, train_species + "_trained", "basic_model/"])

    if model_type == "earlystop":
        model_file_suffix = "_run" + str(run) + "_earlystop.model"
    elif model_type == "best":
        model_file_suffix = "_run" + str(run) + "_best.model"
    else:
        model_file_suffix = "_run" + str(run) + "_15E_end.model"

    candidates = [model_file_prefix + f for f in os.listdir(model_file_prefix)
                  if f.endswith(model_file_suffix)]
    if not candidates:
        # Previously surfaced as "max() arg is an empty sequence".
        raise ValueError("No model file ending in '" + model_file_suffix
                         + "' found in " + model_file_prefix)

    # Several files may match; keep the one with the latest ctime. Per the
    # Python docs, on Unix this is the last metadata change, not creation.
    return max(candidates, key = os.path.getctime)


def load_keras_model(model_file, DA = False):
    """Load a saved Keras model from ``model_file``.

    When ``DA`` is true, the custom objects used by the domain-adaptive models
    (the ``GradientReversal`` layer and ``custom_loss``) are registered for
    deserialization.
    """
    if not DA:
        return keras.models.load_model(model_file)

    da_custom_objects = {"GradientReversal" : GradientReversal,
                         "custom_loss" : custom_loss}
    return keras.models.load_model(model_file, custom_objects = da_custom_objects)


def get_models_all_runs(tf, train_species):
    """Load the models from runs 1 through 5 for a TF / training-species pair.

    All five file paths are resolved first; the models are then loaded in run
    order and returned as a list.
    """
    model_paths = []
    for run_number in range(1, 6):
        model_paths.append(get_model_file(tf, train_species, run_number))
    return list(map(load_keras_model, model_paths))

In [6]:
from keras.utils import Sequence
from seqdataloader.batchproducers.coordbased.core import Coordinates
from seqdataloader.batchproducers.coordbased.coordstovals.fasta import PyfaidxCoordsToVals

# NOTE(review): ROOT is assigned the same value in the global-variables cell
# (In [3]); this second assignment is redundant but harmless.
ROOT = "/users/kcochran/projects/domain_adaptation/"

# Genome FASTA path per assembly key; handed to PyfaidxCoordsToVals by the
# test generators defined below.
GENOMES = {"mm10" : "/users/kcochran/genomes/mm10_no_alt_analysis_set_ENCODE.fasta",
            "hg38" : "/users/kcochran/genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"}


def get_test_bed_file(species, tf):
    """Return the path of the chr2 test-set BED file for ``species`` and ``tf``.

    NOTE(review): this reuses the name of the one-argument
    ``get_test_bed_file(species)`` defined in an earlier cell and replaces it
    once this cell has run.
    """
    return "".join((ROOT, "data/", species, "/", tf, "/chr2.bed"))


class UnboundTestGenerator(Sequence):
    """Keras ``Sequence`` over the unbound windows of a labelled BED file.

    A window counts as unbound when the last whitespace-separated field of its
    line is "0". With ``skip`` set, only every ``skip``-th unbound window is
    kept. Each batch is whatever ``PyfaidxCoordsToVals`` returns for the
    batch's coordinates (presumably encoded sequences -- confirm against
    seqdataloader). Only full batches are served: the step count is floored,
    so trailing windows that do not fill a batch are never requested.
    """

    def __init__(self, batchsize, val_file, skip = None):
        self.valfile = val_file
        self.get_steps(skip, batchsize)
        if "hg38" not in val_file:
            assert False  # not expecting to use this
            genome_key = "mm10"
        else:
            genome_key = "hg38"
        self.converter = PyfaidxCoordsToVals(GENOMES[genome_key])
        self.batchsize = batchsize
        self.get_unbound_coords(skip)

    def get_steps(self, skip, batchsize):
        """Set ``self.steps`` to the number of full batches available."""
        n_unbound = 0
        with open(self.valfile) as bed:
            for row in bed:
                if row.rstrip().split()[-1] == "0":
                    n_unbound += 1
        if skip is not None:
            n_unbound = n_unbound // skip
        self.steps = n_unbound // batchsize

    def __len__(self):
        return self.steps

    def get_unbound_coords(self, skip):
        """Collect the coordinates of the (optionally subsampled) unbound windows."""
        kept_fields = []
        n_unbound_seen = 0
        with open(self.valfile) as bed:
            for row in bed:
                fields = row.rstrip().split()
                if fields[-1] != "0":
                    continue
                # The counter tracks unbound lines only, so `skip` subsamples
                # within the unbound set.
                if skip is None or n_unbound_seen % skip == 0:
                    kept_fields.append(fields[:3])
                n_unbound_seen += 1
        self.coords = [Coordinates(c[0], int(c[1]), int(c[2])) for c in kept_fields]

    def __getitem__(self, batch_index):
        first = batch_index * self.batchsize
        last = first + self.batchsize
        return self.converter(self.coords[first:last])

        
class BoundTestGenerator(Sequence):
    """Keras ``Sequence`` over the bound windows of a labelled BED file.

    A window counts as bound when the last whitespace-separated field of its
    line is "1". Each batch is whatever ``PyfaidxCoordsToVals`` returns for the
    batch's coordinates (presumably encoded sequences -- confirm against
    seqdataloader). Only full batches are served: the step count is floored,
    so trailing windows that do not fill a batch are never requested.
    """

    def __init__(self, batchsize, val_file):
        self.valfile = val_file
        self.get_steps(batchsize)
        if "hg38" not in val_file:
            assert False  # not expecting to use this
            genome_key = "mm10"
        else:
            genome_key = "hg38"
        self.converter = PyfaidxCoordsToVals(GENOMES[genome_key])
        self.batchsize = batchsize
        self.get_bound_coords()

    def get_steps(self, batchsize):
        """Set ``self.steps`` to the number of full batches of bound windows."""
        n_bound = 0
        with open(self.valfile) as bed:
            for row in bed:
                if row.rstrip().split()[-1] == "1":
                    n_bound += 1
        self.steps = n_bound // batchsize

    def __len__(self):
        return self.steps

    def get_bound_coords(self):
        """Collect the coordinates of every bound window; require at least one."""
        bound_fields = []
        with open(self.valfile) as bed:
            for row in bed:
                fields = row.split()
                if fields[-1] == "1":
                    bound_fields.append(fields[:3])
        self.coords = [Coordinates(c[0], int(c[1]), int(c[2])) for c in bound_fields]
        assert len(bound_fields) > 0

    def __getitem__(self, batch_index):
        first = batch_index * self.batchsize
        last = first + self.batchsize
        batch = self.coords[first:last]
        assert len(batch) > 0
        return self.converter(batch)
    

def get_preds_batched_fast(model, batch_size, bound, val_file, skip = None):
    """Predict with ``model`` on the bound or unbound windows of ``val_file``.

    ``bound`` selects the generator; ``skip`` is only forwarded to the unbound
    generator. Predictions are produced with 8 multiprocessing workers and
    returned with singleton dimensions squeezed out.
    """
    generator = (BoundTestGenerator(batch_size, val_file) if bound
                 else UnboundTestGenerator(batch_size, val_file, skip))
    raw_preds = model.predict_generator(generator,
                                        use_multiprocessing = True,
                                        workers = 8, verbose = 1)
    return np.squeeze(raw_preds)


def get_avg_preds_on_seqs(models, bound, val_file, skip = None):
    """Average, across ``models``, the predictions on one set of windows.

    Every model predicts on the same windows with a batch size of 1024; the
    element-wise mean over models is returned.
    """
    per_model_preds = []
    for model in models:
        per_model_preds.append(get_preds_batched_fast(model, 1024, bound, val_file, skip = skip))
    return np.mean(np.array(per_model_preds), axis = 0)

In [7]:
### Alu functions

def get_window_starts(bed_file, skip = None, retain_bound = False):
    """Return window start coordinates (second column) from ``bed_file``.

    With ``skip`` left as None, every line contributes its start regardless of
    label. Otherwise every ``skip``-th unbound line (last field "0") is kept,
    and bound lines (last field "1") are kept only when ``retain_bound`` is
    true.
    """
    starts = []
    unbound_seen = 0
    with open(bed_file) as bed:
        for row in bed:
            fields = row.rstrip().split()
            if skip is None:
                starts.append(int(fields[1]))
                continue
            label = fields[-1]
            if label == "0":
                keep = (unbound_seen % skip == 0)
                unbound_seen += 1
            else:
                keep = retain_bound and label == "1"
            if keep:
                starts.append(int(fields[1]))
    return starts


def get_bound_window_starts(bed_file):
    """Return the start coordinates of lines in ``bed_file`` whose last field is "1"."""
    with open(bed_file) as bed:
        split_rows = (row.rstrip().split() for row in bed)
        return [int(fields[1]) for fields in split_rows if fields[-1] == "1"]


def get_unbound_window_starts(bed_file, skip = None):
    """Return start coordinates of unbound lines (last field "0") in ``bed_file``.

    With ``skip`` set, only every ``skip``-th unbound line is kept, counting
    from the first unbound line in the file.
    """
    def _unbound_rows(handle):
        # Stream the split fields of unbound lines only.
        for row in handle:
            fields = row.rstrip().split()
            if fields[-1] == "0":
                yield fields

    with open(bed_file) as bed:
        return [int(fields[1])
                for position, fields in enumerate(_unbound_rows(bed))
                if skip is None or position % skip == 0]


def matches_across_sorted_lists(list_a, list_b):
    """Flag, for each item of ``list_a``, whether it is matched in ``list_b``.

    Both lists are expected in ascending order; a single cursor walks
    ``list_b`` while ``list_a`` is scanned. A matched ``list_b`` entry is
    consumed, so a value repeated in ``list_a`` needs its own copy in
    ``list_b`` to match again. Returns a list of booleans aligned with
    ``list_a``.
    """
    flags = []
    cursor = 0
    n_b = len(list_b)
    for target in list_a:
        # Skip the entries of list_b that are smaller than the current target.
        while cursor < n_b and list_b[cursor] < target:
            cursor += 1
        if cursor < n_b and list_b[cursor] == target:
            flags.append(True)
            cursor += 1  # consume the matched entry
        else:
            flags.append(False)
    return flags


def get_alu_intersect(tf, skip = None):
    """Label each hg38 test window of ``tf`` by whether it overlaps an Alu.

    Returns a dict with "unbound" and "bound" keys, each a list of booleans
    aligned with the corresponding windows of the test BED file (``skip``
    subsamples the unbound windows only).
    """
    test_bed = get_test_bed_file("hg38", tf)
    starts_by_label = {
        "unbound" : get_unbound_window_starts(test_bed, skip),
        "bound" : get_bound_window_starts(test_bed),
    }
    # The intersect file lists only windows that overlap an Alu, so every line
    # is read (skip = None).
    alu_starts = get_window_starts(get_alu_intersect_file_chr2(), None, retain_bound = True)
    return { "unbound" : matches_across_sorted_lists(starts_by_label["unbound"], alu_starts),
             "bound" : matches_across_sorted_lists(starts_by_label["bound"], alu_starts) }



def get_repeat_intersect(tf, repeat_type, skip = None):
    """Label each hg38 test window of ``tf`` by overlap with ``repeat_type`` repeats.

    Returns a dict with "unbound" and "bound" keys, each a list of booleans
    aligned with the corresponding windows of the test BED file (``skip``
    subsamples the unbound windows only).
    """
    test_bed = get_test_bed_file("hg38", tf)
    starts_by_label = {
        "unbound" : get_unbound_window_starts(test_bed, skip),
        "bound" : get_bound_window_starts(test_bed),
    }
    # The intersect file lists only windows that overlap this repeat class, so
    # every line is read (skip = None).
    repeat_starts = get_window_starts(get_repeat_intersect_file_chr2(repeat_type),
                                      None, retain_bound = True)
    return { "unbound" : matches_across_sorted_lists(starts_by_label["unbound"], repeat_starts),
             "bound" : matches_across_sorted_lists(starts_by_label["bound"], repeat_starts) }


In [None]:
# Averaged predictions, indexed as preds_dict["bound" | "unbound"][tf][train_species].
preds_dict = defaultdict(lambda: defaultdict(lambda: dict()))

for tf in tfs:
    print("\n=====", tf, "=====\n")
    
    # All predictions are made on the human (hg38) chr2 test windows.
    val_file = get_test_bed_file("hg38", tf)
    
    for train_species in all_trainspecies:
        print("Loading models...")
        # One model per training run (runs 1-5); predictions are averaged over them.
        models = get_models_all_runs(tf, train_species)
    
        print("Predicting on bound sequences with " + train_species + "-trained models...")
        preds_dict["bound"][tf][train_species] = get_avg_preds_on_seqs(models, True, val_file)
        print("Predicting on unbound sequences with " + train_species + "-trained models...")
        # Only the unbound set is subsampled (skip = SKIP); labels used later
        # must be built with the same skip to stay aligned.
        preds_dict["unbound"][tf][train_species] = get_avg_preds_on_seqs(models, False, val_file,
                                                                        skip = SKIP)
        # Drop the model references and reset the Keras backend session before
        # the next set of models is loaded (presumably to free memory).
        del train_species, models
        keras.backend.clear_session()
# Remove the loop variable from the notebook namespace.
del tf


===== CTCF =====

Loading models...
Predicting on bound sequences with mm10-trained models...
Predicting on unbound sequences with mm10-trained models...
Loading models...
Predicting on bound sequences with hg38-trained models...
Predicting on unbound sequences with hg38-trained models...

===== CEBPA =====

Loading models...
Predicting on bound sequences with mm10-trained models...
Predicting on unbound sequences with mm10-trained models...
Loading models...
Predicting on bound sequences with hg38-trained models...
Predicting on unbound sequences with hg38-trained models...

===== Hnf4a =====

Loading models...
Predicting on bound sequences with mm10-trained models...
Predicting on unbound sequences with mm10-trained models...
Loading models...
Predicting on bound sequences with hg38-trained models...
Predicting on unbound sequences with hg38-trained models...

In [16]:
def calc_row_info(preds_dict, alu_labels, train_species, tf):
    """Compute the fraction of repeat-overlapping windows in each error category.

    Parameters
    ----------
    preds_dict : mapping
        ``preds_dict["bound" | "unbound"][tf][species]`` holds a 1-D sequence
        of predictions for that window set.
    alu_labels : dict
        ``{"unbound": [...], "bound": [...]}`` flags marking which windows
        overlap the repeat class. Each list may be longer than the matching
        predictions; only the leading entries are used. The dict is not
        modified.
    train_species : sequence of two keys
        ``(species1, species2)``; "model1" below is the species1-trained model.
    tf : str
        TF key into ``preds_dict``.

    Returns
    -------
    tuple of six floats, in this order:
        fraction of bound windows that overlap the repeat;
        the same fraction among bound windows both models score below 0.5;
        the same fraction among bound windows where model2 - model1 > 0.5;
        fraction of unbound windows that overlap the repeat;
        the same fraction among unbound windows both models score above 0.5;
        the same fraction among unbound windows where model1 - model2 > 0.5.
        A fraction whose category is empty is NaN.

    Raises
    ------
    ValueError
        If a label list is shorter than the corresponding predictions.
    """
    def _fraction(count, total):
        # An empty category has no defined fraction; return NaN rather than
        # letting ZeroDivisionError abort the whole table.
        return count / total if total else float("nan")

    species1, species2 = train_species
    model1_bound_preds = np.asarray(preds_dict["bound"][tf][species1])
    model2_bound_preds = np.asarray(preds_dict["bound"][tf][species2])[:len(model1_bound_preds)]
    model1_unbound_preds = np.asarray(preds_dict["unbound"][tf][species1])
    model2_unbound_preds = np.asarray(preds_dict["unbound"][tf][species2])[:len(model1_unbound_preds)]

    # Labels cover every window in the file while predictions stop at the last
    # full batch, so trim local copies instead of mutating the caller's dict.
    bound_labels = np.asarray(alu_labels["bound"][:len(model1_bound_preds)], dtype = bool)
    unbound_labels = np.asarray(alu_labels["unbound"][:len(model1_unbound_preds)], dtype = bool)
    if (len(bound_labels) != len(model1_bound_preds)
            or len(unbound_labels) != len(model1_unbound_preds)):
        raise ValueError("alu_labels has fewer entries than there are predictions.")

    bound_frac_alu = _fraction(int(np.count_nonzero(bound_labels)), len(bound_labels))
    unbound_frac_alu = _fraction(int(np.count_nonzero(unbound_labels)), len(unbound_labels))

    # Bound windows: model1 scores far below model2, or both score below 0.5.
    model1_FN_mask = (model2_bound_preds - model1_bound_preds) > 0.5
    both_FN_mask = (model2_bound_preds < 0.5) & (model1_bound_preds < 0.5)
    bound_model1_FNs = int(np.count_nonzero(model1_FN_mask))
    bound_model1_FN_alus = int(np.count_nonzero(model1_FN_mask & bound_labels))
    bound_both_models_FNs = int(np.count_nonzero(both_FN_mask))
    bound_both_models_FN_alus = int(np.count_nonzero(both_FN_mask & bound_labels))
    model1_FN_frac_alu = _fraction(bound_model1_FN_alus, bound_model1_FNs)
    both_models_FN_frac_alu = _fraction(bound_both_models_FN_alus, bound_both_models_FNs)

    # Unbound windows: model1 scores far above model2, or both score above 0.5.
    model1_FP_mask = (model1_unbound_preds - model2_unbound_preds) > 0.5
    both_FP_mask = (model1_unbound_preds > 0.5) & (model2_unbound_preds > 0.5)
    unbound_model1_FPs = int(np.count_nonzero(model1_FP_mask))
    unbound_model1_FP_alus = int(np.count_nonzero(model1_FP_mask & unbound_labels))
    unbound_both_models_FPs = int(np.count_nonzero(both_FP_mask))
    unbound_both_models_FP_alus = int(np.count_nonzero(both_FP_mask & unbound_labels))

    # Raw false-negative counts, kept so the cell output matches earlier runs.
    print(bound_model1_FN_alus)
    print(bound_model1_FNs)
    print(bound_both_models_FN_alus)
    print(bound_both_models_FNs)
    model1_FP_frac_alu = _fraction(unbound_model1_FP_alus, unbound_model1_FPs)
    both_models_FP_frac_alu = _fraction(unbound_both_models_FP_alus, unbound_both_models_FPs)

    return bound_frac_alu, both_models_FN_frac_alu, model1_FN_frac_alu, unbound_frac_alu, both_models_FP_frac_alu, model1_FP_frac_alu
    
    
def quantify_repeat_overlap(preds_dict, repeat_type, train_species = ["mm10", "hg38"], skip = SKIP):
    """Compute one ``calc_row_info`` row per TF for a single repeat class.

    Parameters
    ----------
    preds_dict : mapping
        Predictions indexed as ``preds_dict["bound" | "unbound"][tf][species]``.
    repeat_type : str
        Repeat class whose intersect file provides the window labels.
    train_species : sequence of two keys
        Forwarded to ``calc_row_info``; the default list is only read.
    skip : int or None
        Subsampling applied to the unbound windows when building labels. It
        defaults to the notebook-wide ``SKIP`` so labels stay aligned with the
        unbound predictions, which are also generated with ``skip = SKIP``.

    Returns
    -------
    dict
        Maps each TF in ``tfs`` to its row tuple; each row is also printed.
    """
    rows = {}
    for tf in tfs:
        # Labels must be subsampled exactly as the predictions were, otherwise
        # label i no longer describes prediction i.
        repeat_labels = get_repeat_intersect(tf, repeat_type, skip = skip)
        row = calc_row_info(preds_dict, repeat_labels, train_species, tf)
        print(row)
        rows[tf] = row
    return rows


# Overlap statistics for every repeat class: repeat_overlaps[repeat_type][tf] -> row.
repeat_overlaps = {}
for repeat_type in REPEAT_TYPES:
    print(repeat_type)
    rows_for_type = quantify_repeat_overlap(preds_dict, repeat_type)
    repeat_overlaps[repeat_type] = rows_for_type

DNA
146
1216
285
2452
(0.10074869791666667, 0.11623164763458402, 0.12006578947368421, 0.11368269473668569, 0.07221022113580808, 0.07930834366509272)
0
5
130
1187
(0.12265625, 0.10951979780960404, 0.0, 0.113554826843044, 0.12978160317444015, 0.09526093958811636)
70
605
417
3656
(0.10773689516129033, 0.11405908096280087, 0.11570247933884298, 0.11366112769473559, 0.09841542362306463, 0.09035030964044737)
306
2794
961
8631
(0.10141674440298508, 0.1113428339705712, 0.10952040085898354, 0.1138185705490149, 0.10216150958373989, 0.09406507582726437)
LINE
303
1216
545
2452
(0.17964680989583334, 0.22226753670473084, 0.24917763157894737, 0.3760703645090347, 0.14601216531180716, 0.24785670948851196)
0
5
305
1187
(0.255390625, 0.2569502948609941, 0.0, 0.37565421559155765, 0.2900511156075499, 0.32296770951124515)
186
605
927
3656
(0.21046496975806453, 0.2535557986870897, 0.3074380165289256, 0.37617592268639827, 0.21847772698912787, 0.2974304464368241)
597
2794
2443
8631
(0.20824102145522388, 0.28304

In [13]:
def print_table(rows, header = None, row_order = None):
    """Print ``rows`` as a LaTeX ``table*`` with one line per TF.

    Parameters
    ----------
    rows : dict
        Maps a TF key (an entry of the module-level ``tfs`` list) to a
        sequence of fractions; each is printed as a percentage with one
        decimal place, and the last value of every row is set in bold.
    header : str or None
        Column header line (without the trailing rule); a default matching the
        six columns of ``calc_row_info`` is used when None.
    row_order : sequence or None
        Order in which to print the TF keys; defaults to the order of ``rows``.

    The displayed TF name is looked up in ``tfs_latex_names`` at the position
    the key has in ``tfs``.
    """
    print(r'\begin{table*}[!pb]')
    print(r'\vspace{-10pt}')
    print(r'\processtable{Caption.\label{Tab:01}} {\setlength{\tabcolsep}{1.2em}\begin{tabular}{@{}c|cccccc@{}}\toprule')

    if header is None:
        header = "TF & Bound Sites & FNs (Both Models) & FNs (Mouse Only) & Unbound Sites & FPs (Both Models) & FPs (Mouse Only)"
    print(header + r' \\\midrule')

    if row_order is None:
        row_order = list(rows.keys())

    # Pick the closing rule by position rather than by string identity, so
    # only the final line gets \botrule and an empty table does not raise.
    last_index = len(row_order) - 1
    for index, row_key in enumerate(row_order):
        row_as_str = ["%0.1f" % (100 * num) + r'\%' for num in rows[row_key]]
        row_as_str[-1] = r'\textbf{' + row_as_str[-1] + r'}'
        tf_fancy_name = tfs_latex_names[tfs.index(row_key)]
        line_end = r' \\\botrule' if index == last_index else r' \\'
        print(tf_fancy_name + " & " + " & ".join(row_as_str) + line_end)

    print(r'\end{tabular}}{}')
    print(r'\vspace*{-10pt}')
    print(r'\end{table*}')
    
# Emit one LaTeX table per repeat class, each preceded by the class name.
for repeat_type in REPEAT_TYPES:
    print("\n\n" + repeat_type + "\n")
    print_table(repeat_overlaps[repeat_type])



DNA

\begin{table*}[!pb]
\vspace{-10pt}
\processtable{Caption.\label{Tab:01}} {\setlength{\tabcolsep}{1.2em}\begin{tabular}{@{}c|cccccc@{}}\toprule
TF & Bound Sites & FNs (Both Models) & FNs (Mouse Only) & Unbound Sites & FPs (Both Models) & FPs (Mouse Only) \\\midrule
CTCF & 10.1\% & 11.6\% & 12.0\% & 11.4\% & 7.2\% & \textbf{7.9\%} \\
CEBPα & 12.3\% & 11.0\% & 0.0\% & 11.4\% & 13.0\% & \textbf{9.5\%} \\
HNF4α & 10.8\% & 11.4\% & 11.6\% & 11.4\% & 9.8\% & \textbf{9.0\%} \\
RXRα & 10.1\% & 11.1\% & 11.0\% & 11.4\% & 10.2\% & \textbf{9.4\%} \\\botrule
\end{tabular}}{}
\vspace*{-10pt}
\end{table*}


LINE

\begin{table*}[!pb]
\vspace{-10pt}
\processtable{Caption.\label{Tab:01}} {\setlength{\tabcolsep}{1.2em}\begin{tabular}{@{}c|cccccc@{}}\toprule
TF & Bound Sites & FNs (Both Models) & FNs (Mouse Only) & Unbound Sites & FPs (Both Models) & FPs (Mouse Only) \\\midrule
CTCF & 18.0\% & 22.2\% & 24.9\% & 37.6\% & 14.6\% & \textbf{24.8\%} \\
CEBPα & 25.5\% & 25.7\% & 0.0\% & 37.6\% & 29.0\% &