In [9]:
import plot_params as pp
from chr_functions import makeCoorAnnot
import config_readwrite as crw

import matplotlib.pyplot as plt
import os
import sys
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pp.fonts()

('sans-serif', 'Arial', 18)

In [10]:
# Path to the project config file, resolved against the current working
# directory (so the notebook must be launched from the folder that holds it).
cfn_file = os.path.join(os.getcwd(), "config.bat2.ini")

# `config` is indexed as config[section][key] further down; `cfn` is handed back
# to crw.write() at the end of the notebook.
# NOTE(review): presumably `config` is a ConfigParser-like object and `cfn` the
# config file name - confirm against config_readwrite.read.
config, cfn = crw.read(cfn_file)

# FUNCTIONS

In [11]:
def makeBed(file):
    """
    Write a 4-column, tab-separated .bed copy of a peak file.

    input
        file (str) - tab-separated file; only its first four columns are read.
                     Every row is treated as data (no header row is expected).

    method
        1. derive the output name by swapping the input extension for ".bed"
        2. read columns 0-3 and label them #chr, start, end, peak_id
        3. write the labelled table (header row included, no index)

    return
        bed_path (str) - path of the written .bed file
    """
    #1
    stem, _extension = os.path.splitext(file)
    bed_path = stem + ".bed"

    #2
    column_names = ['#chr', "start", "end", "peak_id"]
    peaks = pd.read_csv(file, sep='\t', usecols=[0, 1, 2, 3], names=column_names)

    #3
    peaks.to_csv(bed_path, sep='\t', index=False)

    return bed_path

In [4]:
def chainSwap(file):
    """
    Swap the hg38->eptFus1 chain into an eptFus1->hg38 chain.

    Runs the UCSC chainSwap executable through the shell on hard-coded
    input/output chain paths.

    input
        file - not used by this function; the paths below are fixed.

    return
        None (the exit status of the shell command is not checked)
    """
    executable = "/wynton/home/ahituv/fongsl/bin/ucsc_exe/chainSwap"
    chain_in = "/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/chain/hg38.eptFus1.all.chain.gz"
    chain_out = "/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/chain/eptFus1ToHg38.chain.gz"

    # run command in command line
    os.system(" ".join((executable, chain_in, chain_out)))

In [5]:
def liftOver(bed_file, to_build, from_build, minmatch, chainfile):
    """
    Run the liftover_bed-wynton.py script on a bed file through the shell.

    input
        bed_file (str) - bed file to lift over
        to_build (str) - target genome build
        from_build (str) - source genome build
        minmatch (str) - value passed to the script's -m option
                         (must already be a string, it is joined as-is)
        chainfile (str) - chain file passed to the script's -c option

    method
        1. assemble the command: python <script> <bed> <from> <to> -m <minmatch> -c <chain>
           (note: from_build comes BEFORE to_build on the command line)
        2. print the command
        3. run it with os.system

    return
        None (the exit status of the shell command is not checked)
    """
    script = "/wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py"

    #1
    positional = [bed_file, from_build, to_build]
    options = ["-m", minmatch, "-c", chainfile]
    command = " ".join(["python", script, *positional, *options])

    #2
    print(command)

    #3
    os.system(command)

In [6]:
def eptFusMapChainScaf(efus_regions, efus_mapping_file):
    """
    problem
        UCSC liftover chains are keyed by chain name, not scaffold name.
        Efus regions are in scaffold coordinates, so they must be relabelled
        with chain names before they can be lifted over.

    solution
        Replace the scaffold in the #chr position with its chain name.

    input
        efus_regions (str) - tab-separated file with efus peaks and a header row
                             (#chr, start, end, peak_id). OVERWRITTEN in place.
        efus_mapping_file (str) - headerless tab-separated file, chain name in
                                  the first column and scaffold in the second

    method
        1. open the regions; stop early if a #chain column is already there
        2. open the scaffold-to-chain mapping
        3. left-merge on the shared #chr column and put #chain first
        4. re-save the regions file with the chain in the chromosome position

    return
        region_chains (pd.dataframe) - regions with chains in the first column
                                       (the table as read, if already swapped)
    """
    #1
    regions = pd.read_csv(efus_regions, sep='\t')

    if "#chain" in regions.columns:
        print('already swapped scaffold for chain for liftOver purposes')
        return regions

    #2
    scaffold_to_chain = pd.read_csv(
        efus_mapping_file, sep='\t', names=["#chain", "#chr"])

    #3
    bed_like_order = ['#chain', 'start', "end", "peak_id", "#chr"]
    region_chains = regions.merge(scaffold_to_chain, how="left")[bed_like_order]

    #4
    region_chains.to_csv(efus_regions, sep='\t', index=False)

    return region_chains
    

In [35]:
def keepReciprocalPeaks(hg38_file, reciprocal_file):
    """
    Keep only the hg38 peaks whose identifier is also present after the
    reciprocal liftover back to the bat genome.

    input
        hg38_file (str) - headerless tab-separated file of peaks lifted to hg38;
                          peak identifier in the 4th column
        reciprocal_file (str) - headerless tab-separated file of the reciprocal
                                liftover; peak identifier in the 4th column

    method
        1. collect the set of peak identifiers in the reciprocal file
        2. keep the hg38 rows whose identifier is in that set
        3. sort by the first three columns and write to <hg38_file stem>.clean.bed

    return
        CLEAN_FILE (str) - path of the written file
    """
    #1 - only the identifier column is needed from the reciprocal file
    rec_df = pd.read_csv(reciprocal_file, sep='\t', header=None, usecols=[3])
    rec = set(rec_df[3])
    print(len(rec))

    #2
    hg38 = pd.read_csv(hg38_file, sep='\t', header=None)
    print(hg38.shape)

    hg38_clean = hg38.loc[hg38[3].isin(rec)].copy()

    #3 - splitext, not split: os.path.split(...)[0] is the parent DIRECTORY, which
    # put the output at "<dir>.clean.bed" next to the data folder instead of
    # next to the input file.
    CLEAN_FILE = os.path.splitext(hg38_file)[0] + ".clean.bed"

    # NOTE(review): the frame was read with header=None, so to_csv's default
    # header writes the integer column labels (0..n) as the first line. Kept
    # as-is for downstream compatibility; pass header=False for a strict BED.
    hg38_clean.sort_values(by=[0, 1, 2]).to_csv(
        CLEAN_FILE, sep='\t', index=False)

    return CLEAN_FILE

## PARAMS

In [36]:
# (config section, bat->human chain key, human->bat chain key) for each species
sections = [
    ('efus', 'eptfus_hg38', 'hg38_eptfus'),
    ('ajam', 'artjam_hg38', 'hg38_artjam'),
]

# tissue keys looked up in each species' config section
# (promoter sets are switched off: 'liver_promoter', 'pectr_promoter', 'smint_promoter')
tissues = ['liver', 'pectr', 'smint']

# species -> (from_build, to_build) for the first liftover
liftover_runs = dict(
    ajam=("artJam2", 'hg38'),
    efus=("eptFus1", 'hg38'),
)

# species -> (formal species name, file-naming prefix)
formal_names = dict(
    ajam=("Artibeus_jamaicensis", "fruit_a500"),
    efus=('Eptesicus_fuscus', "insect_a500"),
)

BUILD = "hg38"
MINMATCH = "0.1"  # kept as a string: it is joined into a shell command
section = "hg38_peaks"

In [37]:
# Driver: for every species section and tissue, build a bed file, lift it over
# to human, lift the result back to the bat build, keep the peaks present in
# both, and record each file path in `config`.
# NOTE: the loop variable `section` rebinds the module-level `section` name.
for section, chain1, chain2 in sections:
    for tissue in tissues:

        # input peak file and the two chain files for this species
        FILE = config[section][tissue]
        CHAIN1 = config["chain"][chain1]  # bat -> hu
        CHAIN2 = config["chain"][chain2]  # hu -> bat

        # get naming conventions (formal_name is not used below)
        formal_name, conv_name = formal_names[section]
        PATH = os.path.dirname(FILE)

        # make a bed file (only 4 columns for liftover)
        BED = makeBed(FILE)

        # efus peaks are relabelled from scaffold to chain names before liftover;
        # this rewrites BED in place
        if section == "efus" and "promoter" not in tissue:
            """change efus scaffolds to chains"""

            # file that maps scaffold to chains
            EFUS_SCAFFOLD=config["scaffolds"]["ept_scaf_clean"]

            eptFusMapChainScaf(BED, EFUS_SCAFFOLD)

        # write to config
        config[section][f"{tissue}.bed"]= BED

        # get the to and from builds,
        from_build, to_build = liftover_runs[section]

        # do liftover (bat -> human)
        liftOver(BED, to_build, from_build, MINMATCH, CHAIN1)

        # get liftover filename
        # NOTE(review): this name is rebuilt here rather than returned by the
        # liftover script; presumably it matches what the script writes - confirm.
        LIFTOVER_FILE = os.path.join(PATH, f"{conv_name}.liftOver.to.{to_build}.bed")
        print(LIFTOVER_FILE)

        # write liftover to config
        # ("hg38" is hard-coded in this key; it equals to_build for every entry
        # in liftover_runs)
        config[section][f"{tissue}.to.hg38"] = LIFTOVER_FILE

        # do reciprocal liftover (human -> bat): builds are passed in swapped order
        liftOver(LIFTOVER_FILE, from_build, to_build, MINMATCH, CHAIN2)

        # write reciprocal liftOver to config
        # ("hg38" is hard-coded in this file name as well)
        LIFTOVER_FILE2 = os.path.join(PATH, f"{conv_name}.liftOver.to.hg38.liftOver.to.{from_build}.bed")

        config[section][f"{tissue}.to.{to_build}.to.{from_build}"] = LIFTOVER_FILE2

        # keep only the hg38 peaks whose id is also in the reciprocal liftover
        RECIP_FILE = keepReciprocalPeaks(LIFTOVER_FILE, LIFTOVER_FILE2)

        config[section][f"{tissue}.to.{to_build}.reciprocal"] = RECIP_FILE

# write new beds to config
crw.write(config, cfn)

python /wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Eptesicus_fuscus/liver/insect_a500.bed eptFus1 hg38 -m 0.1 -c /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/chain//eptFus1ToHg38.chain.gz
/wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Eptesicus_fuscus/liver 0.1 <class 'str'>


lifting over /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Eptesicus_fuscus/liver/insect_a500.bed from eptFus1 to hg38 in /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Eptesicus_fuscus/liver 


Sorting .bed /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Eptesicus_fuscus/liver/temp_insect_a500.bed
lifted this already?

 /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Eptesicus_fuscus/liver/insec

/wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Artibeus_jamaicensis/liver 0.1 <class 'str'>


lifting over /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Artibeus_jamaicensis/liver/fruit_a500.liftOver.to.hg38.bed from hg38 to artJam2 in /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Artibeus_jamaicensis/liver 


Sorting .bed /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Artibeus_jamaicensis/liver/temp_fruit_a500.liftOver.to.hg38.bed
lifted this already?

 /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Artibeus_jamaicensis/liver/fruit_a500.liftOver.to.hg38.liftOver.to.artJam2.bed
cleaned up temp file
7664
(8644, 4)
python /wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py /wynton/home/ahituv/fongsl/other_analyses/for-hai_bats/data/CallPeaks_results_for_Sarah/Artibeus_jamaicen