20230630
sarahfong

My goal is to take bat enhancers that have been lifted over to human and found to have significant GREAT associations with nearby genes, and reverse the flow: I will assess whether these bat elements are syntenic in the bat genomes with the human enhancers and the proximal genes that GREAT links to them. 

        re-run GREAT on bat lifted to hg38 peaks. 

        liftover hg38 peaks → bat

        convert TOGA gene ortholog chain → scaffold coordinates

        compute within-scaffolds TOGA-guided intergenic regions for bats

        Assess how often human element-nearest-gene pair is the same as bat element-nearest-gene pair. 

**Nadav’s note** - elements were selected because they were on big scaffolds. It is possible that some genes are on different scaffolds than their elements. In that case, will we have to blacklist these regions?

In [1]:
from chr_functions import makeCoorAnnot
import config_readwrite as crw

import matplotlib.pyplot as plt
import os, sys
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# config

In [2]:
# Path of the project config file, expected in the current working directory.
cfn_file = os.path.join(os.getcwd(), "config.bats.ini")

# crw.read returns two values: the config object (indexed below as
# config[section][key]) and a second value, bound to cfn, that is handed back
# to crw.write whenever the config is saved.
config, cfn = crw.read(cfn_file)

In [3]:
# peak file inputs

# root directory of the peak data, read from the [local] section of the config
PATH = config["local"]['path_peak']

section = "hg38_peaks"
# NOTE(review): crw.check presumably makes sure the section exists before it is
# written to — confirm against config_readwrite.
crw.check(config, section)

# peak files live in a sub-directory named after the section
HG_PATH = os.path.join(PATH, section)

# bat peak calls already lifted to hg38 (per the "hg38-lift" file names);
# ART_* = "Ajam" files, EPT_* = "Efus" files, KID/PAN = kidney/pancreas
ART_KID = os.path.join(HG_PATH, "Ajam.kidney.callpeaks.hg38-lift.bed")
ART_PAN = os.path.join(HG_PATH, "Ajam.pancreas.callpeaks.revised2.cut.hg38-lift.bed")
EPT_KID = os.path.join(HG_PATH, "Efus.kidney.callpeaks.hg38-lift.bed")
EPT_PAN = os.path.join(HG_PATH, "Efus.pancreas.callpeaks.revised2.ucsc.hg38-lift.bed")

# key -> path pairs recorded under the [hg38_peaks] section
write_dict={'path':HG_PATH, 
            "ART_KID":ART_KID,
            "ART_PAN":ART_PAN,
            "EPT_KID":EPT_KID,
            "EPT_PAN":EPT_PAN,
                       }
config = crw.writeConfigDict(write_dict, config, section)

###
# chain file inputs

section = "chains"
crw.check(config, section)

# chain files are looked up under <path_peak>/chains
CHAIN_PATH = os.path.join(PATH, section)
# from  https://hgdownload.soe.ucsc.edu/goldenPath/hg38/vsEptFus1/hg38.eptFus1.all.chain.gz
HG38_EPTFUS1 = os.path.join(CHAIN_PATH, "hg38.eptFus1.all.chain.gz")
HG38_ARTJAM = os.path.join(
    CHAIN_PATH, "hg38.chr1_22.ArtJamW.chain.final")  # from wei
ARTJAM_HG38 = os.path.join(
    CHAIN_PATH, "ArtJamW.chr1_22.hg38.chain.final")  # from wei

# key -> path pairs recorded under the [chains] section
write_dict = {"HG38_EPTFUS1": HG38_EPTFUS1,
              "HG38_ARTJAM": HG38_ARTJAM,
              "ARTJAM_HG38": ARTJAM_HG38
              }
config = crw.writeConfigDict(write_dict, config, section)

###
# liftover script 
section = "liftover"
crw.check(config, section)

# absolute path of the liftover wrapper script; the same path also appears
# inside liftOver()
LIFTOVER_SRC = "/wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py"
config[section]["SRC"] = LIFTOVER_SRC


# save every section edited above in a single write
crw.write(config, cfn)

# format, liftover hg38 peaks file 

## functions

 ### df formatting

In [4]:
def formatBedForGreat(file, coor_id_name, formatted_filename):
    """
    read the first three columns of a headerless .bed file, annotate each row
    with a coordinate id, sort, and write the result once.

    input
        file (str) - tab-separated .bed file without a header row
        coor_id_name (str) - passed to makeCoorAnnot as the coordinate id label
        formatted_filename (str) - where the formatted table is written

    method
        1. read columns 0-2 as "#chr", "start", "end"
        2. add the coordinate annotation with makeCoorAnnot
        3. sort on "#chr", "start", "end"
        4. write a tab-separated file (with header) only if
           formatted_filename does not exist yet

    return
        peaks (pd dataframe) - the sorted, annotated table (returned whether
            or not the file was written)
    """
    bed_fields = ["#chr", "start", "end"]
    chrom, start, end = bed_fields

    # 1
    peaks = pd.read_csv(file, sep='\t', header=None,
                        names=bed_fields, usecols=[0, 1, 2])

    # 2 - custom script, turns the .bed fields into a coordinate annotation
    peaks = makeCoorAnnot(peaks, chrom, start, end, coor_id_name)

    # 3
    peaks = peaks.sort_values(by=bed_fields)

    # 4 - an existing file is left untouched
    if not os.path.exists(formatted_filename):
        peaks.to_csv(formatted_filename, sep='\t', index=False)
        print("wrote", formatted_filename)

    return peaks

### liftover peaks function

In [8]:
def liftOver(bed_file, to_build, from_build, minmatch, chainfile, *, src=None):
    """
    run the liftover wrapper script on a .bed file through the shell.

    input
        bed_file (str) - .bed file to lift over
        to_build (str) - target genome build
        from_build (str) - source genome build
        minmatch (str or number) - value passed to the script's -m option
        chainfile (str) - chain file passed to the script's -c option
        src (str, keyword-only, optional) - path to the liftover script;
            defaults to the wynton copy of liftover_bed-wynton.py

    method
        1. build the command. NOTE the script receives from_build BEFORE
           to_build, the reverse of this function's parameter order.
        2. print the command, then run it with os.system
        3. report a non-zero exit status instead of silently dropping it

    return
        status (int) - value returned by os.system (0 means success)
    """
    if src is None:
        src = "/wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py"

    # 1 - str() lets callers pass minmatch as a number as well as a string
    cmd = " ".join(str(part) for part in [
        "python",
        src,
        bed_file,
        from_build,
        to_build,
        "-m",
        minmatch,
        "-c",
        chainfile,
    ])

    # 2
    print(cmd)

    # run command in command line
    status = os.system(cmd)

    # 3 - best-effort: warn rather than raise so a loop over files keeps going
    if status != 0:
        print("WARNING: liftover command returned non-zero status", status)

    return status

##  params -  format, liftover 

In [6]:
BEDS = {"ART_KID":ART_KID, 
        "ART_PAN":ART_PAN, 
        "EPT_KID": EPT_KID, 
        "EPT_PAN":EPT_PAN
       }

liftover_runs = {
    "ART_KID":("hg38", "artJam2", HG38_ARTJAM), 
    "ART_PAN":("hg38", "artJam2", HG38_ARTJAM), 
    "EPT_KID":("hg38", "eptFus1", HG38_EPTFUS1), 
    "EPT_PAN":("hg38", "eptFus1", HG38_EPTFUS1)
}

BUILD = "hg38"
MINMATCH = "0.1"
section = "hg38_peaks"

## iterate through peak.bed files
 - format bed files
 - liftover formatted bed files from hg38 -> bat genome

In [9]:
# For every peak file: derive the cleaned file name, record it in the config,
# write the formatted .bed, then lift it over from hg38 to the bat build.
for name, file in BEDS.items():

    # make a new file name from the first two dot-separated fields of the
    # input file name plus the build label
    OUT_PATH, NAME = os.path.split(file)
    OUT_FILE = os.path.join(OUT_PATH, ".".join(NAME.split(".")[:2]) + f".peaks.clean.{BUILD}.bed")

    # write to config
    config[section][f"{name}.clean"] = OUT_FILE
    crw.write(config, cfn)

    # format the bed file
    formatBedForGreat(file, BUILD, OUT_FILE)

    # get the to and from builds, chainfile for this species
    from_build, to_build, chainf = liftover_runs[name]

    # do liftover. liftOver takes five positional arguments
    # (bed_file, to_build, from_build, minmatch, chainfile); the script path
    # is not one of them, so passing LIFTOVER_SRC here raised a TypeError.
    liftOver(OUT_FILE, to_build, from_build, MINMATCH, chainf)

python /wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/Ajam.kidney.peaks.clean.hg38.bed hg38 artJam2 -m 0.1 -c /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/chains/hg38.chr1_22.ArtJamW.chain.final
/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks 0.1 <class 'str'>


lifting over /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/Ajam.kidney.peaks.clean.hg38.bed from hg38 to artJam2 in /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks 


Sorting .bed /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/temp_Ajam.kidney.peaks.clean.hg38.bed
lifted this already?

 /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/Ajam.kidney.peaks.clean.hg38.liftOver.to.artJam2.bed
cleaned up temp file
python /wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py /

## add liftOver peak files to config. 

In [10]:
# Record the paths of the lifted-over peak files in the [liftover] section.
section = "liftover"
crw.check(config, section)

# hg38 -> eptFus1 liftOver outputs; file names follow the
# "<name>.peaks.clean.hg38.liftOver.to.<build>.bed" pattern
EFUS_KID_CLEAN_LIFT = os.path.join(HG_PATH, "Efus.kidney.peaks.clean.hg38.liftOver.to.eptFus1.bed")
EFUS_PAN_CLEAN_LIFT  = os.path.join(HG_PATH, "Efus.pancreas.peaks.clean.hg38.liftOver.to.eptFus1.bed")

ept_peaks = {
    "EFUS_KID_CLEAN_LIFT":EFUS_KID_CLEAN_LIFT ,
    "EFUS_PAN_CLEAN_LIFT":EFUS_PAN_CLEAN_LIFT ,
    
}

config = crw.writeConfigDict(ept_peaks, config, section)


# hg38 -> artJam2 liftOver outputs
AJAM_KID_CLEAN_LIFT  = os.path.join(HG_PATH, "Ajam.kidney.peaks.clean.hg38.liftOver.to.artJam2.bed")
AJAM_PAN_CLEAN_LIFT  = os.path.join(HG_PATH, "Ajam.pancreas.peaks.clean.hg38.liftOver.to.artJam2.bed")

ajam_peaks = {
    "AJAM_KID_CLEAN_LIFT":AJAM_KID_CLEAN_LIFT ,
    "AJAM_PAN_CLEAN_LIFT":AJAM_PAN_CLEAN_LIFT ,
    
}

config = crw.writeConfigDict(ajam_peaks, config, section)
# save both sets of paths
crw.write(config, cfn)

# format scaffold files

## add to config

In [11]:
# Record raw and cleaned scaffold-mapping file paths in the [scaffolds] section.
section = "scaffolds"

# make section
crw.check(config, section)

# get path to scaffolds
SCAF_PATH = config["local"]["path_scaf"]

# get scaffold mappings (raw inputs)
AJAM_SCAF = os.path.join(
    SCAF_PATH, "GCF_014825515.1_WHU_Ajam_v2_assembly_report.txt")

EPT_SCAF = os.path.join(SCAF_PATH, "replace_efus-ucsc_with_efus-ncbi.txt")

# cleaned two-column (chain, scaffold) outputs written by the formatting loop
AJAM_SCAF_CLEAN = os.path.join(
    SCAF_PATH, 'artJam2.chain.scaffold.map.clean.txt')

EPT_SCAF_CLEAN = os.path.join(
    SCAF_PATH, 'eptFus1.chain.scaffold.map.clean.txt')

scaffold_dict = {
    "AJAM_SCAF": AJAM_SCAF,
    "EPT_SCAF": EPT_SCAF,
    "AJAM_SCAF_CLEAN": AJAM_SCAF_CLEAN,
    "EPT_SCAF_CLEAN": EPT_SCAF_CLEAN,
}

config = crw.writeConfigDict(scaffold_dict, config, section)
crw.write(config, cfn)

## params

In [12]:
# species label -> (raw scaffold mapping file, cleaned two-column output file)
scaffolds={"artJam": (AJAM_SCAF, AJAM_SCAF_CLEAN),
           "eptFus":(EPT_SCAF, EPT_SCAF_CLEAN),
          }

## format scaffold files into two columns
- chain, scaffold

In [13]:
# Reduce each raw scaffold mapping to a two-column, tab-separated table
# with the header "#chain", "scaffold".
col_names = ["#chain", "scaffold"]

for key, value in scaffolds.items():

    infile, outfile = value  # (raw input, cleaned output)

    if key == "artJam":
        # tab-separated table with a header row: keep the two accession columns
        df = pd.read_csv(infile, sep='\t',
                         usecols=["GenBank-Accn", "RefSeq-Accn"])

        # rename
        df.columns = col_names

    elif key == "eptFus":
        # "/"-delimited lines without a header: keep fields 1 and 2
        df = pd.read_csv(infile, sep="/", header=None,
                         names=col_names, usecols=[1, 2])

    df.to_csv(outfile, sep='\t', index=False)


# map TOGA GENE ORTHOLOG chains to scaffolds 
- convert scaffold files to a .tsv format w/ two columns: chain, scaffold

## function - add scaffold, make .bed file with scaffold

In [14]:
def addScaffold(ortho_file, scaffold_file, species):
    """
    add scaffold information to TOGA ortholog chain annotations
    make into a .bed file

    input
        ortho_file (str) - TOGA gene ortholog prediction file (where chr is chain id)
        scaffold_file (str) - scaffold:chain mapping file with the columns
            "#chain" and "scaffold"
        species (str) - species label; "eptfus1" keeps the chain id in the
            first column, any other value puts the scaffold there

    method
        1. make an outfile name: drop the last extension and a trailing
           ".bed" (if present), then append ".w.scaffold.bed"
        2. open the scaffold file
            2.1 reformat chain column for merging (text before the first ".")
        3. open ortholog file (first four columns)
            3.1 get gene name (second "."-separated field of the enst column)
        4. merge scaffold + ortholog file on "chain"
            4.1 print both shapes so that data loss after the merge is visible
        5. rearrange columns so that the first column is the .bed chromosome
            5.1 rename that column with a hash,
            # drop enst annotation - duplicates of many genes
            5.2 sort by the first column, start, end
        6. write merged file (always overwrites).

    return
        merged (pd dataframe) - ortholog info w/ scaffold info added
        ortho_out (str) - path to written .bed file w/ ortholog + scaffold info.

    """

    # 1 make new .bed file to write.
    # str.strip(".bed") removes any of the characters ".", "b", "e", "d" from
    # BOTH ends of the path, so only an exact ".bed" suffix is removed here.
    base = os.path.splitext(ortho_file)[0]
    if base.endswith(".bed"):
        base = base[:-len(".bed")]
    ortho_out = base + ".w.scaffold.bed"

    # 2
    scaf = pd.read_csv(scaffold_file, sep='\t')

    # 2.1
    scaf["chain"] = scaf["#chain"].apply(lambda x: x.split(".")[0])

    # 3
    ortho = pd.read_csv(ortho_file, sep='\t',
                        header=None, usecols=[0, 1, 2, 3],
                        names=["chain", "start", "end", "enst"]
                        )
    # 3.1
    ortho["gene_name"] = ortho["enst"].apply(lambda x: x.split(".")[1])

    # 4 add scaffold info via chain info ("chain" is the only shared column)
    merged = pd.merge(ortho, scaf, on="chain")

    # 4.1 check that there is no data loss after merge
    print(ortho.shape, merged.shape)

    # 5 rearrange as .bed
    if species == "eptfus1":

        # drop enst annotation - keeping enst annotation produces duplicates of many genes
        # ugh. The golden path liftover uses chain as #chr, not scaffold for eptfus1...
        merged = merged[['chain', 'start', 'end',
                         'gene_name', '#chain', 'scaffold']].drop_duplicates()

        # 5.2 sort_values returns a new frame; keep it
        merged = merged.sort_values(by=['chain', 'start', 'end'])

        # 5.1 rename column
        merged = merged.rename(columns={"chain": "#chain",
                                        "#chain": "chain_"})

    else:
        merged = merged[['scaffold', 'start', 'end',
                         'gene_name', '#chain', 'chain']].drop_duplicates()

        # 5.1 rename column
        merged = merged.rename(columns={"scaffold": "#scaffold"})

        # 5.2 sort_values returns a new frame; keep it
        merged = merged.sort_values(by=['#scaffold', 'start', 'end'])

    # 6 write
    merged.to_csv(ortho_out, sep='\t', index=False)

    return merged, ortho_out

## add scaffold info to TOGA gene ortholog prediction chains

In [15]:
# params - human only
# config section -> list of (config key of the TOGA ortholog file,
#                            cleaned scaffold mapping file)
gene_ortho = {"local_toga-human_hg38_reference":
              [('hlartjam2.geneannotation.bed.gz',AJAM_SCAF_CLEAN),
               ('eptfus1.geneannotation.bed.gz', EPT_SCAF_CLEAN)]
              }

# add scaffold info
# NOTE: the loop variable rebinds the module-level name `section`
for section, orthofiles in gene_ortho.items():

    for orthoval, scaffold_file in orthofiles:

        # species label = text before the first "." of the config key;
        # addScaffold branches on this value
        species = orthoval.split(".")[0]
        print(species)

        # get the gene ortholog file
        ortho_file = config[section][orthoval]
    
        # merge TOGA gene ortholog chains + scaffolds
        merged, ortho_out = addScaffold(ortho_file, scaffold_file, species)
        
        # write to config (in memory only; this cell does not call crw.write)
        config[section][f"gene_scaffold_{species}"] = ortho_out
        print(ortho_out)

hlartjam2
(66067, 5) (66067, 7)
/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/toga/HLartJam2.hg38-ref.geneAnnotation.w.scaffold.bed
eptfus1
(53938, 5) (53938, 7)
/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/toga/eptFus1.hg38-ref.geneAnnotation.w.scaffold.bed


# intersect bat peaks with nearest TOGA ortholog gene 

## bedtools sort and closest upstream, downstream

In [16]:
# lifted peak file (key in [liftover]) -> TOGA ortholog + scaffold .bed
# (key in [local_toga-human_hg38_reference])
int_pairs = {"efus_kid_clean_lift": "gene_scaffold_eptfus1",
             "efus_pan_clean_lift": "gene_scaffold_eptfus1",
             "ajam_kid_clean_lift": "gene_scaffold_hlartjam2",
             "ajam_pan_clean_lift": "gene_scaffold_hlartjam2",
             }

collection = {}  # collect results: ID -> path of the closest-gene file

for peak, ortho_gene in int_pairs.items():
    PEAK = config["liftover"][peak]
    ORTHO_GENE = config["local_toga-human_hg38_reference"][ortho_gene]

    ID = f"{peak}.x.{ortho_gene}"
    OUT = os.path.join(HG_PATH, f"{ID}.bed")

    for FILE in [PEAK, ORTHO_GENE]:

        # sort in place: write to the scratch file "t" in the working
        # directory, then move it over the input
        cmd = " ".join(["bedtools sort -i",
                        FILE,
                        "> t && mv t",
                        FILE
                        ])
        #print("\n\nsort", cmd)
        os.system(cmd)

    # find closest upstream, downstream
    flags = ["-fu", "-fd"]

    # bedtools closest
    # The command used to be built once per flag but never run, and the
    # disabled os.system call sat after the loop, where only the last flag's
    # command could have been executed. Run it once per flag instead.
    # Results are appended (>>) to OUT, so skip everything when OUT already
    # exists; re-running would otherwise duplicate rows.
    if not os.path.exists(OUT):
        for flag in flags:

            cmd = " ".join(["bedtools closest -a",
                            PEAK,
                            "-b",
                            ORTHO_GENE,
                            flag,
                            "-D 'a' >>",
                            OUT
                            ])
            #print("\n\nclosest", cmd)
            os.system(cmd)

    print("\n\n", OUT)
    # add to collection dictionary
    collection[ID] = OUT




 /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/efus_kid_clean_lift.x.gene_scaffold_eptfus1.bed


 /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/efus_pan_clean_lift.x.gene_scaffold_eptfus1.bed


 /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/ajam_kid_clean_lift.x.gene_scaffold_hlartjam2.bed


 /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/ajam_pan_clean_lift.x.gene_scaffold_hlartjam2.bed


## write bat nearest gene section

In [17]:
# write bat nearest gene section
# `section` stays bound to "nearest_gene" and is read again by the later
# cells that index config[section]
section = "nearest_gene"
crw.check(config, section)

# collection maps "<peak>.x.<ortholog>" ids to closest-gene file paths
config = crw.writeConfigDict(collection, config, section)
crw.write(config, cfn)

## write human nearest gene section

In [18]:
# GREAT region-gene association tables, stored under <HG_PATH>/great
PATH_GREAT = os.path.join(HG_PATH, "great")
AJAM_KID_HU_GENE = os.path.join(PATH_GREAT, "Ajam.kidney.peaks.clean.hg38-all-region.txt")
AJAM_PAN_HU_GENE = os.path.join(PATH_GREAT, "Ajam.pancreas.peaks.clean.hg38-all-region.txt")
# NOTE(review): this is the only name written "hg38.all-region" instead of
# "hg38-all-region" — confirm it matches the file on disk.
EFUS_KID_HU_GENE = os.path.join(PATH_GREAT, "Efus.kidney.peaks.clean.hg38.all-region.txt")
EFUS_PAN_HU_GENE = os.path.join(PATH_GREAT, "Efus.pancreas.peaks.clean.hg38-all-region.txt")

# written to whatever `section` currently names (set to "nearest_gene" by the
# preceding cell)
hu_nearest_gene = {"path_great":PATH_GREAT,
                   "AJAM_KID_HU_GENE":AJAM_KID_HU_GENE,
                   "AJAM_PAN_HU_GENE":AJAM_PAN_HU_GENE,
                   "EFUS_KID_HU_GENE":EFUS_KID_HU_GENE,
                   "EFUS_PAN_HU_GENE":EFUS_PAN_HU_GENE,

}
config = crw.writeConfigDict(hu_nearest_gene, config, section)
crw.write(config, cfn)

# How often do nearest genes agree between humans and bats?

In [19]:
def aggregateGenesPerPeak(df, agg_col_name):
    """
    collapse agg_col_name into one comma-joined string per "hg38.coor" value.

    input
        df (pd dataframe) - must contain "hg38.coor" and agg_col_name
        agg_col_name (str) - column whose values are joined with ","

    method
        1. replace missing values with "" so every value can be joined
        2. group on "hg38.coor" and join the values of agg_col_name
           (when agg_col_name is the string "None", None is handed to .agg)

    return
        (pd dataframe) - columns "hg38.coor" and agg_col_name, one row per
            coordinate
    """
    def _comma_join(values):
        return ','.join(values)

    if agg_col_name != "None":
        spec = {agg_col_name: _comma_join}
    else:
        spec = None

    # 1
    filled = df.fillna("")

    # 2
    per_peak = filled.groupby("hg38.coor").agg(spec)

    return per_peak.reset_index()

In [20]:
def makeCoorGeneDf(df, column_list, new_col):
    """
    reshape gene columns into one (hg38 peak coordinate, gene name) row per pair.

    input
        df (pd dataframe) - must contain "hg38.coor" and every column in column_list
        column_list (list) - gene-name columns to stack
        new_col (str) - name of the stacked gene-name column

    return
        pairs (pd dataframe) - unique ("hg38.coor", new_col) rows
        collapsed (pd dataframe) - one row per coordinate, genes comma-joined
            by aggregateGenesPerPeak
    """
    coor = "hg38.coor"

    # wide -> long: one row per (coordinate, gene column)
    stacked = df.melt(id_vars=[coor], value_vars=column_list,
                      value_name=new_col)

    # keep only coordinates, gene names
    pairs = stacked.loc[:, [coor, new_col]].drop_duplicates()

    # aggregate nearest genes
    collapsed = aggregateGenesPerPeak(pairs, new_col)

    # report results
    print(pairs.shape, collapsed.shape)
    print("\n\n")

    return pairs, collapsed

In [21]:
def computeSynteny(hudf, batdf):
    """
    merge human and bat nearest-gene tables and classify every peak by how
    its (human gene, bat gene) rows agree.

    input
        hudf (pd dataframe): "hg38.coor" + "hu_nearest_gene", one row per peak/gene
        batdf (pd dataframe): "hg38.coor" + "bat_nearest_gene", one row per peak/gene

    method
        1. merge hudf and batdf on their shared columns and drop duplicate rows.
           A peak with several genes on both sides yields one row per
           (human gene, bat gene) combination.
        2. "agree" is True when the two gene names are equal after stripping
           whitespace and lower-casing
        3. collect peak ids having at least one agreeing row (trues) and at
           least one non-agreeing row (falses)
        4. not syntenic: peak only in falses
        5. kindof syntenic: peak in both falses and trues
        6. syntenic: peak only in trues (every row agrees)
        7. print the three counts

    return
        not_syn (set) - peak ids with no agreeing row
        syn_kindof (set) - peak ids with agreeing and non-agreeing rows
        syn_true (set) - peak ids where every row agrees
        merge (pd dataframe) - merged rows with the boolean "agree" column

    """
    # 1 merge bat and human nearest gene
    merge = pd.merge(hudf, batdf).drop_duplicates()

    # 2 normalise both names, then compare row by row
    hu_name = merge['hu_nearest_gene'].str.strip().str.lower()
    bat_name = merge["bat_nearest_gene"].str.strip().str.lower()
    merge["agree"] = hu_name == bat_name

    # 3 coordinate ids seen with an agreeing / a non-agreeing row
    agree = merge["agree"]
    trues = set(merge.loc[agree.eq(True), "hg38.coor"])
    falses = set(merge.loc[agree.ne(True), "hg38.coor"])

    # 4 no row agrees
    not_syn = falses - trues

    # 5 some rows agree, some do not
    syn_kindof = falses & trues

    # 6 every row agrees
    syn_true = trues - falses

    # 7
    report = (("0/2 nearest gene is the same, n=", not_syn),
              ("1/2 nearest gene is the same, n=", syn_kindof),
              ("2/2 nearest gene is the same, n=", syn_true))
    for label, peaks in report:
        print(label, len(peaks))
    print("\n\n")

    return not_syn, syn_kindof, syn_true, merge

In [22]:
def getBatGenes(gene_set, mergedf, outfile):
    """
    keep the agreeing rows of mergedf for a set of peak ids and write them out.

    input
        gene_set (set) - values matched against the "hg38.coor" column
        mergedf (pd dataframe) - must contain "hg38.coor" and a boolean "agree" column
        outfile (str) - tab-separated file to write (always overwritten)

    return
        nearest_genes (pd dataframe) - de-duplicated rows whose "hg38.coor" is
            in gene_set and whose "agree" is True
    """
    in_set = mergedf["hg38.coor"].isin(gene_set)

    # "&" binds tighter than "==", so the comparison applies to the combined mask
    keep = (in_set & mergedf['agree']) == True

    nearest_genes = mergedf.loc[keep].drop_duplicates()

    nearest_genes.to_csv(outfile, sep='\t', index=False)

    return nearest_genes

## compute

In [23]:
# bat closest-gene file key -> human GREAT file key (both in config[section])
peak_gene_dict = {
    "efus_kid_clean_lift.x.gene_scaffold_eptfus1": "EFUS_KID_HU_GENE",
    "efus_pan_clean_lift.x.gene_scaffold_eptfus1": "EFUS_PAN_HU_GENE",
    "ajam_kid_clean_lift.x.gene_scaffold_hlartjam2": "AJAM_KID_HU_GENE",
    "ajam_pan_clean_lift.x.gene_scaffold_hlartjam2": "AJAM_PAN_HU_GENE",

}

# per peak
for bat_, hu_ in peak_gene_dict.items():
    print(hu_)

    # read config for peak-gene files
    # bat peaks (coded as human peaks) + closest TOGA gene ortholog
    BAT = config[section][bat_]
    # human peak (coded as human peaks) + closest GREAT genes
    HU = config[section][hu_]

    # open dataframes
    # bat: column 3 is read as the hg38 coordinate id, column 7 as the gene name
    bat = pd.read_csv(BAT, sep='\t', header=None, usecols=[3, 7],
                      names=["hg38.coor", "gene_name"])

    # human: first row skipped, two columns (coordinate id, gene string)
    hu = pd.read_csv(HU, sep='\t', header=None, skiprows=1,
                     names=["hg38.coor", "hu_nearest_genes"])

    # format human gene pairs, e.g. "SOGA3 (-115), SOGA3 (+75)":
    # gene1 = text before the first "(";
    # gene2 = text between the first "," and the next "(", or None when there
    # is no ",". Surrounding whitespace is removed later in computeSynteny.
    hu["gene1"] = hu["hu_nearest_genes"].apply(lambda x: x.split("(")[0])
    hu["gene2"] = hu["hu_nearest_genes"].apply(lambda x: (
        x.split(",")[1]).split("(")[0] if len(x.split(",")) > 1 else None)

    # make dataframe human nearest genes to peak

    new_col = "hu_nearest_gene"
    in_df = hu.copy()
    column_list = list(in_df.columns[2:])  # gene1, gene2
    hum, hum_agg = makeCoorGeneDf(in_df, column_list, new_col)

    # make dataframe bat nearest genes to peak

    new_col = "bat_nearest_gene"
    in_df = bat.copy()
    column_list = list(in_df.columns[1:])  # gene_name
    batm, batm_agg = makeCoorGeneDf(in_df, column_list, new_col)

    # compute synteny
    # n = no row agrees, m = some rows agree, y = every row agrees
    n, m, y, merged = computeSynteny(hum, batm)

    out = os.path.join(os.path.split(
        BAT)[0], "NEAREST_MATCHED." + hu_ + "_BAT_GENE.tsv")
    print(out)

    # write the elements that match.
    # Peaks in y match as well (all of their rows agree), so they are written
    # together with m; passing m alone left them out of the matched file.
    getBatGenes(m.union(y), merged, out)

    if len(y) > 0:
        print(y)

    # compute synteny (both upstream and downstream genes. 
    # DEMOTE - may be complicated by order of agg genes?)
    # n_, m_, y_, merged_ = computeSynteny(hum_agg, batm_agg)

EFUS_KID_HU_GENE
(283282, 2) (141641, 2)



(142879, 2) (139822, 2)



0/2 nearest gene is the same, n= 56883
1/2 nearest gene is the same, n= 82939
2/2 nearest gene is the same, n= 0



/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/NEAREST_MATCHED.EFUS_KID_HU_GENE_BAT_GENE.tsv
EFUS_PAN_HU_GENE
(222772, 2) (111386, 2)



(112094, 2) (109751, 2)



0/2 nearest gene is the same, n= 45397
1/2 nearest gene is the same, n= 64354
2/2 nearest gene is the same, n= 0



/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/NEAREST_MATCHED.EFUS_PAN_HU_GENE_BAT_GENE.tsv
AJAM_KID_HU_GENE
(249682, 2) (124841, 2)



(116314, 2) (113760, 2)



0/2 nearest gene is the same, n= 41704
1/2 nearest gene is the same, n= 72055
2/2 nearest gene is the same, n= 1



/wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data/peaks/hg38_peaks/NEAREST_MATCHED.AJAM_KID_HU_GENE_BAT_GENE.tsv
{'chr6:127518941-127519369'}
AJAM_PAN_HU_GENE
(368694, 2) (184347, 2)


In [24]:
# preview the merged human/bat table left over from the last loop iteration
merged.head()

Unnamed: 0,hg38.coor,hu_nearest_gene,bat_nearest_gene,agree
0,chr1:909993-910631,OR4F16,SAMD11,False
1,chr1:909993-910631,SAMD11,SAMD11,True
2,chr1:911149-911448,OR4F16,SAMD11,False
3,chr1:911149-911448,SAMD11,SAMD11,True
4,chr1:912914-913272,OR4F16,SAMD11,False


## test one hg

In [25]:
# in bat
# rows of the last-loaded bat table whose peak id is in m (peaks with both
# agreeing and non-agreeing rows)
bat.loc[bat["hg38.coor"].isin(m)]

Unnamed: 0,hg38.coor,gene_name
14,chr18:9121615-9121728,NDUFV2
15,chr18:9121615-9121728,NDUFV2
16,chr18:9090881-9091277,NDUFV2
17,chr18:9082660-9083600,NDUFV2
18,chr18:9073679-9074126,NDUFV2
...,...,...
210114,chr3:173396702-173396954,NLGN1
210115,chr3:173584161-173584470,NLGN1
210116,chr3:173584862-173585809,NLGN1
210117,chr3:173622121-173623020,NLGN1


In [26]:
# Print every table's rows for a single peak coordinate.
COOR = "chr6:127518712-127519520"

# bat genes (long format), followed by blank lines
print(batm.loc[batm["hg38.coor"] == COOR], '\n\n')

# human genes (long format)
print(hum.loc[hum["hg38.coor"] == COOR])

# human table with the raw GREAT gene string and the parsed gene1/gene2
print(hu.loc[hu["hg38.coor"] == COOR])

# bat table as read from the closest-gene file
print(bat.loc[bat["hg38.coor"] == COOR])

                      hg38.coor bat_nearest_gene
25728  chr6:127518712-127519520            SOGA3 


                       hg38.coor hu_nearest_gene
154383  chr6:127518712-127519520          SOGA3 
338743  chr6:127518712-127519520          SOGA3 
                       hg38.coor           hu_nearest_genes   gene1    gene2
154383  chr6:127518712-127519520  SOGA3 (-115), SOGA3 (+75)  SOGA3    SOGA3 
                      hg38.coor gene_name
25728  chr6:127518712-127519520     SOGA3
