# overview
20230609

SarahFong

R2.3 comment - The homology and functional consistency of cross-species genes need to be supported by sufficient data, especially the homology and functional consistency of protein-coding genes in the collinear regions of human and bat genomes needs to be proved.

##### My plan - check for orthology of genes between humans and bats using pre-computed TOGA predictions from Hiller lab and Zoonomia consortium. 

    Inputs
        - Wei's data: gene names from hg38. Not sure why no transcript information is provided. 
        - TOGA predictions - see README for full info - https://genome.senckenberg.de//download/TOGA/README.txt
            ./orthologsClassification.tsv  # classification of one:one, one:many, many:many orthologs predicted from TOGA using hg38 reference gene annotations and chain files between hg38 and Eptesicus_fuscus__big_brown_bat__eptFus1
            ./loss_summ_data.tsv  # classification of Intact, Partially Intact, Uncertain loss, Missing, Partially missing. 
            ./geneAnnotation.bed.gz # annotation locations
        
        ## HUMAN AND MOUSE REFERENCE
## Here, I set up my config with the local directories and the TOGA HTTP addresses, then download the TOGA classifications. 
        

In [1]:
import config_readwrite as crw
import os, sys

# config

In [3]:
# The config file is looked up in the current working directory.
cfn_file = os.path.join(os.getcwd(), "config.bats.ini")

# crw.read returns two values: `config` is indexed by section name in later
# cells, and `cfn` is what gets handed back to crw.write at the end.
# NOTE(review): crw is a project module; confirm what it does when the file is missing.
config, cfn = crw.read(cfn_file)

In [2]:
def writeConfigDict(in_dict, config, section):
    """
    write dictionary key and values to config section

    input
        in_dict (dict) - dictionary of key value pairs to write
        config (config file) - mapping of section name -> mapping of key/value
            pairs; it is modified in place
        section (str) - name of section

    method
        1. create the section if it is missing and there is something to write
        2. parse key value pairs, write to config

    return
        config - the same object that was passed in
    """

    # Writing into a section that was never created would raise KeyError.
    # Skipped for an empty in_dict so that a no-op call leaves config untouched.
    if in_dict and section not in config:
        config[section] = {}

    for key, value in in_dict.items():
        config[section][key] = value

    return config

# set up local dir

In [4]:
# set up config
section = "local"
# NOTE(review): crw.check presumably makes sure the section exists - confirm in config_readwrite
crw.check(config, section)

# instantiate paths: results sits next to data/, everything else is nested under data/
PATH = os.path.join(os.getcwd(), "data")
RE = os.path.join(os.getcwd(), "results")
PATH_TOGA = os.path.join(PATH, "toga")
PATH_SUPP = os.path.join(PATH, "supp")
PATH_PEAK = os.path.join(PATH, "peaks")
PATH_CHAIN = os.path.join(PATH, "chain")
PATH_SCAF = os.path.join(PATH, "scaffold_annot")

# prepare paths for config (key order is the order they are handed to the config)
write_config = dict(
    results=RE,
    PATH=PATH,
    PATH_TOGA=PATH_TOGA,
    PATH_SUPP=PATH_SUPP,
    PATH_PEAK=PATH_PEAK,
    PATH_CHAIN=PATH_CHAIN,
    PATH_SCAF=PATH_SCAF,
)

# write paths to config
config = crw.writeConfigDict(write_config, config, section)

In [5]:
# make the data and results directories if they are not there yet

CREATE_LIST = [PATH, RE]
for dir_ in CREATE_LIST:

    # nothing to do when the directory already exists
    if os.path.exists(dir_):
        print("already made", dir_)
        continue

    print("making dir", dir_)
    os.mkdir(dir_)

already made /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/data
already made /wynton/home/ahituv/fongsl/other_analyses/for-wei_bats/results


# downloads

## function - wget download to local

In [6]:
def downloadHttp(http_address, species_ref, path):
    """
    download one remote file with wget and store it under a species/reference-tagged name

    input
        http_address (str) - full address of the remote file. The species tag is
            the text after the last "__" and before the next "/".
        species_ref (str) - reference label inserted into the local file name
        path (str) - local directory to store the file in (created if missing)

    method
        1. parse species and remote file name from the address
        2. build the local name: <species>.<species_ref>-ref.<remote file name>
        3. if the local file is absent, wget into a temporary ".part" file and
           rename it into place only when wget succeeded

    return
        local_file (str) - full path of the local copy (absent if the download failed)
        species (str) - species tag parsed from the address
        handle (str) - remote file name
    """
    import subprocess  # function-scope import: only this function needs it

    species = (http_address.split("__")[-1]).split("/")[0]  # get the species.

    # URLs always use "/", so split on it directly instead of os.path.split
    handle = http_address.rsplit("/", 1)[-1]  # get the http file handle

    # get the local file handle with ref species annotated
    local_handle = species + "." + species_ref + "-ref." + handle

    # file to write (in data file)
    local_file = os.path.join(path, local_handle)

    if os.path.exists(local_file) is False:
        print("download", local_handle)

        # the target directory must exist before wget can write into it
        if path:
            os.makedirs(path, exist_ok=True)

        # Download to a side file first so that an interrupted or failed
        # transfer is never mistaken for a finished download on the next run.
        partial_file = local_file + ".part"

        try:
            # Argument list (no shell): the address is passed to wget verbatim.
            # NOTE(review): --no-check-certificate disables TLS verification;
            # kept because the original command used it.
            completed = subprocess.run(
                ["wget", http_address, "--no-check-certificate",
                 "-O", partial_file]
            )
            succeeded = completed.returncode == 0
        except OSError as err:
            # e.g. wget is not installed; report instead of crashing the run
            print("could not run wget:", err)
            succeeded = False

        if succeeded:
            os.replace(partial_file, local_file)
        else:
            print("download failed", http_address)
            if os.path.exists(partial_file):
                os.remove(partial_file)

    else:
        print('already downloaded', local_handle)

    return local_file, species, handle

## download per reference

In [7]:
# reference assembly label -> name of the reference directory on the TOGA server
REFS = {"mm10": "mouse_mm10_reference",
        "hg38": "human_hg38_reference"}

for SPECIES, REF in REFS.items():
    print(REF)

    # write references to config
    section = f"TOGA-{REF}"
    crw.check(config, section)

    # Remote addresses are URLs, so they are assembled with "/" explicitly
    # rather than os.path.join, whose separator depends on the platform.
    # HTTP already ends with "/".
    HTTP = f"https://genome.senckenberg.de//download/TOGA/{REF}/Chiroptera/"

    # eptFus1
    EPTFUS1_DIR = "Eptesicus_fuscus__big_brown_bat__eptFus1"
    EPTFUS1_CLASS = f"{HTTP}{EPTFUS1_DIR}/orthologsClassification.tsv.gz"
    EPTFUS1_PRED = f"{HTTP}{EPTFUS1_DIR}/loss_summ_data.tsv.gz"
    EPTFUS1_GENE_PRED = f"{HTTP}{EPTFUS1_DIR}/geneAnnotation.bed.gz"

    # artJam1
    ARTJAM1_DIR = "Artibeus_jamaicensis__Jamaican_fruit-eating_bat__HLartJam1"
    ARTJAM1_CLASS = f"{HTTP}{ARTJAM1_DIR}/orthologsClassification.tsv.gz"
    ARTJAM1_PRED = f"{HTTP}{ARTJAM1_DIR}/loss_summ_data.tsv.gz"

    # artJam2
    ARTJAM2_DIR = "Artibeus_jamaicensis__Jamaican_fruit-eating_bat__HLartJam2"
    ARTJAM2_CLASS = f"{HTTP}{ARTJAM2_DIR}/orthologsClassification.tsv.gz"
    ARTJAM2_PRED = f"{HTTP}{ARTJAM2_DIR}/loss_summ_data.tsv.gz"
    ARTJAM2_GENE_PRED = f"{HTTP}{ARTJAM2_DIR}/geneAnnotation.bed.gz"

    # addresses to record in the config
    http_addresses = {"http": HTTP,
                      "EPTFUS1_DIR": EPTFUS1_DIR,
                      "EPTFUS1_CLASS": EPTFUS1_CLASS,
                      "EPTFUS1_PRED": EPTFUS1_PRED,
                      "EPTFUS1_GENE_PRED": EPTFUS1_GENE_PRED,

                      "ARTJAM1_DIR": ARTJAM1_DIR,
                      "ARTJAM1_CLASS": ARTJAM1_CLASS,
                      "ARTJAM1_PRED": ARTJAM1_PRED,

                      "ARTJAM2_DIR": ARTJAM2_DIR,
                      "ARTJAM2_CLASS": ARTJAM2_CLASS,
                      "ARTJAM2_PRED": ARTJAM2_PRED,
                      "ARTJAM2_GENE_PRED": ARTJAM2_GENE_PRED,
                      }

    # write these values to the config
    config = crw.writeConfigDict(http_addresses, config, section)

    # make local toga section
    section = f"local_toga-{REF}"
    crw.check(config, section)
    config[section]["path"] = PATH_TOGA

    # download TOGA data
    # (the artJam1 addresses are recorded in the config above but are not downloaded)

    DL_LIST = [EPTFUS1_CLASS, EPTFUS1_PRED, EPTFUS1_GENE_PRED,
               ARTJAM2_CLASS, ARTJAM2_PRED, ARTJAM2_GENE_PRED
               ]

    local_toga = {}
    for http_address in DL_LIST:

        # download http; SPECIES is the reference label used in the local file name
        local_file, species, handle = downloadHttp(http_address, SPECIES, PATH_TOGA)

        # add to dictionary, keyed by <species>.<remote file name>
        local_toga[f"{species}.{handle}"] = local_file

    # write to config
    config = crw.writeConfigDict(local_toga, config, section)

mouse_mm10_reference
already downloaded eptFus1.mm10-ref.orthologsClassification.tsv.gz
already downloaded eptFus1.mm10-ref.loss_summ_data.tsv.gz
already downloaded eptFus1.mm10-ref.geneAnnotation.bed.gz
already downloaded HLartJam2.mm10-ref.orthologsClassification.tsv.gz
already downloaded HLartJam2.mm10-ref.loss_summ_data.tsv.gz
already downloaded HLartJam2.mm10-ref.geneAnnotation.bed.gz
human_hg38_reference
already downloaded eptFus1.hg38-ref.orthologsClassification.tsv.gz
already downloaded eptFus1.hg38-ref.loss_summ_data.tsv.gz
already downloaded eptFus1.hg38-ref.geneAnnotation.bed.gz
already downloaded HLartJam2.hg38-ref.orthologsClassification.tsv.gz
already downloaded HLartJam2.hg38-ref.loss_summ_data.tsv.gz
already downloaded HLartJam2.hg38-ref.geneAnnotation.bed.gz


# write wei supp

In [8]:
section = "wei_supp"

crw.check(config, section)

# supplemental workbook and the per-table csv files, all under PATH_SUPP
SUPP = os.path.join(PATH_SUPP, "Gordon_supp.xlsx")
SUPP_TABLE2 = os.path.join(PATH_SUPP, "supp.table2.csv")
SUPP_TABLE11 = os.path.join(PATH_SUPP, "supp.table11.csv")
SUPP_TABLE5 = os.path.join(PATH_SUPP, "supp.table5.csv")

# config key -> local path, assigned one at a time in this order
for _key, _value in (
    ("path", PATH_SUPP),
    ("supp", SUPP),
    ("fast_fed_deg", SUPP_TABLE2),
    ("panc_deg", SUPP_TABLE11),
    ("kidn_deg", SUPP_TABLE5),
):
    config[section][_key] = _value

# write to config

crw.write(config, cfn)