# Parsing UniProt taxonomic division data for host information

20221115 - sarahfong


## This script does the following:
1. download uniprot taxonomic division data (.dat.gz) to current working directory
2. unzip .dat.gz file
3. parse .dat files for protein id, OH field (hosts)
4. store protein_ids, host info as dictionaries
5. write dictionary to .txt
6. rezip .dat file.


## web resources

### UNIPROT README
  https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/README

### user manual and annotation guide

https://web.expasy.org/docs/userman.html

# set up run 

In [1]:
## import modules 
import os, sys

## Functions

In [2]:
def download_uniprot(handle):
    """
    # returns local download from uniprot

    # checks whether handle has already been downloaded before running wget.

    input
        handle name (str), with or without the trailing ".gz"

    output
        absolute path (str) to local file name (current working directory + handle)

    method
        1. check that handle name ends in ".gz"; append it if not
        2. build the download url for the handle
        3. check that file has not already been downloaded
            if not downloaded - download file with wget
            if downloaded - skip
        4. return absolute path to local download

    notes
        A failed download is reported with a printed message; the path is
        still returned, so the caller should expect the file may be missing.
    """
    # local import: only this function needs subprocess
    import subprocess

    #1 test the suffix, not a substring (".gz" may appear mid-name)
    if not handle.endswith(".gz"):
        handle = handle + ".gz"
        print("updating handle name", handle)

    #2 urls always use "/", so concatenate instead of os.path.join
    http = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/"
    http_handle = http + handle

    local_file = os.path.join(os.getcwd(), handle)  # name for local handle

    #3 check if local handle exists before downloading.
    if os.path.exists(local_file):
        print("already downloaded locally", local_file)

    else:
        print("downloading from ftp", http_handle)

        # argument list (no shell) so the url is never re-parsed by a shell
        try:
            status = subprocess.run(["wget", http_handle]).returncode
        except OSError:
            status = 127  # wget could not be started (e.g. not installed)

        if status != 0:
            print("download failed, wget exit status", status, http_handle)

    #4
    return local_file
    

In [3]:
def rezip_file(unzipped_handle):
    """
    # if file is unzipped, rezip it

    input
        unzipped_handle (str) - file name, resolved against the current
        working directory

    output
        path (str) of the zipped file (unzipped path + ".gz"), returned
        whether or not any compression happened

    method
        1. build the unzipped and zipped paths
        2. if the unzipped file exists, gzip it and delete the unzipped copy
           otherwise print a reminder and do nothing
        3. return the zipped path
    """
    # local imports: only this function needs them
    import gzip
    import shutil

    #1
    unzipped_file_name = os.path.join(os.getcwd(), unzipped_handle)
    zipped_file = unzipped_file_name + ".gz"

    #2 compress in python (no shell), so paths with spaces work
    if os.path.isfile(unzipped_file_name):  # only rezip if unzipped
        with open(unzipped_file_name, "rb") as source, gzip.open(zipped_file, "wb") as target:
            shutil.copyfileobj(source, target)  # streams; file is not held in memory

        os.remove(unzipped_file_name)  # like the gzip command: keep only the .gz

    else:
        print("zipped already. Remember to unzip next time")

    #3
    return zipped_file

In [4]:
def unzip_file(zipped_file_name):
    """
    # if file is zipped, unzip it

    input
        zipped_file_name (str) - path to a ".gz" file

    output
        path (str) of the unzipped file (input path without the trailing
        ".gz"), returned whether or not any decompression happened

    method
        1. derive the unzipped path by removing the ".gz" suffix
        2. if the zipped file exists, decompress it and delete the zipped copy
           otherwise print a reminder and do nothing
        3. return the unzipped path
    """
    # local imports: only this function needs them
    import gzip
    import shutil

    #1 remove the suffix only; str.strip(".gz") would eat any leading or
    #  trailing '.', 'g', 'z' characters (e.g. "zebra.dat.gz" -> "ebra.dat")
    suffix = ".gz"
    if zipped_file_name.endswith(suffix):
        unzipped_file = zipped_file_name[:-len(suffix)]
    else:
        unzipped_file = zipped_file_name

    #2 decompress in python (no shell), so paths with spaces work
    has_gz_suffix = unzipped_file != zipped_file_name
    if has_gz_suffix and os.path.isfile(zipped_file_name):
        with gzip.open(zipped_file_name, "rb") as source, open(unzipped_file, "wb") as target:
            shutil.copyfileobj(source, target)  # streams; file is not held in memory

        os.remove(zipped_file_name)  # like gunzip: keep only the unzipped file

    else:
        print("unzipped already. Remember to rezip")

    #3
    return unzipped_file
    

In [5]:
def append_dict_val_list(dict_, key, val):
    """
    # adds val to the list stored under key, and returns the same dictionary

    input
        dict_ (dict) - maps key -> list of values; modified in place
        key (str)
        val (str)

    output
        dict_ (dict) - the same object that was passed in, where
        (1) a new key now maps to [val], or
        (2) an existing key has val appended, unless val was already listed

    method
        1. fetch the value list for key, creating an empty one for a new key
        2. append val only when it is not already in the list
        3. return the dictionary
    """

    #1 setdefault stores and returns a fresh list the first time key is seen
    values = dict_.setdefault(key, [])

    #2 keep each value list free of duplicates
    if val not in values:
        values.append(val)

    #3
    return dict_
        

In [6]:
def write_dict_file(dict_, filename):
    """
    # writes dictionary to a text file, one row per key.

    input
        dict_ (dict) - maps key -> list of strings
        filename (str) - output path; overwritten if it exists

    output
        none. Each row is "<key><tab><comma-joined values><newline>".
    """

    rows = (
        "{}\t{}\n".format(key, ",".join(values))
        for key, values in dict_.items()
    )

    with open(filename, 'w') as out:
        out.writelines(rows)

## Variables - Hard-coded 

In [7]:
# Options for downloads
#
# possible taxa (NAME):
#     "archaea", "fungi", "human", "bacteria",
#     "mammals", "plants", "rodents", "vertebrates",
#     "invertebrates", "viruses"
#
# possible db (DB):
#     "sprot", "trembl"

DB = "sprot"
NAME = "viruses"

# file name of the taxonomic division to fetch, without the ".gz" suffix
HANDLE = "uniprot_{}_{}.dat".format(DB, NAME)

# Line code of the rows carrying host info for each protein id.
# If an entry has no such row, its ID will not be written.
LINE_KEY = "OH"

# Main 

In [8]:
"""
writes 3 files with host, protein_id, and taxon_id information

method
    1. ftp download the handle (if not already downloaded)
    2. unzip the download
    3. parse the unzipped file for ID, LINE_KEY info
    4. make a bunch of dictionaries to collect data
        species_id = dict[species_taxid]:[protein_id1, protein_id2, etc.]
        id_species = dict[protein_id]:[species_taxid1, species_taxid2, etc.]
        taxid_name = dict[species_taxid]:[species name]

        species_taxid is stored as it appears in the file: the last
        whitespace-separated token before the first ";" on the LINE_KEY row

    5. For each protein ID,
        if LINE_KEY info - add information to dictionaries
        if no LINE_KEY info - ID is not added to dictionary

    6. write dictionaries as text files
    7. rezip local downloaded file

"""

#1 ftp download the .dat file locally
ZIPPED_DL = download_uniprot(HANDLE)

#2 unzip the file
FILE = unzip_file(ZIPPED_DL)

#4 dictionaries for collecting key, values
species_id, id_species, taxid_name = {}, {}, {}

#3
with open(FILE, "r") as file:

    # iterate the file lazily; readlines() would load every line into memory
    for line in file:

        line_id = line.split(" ")[0]  # get the first index identifier (str)

        #5 parse and add data to dictionary
        if line_id == "ID":
            # second whitespace-separated token is the entry name.
            # Will update w/ each instance of ID row.
            ID = line.split()[1]

        elif line_id == LINE_KEY:  # get the host row

            fields = line.split(";")

            # last token before the first ";" is the taxid field; this does not
            # depend on the literal line code, so it follows LINE_KEY
            SPECIES_ID = fields[0].split()[-1]

            # strip the leading space and the trailing newline, which would
            # otherwise split each row of the taxid_name output file in two
            SPECIES_NAME = fields[1].strip()

            # add to the dictionaries
            species_id = append_dict_val_list(species_id, SPECIES_ID, ID)
            id_species = append_dict_val_list(id_species, ID, SPECIES_ID)
            taxid_name = append_dict_val_list(taxid_name, SPECIES_ID, SPECIES_NAME)

#6 write dictionaries to text files
write_dict_file(species_id, os.path.join(os.getcwd(), f"{HANDLE}.species-id.txt"))
write_dict_file(id_species, os.path.join(os.getcwd(), f"{HANDLE}.id-species.txt"))
write_dict_file(taxid_name, os.path.join(os.getcwd(), f"{HANDLE}.taxid_name.txt"))

#7 rezip the download file
rezip_file(HANDLE)

updating handle name uniprot_sprot_viruses.dat.gz


--2022-11-15 18:11:16--  https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_viruses.dat.gz
Resolving prox1 (prox1)... 172.26.1.6
Connecting to prox1 (prox1)|172.26.1.6|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 17359378 (17M) [application/x-gzip]
Saving to: ‘uniprot_sprot_viruses.dat.gz’

     0K .......... .......... .......... .......... ..........  0%  355K 48s
    50K .......... .......... .......... .......... ..........  0%  720K 35s
   100K .......... .......... .......... .......... ..........  0%  122M 24s
   150K .......... .......... .......... .......... ..........  1%  146M 18s
   200K .......... .......... .......... .......... ..........  1%  723K 19s
   250K .......... .......... .......... .......... ..........  1% 92.6M 16s
   300K .......... .......... .......... .......... ..........  2%  169M 13s
   350K .......... .......... .......... .......... ..........  2%  716K 15s
  

  5700K .......... .......... .......... .......... .......... 33% 1.87M 4s
  5750K .......... .......... .......... .......... .......... 34%  101M 3s
  5800K .......... .......... .......... .......... .......... 34% 59.6M 3s
  5850K .......... .......... .......... .......... .......... 34% 44.0M 3s
  5900K .......... .......... .......... .......... .......... 35%  759K 3s
  5950K .......... .......... .......... .......... .......... 35% 28.3M 3s
  6000K .......... .......... .......... .......... .......... 35% 70.0M 3s
  6050K .......... .......... .......... .......... .......... 35% 66.5M 3s
  6100K .......... .......... .......... .......... .......... 36% 14.8M 3s
  6150K .......... .......... .......... .......... .......... 36%  788K 3s
  6200K .......... .......... .......... .......... .......... 36% 31.8M 3s
  6250K .......... .......... .......... .......... .......... 37% 64.7M 3s
  6300K .......... .......... .......... .......... .......... 37% 64.9M 3s
  6350K ....

 11450K .......... .......... .......... .......... .......... 67%  758K 2s
 11500K .......... .......... .......... .......... .......... 68% 26.8M 2s
 11550K .......... .......... .......... .......... .......... 68%  141M 2s
 11600K .......... .......... .......... .......... .......... 68%  111M 2s
 11650K .......... .......... .......... .......... .......... 69%  194M 2s
 11700K .......... .......... .......... .......... .......... 69%  748K 2s
 11750K .......... .......... .......... .......... .......... 69% 31.5M 2s
 11800K .......... .......... .......... .......... .......... 69% 74.0M 2s
 11850K .......... .......... .......... .......... .......... 70%  126M 2s
 11900K .......... .......... .......... .......... .......... 70%  167M 1s
 11950K .......... .......... .......... .......... .......... 70%  751K 1s
 12000K .......... .......... .......... .......... .......... 71% 29.6M 1s
 12050K .......... .......... .......... .......... .......... 71%  131M 1s
 12100K ....

downloading from ftp https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_viruses.dat.gz


'/wynton/home/ahituv/fongsl/tools/db_parsing/uniprot_sprot_viruses.dat.gz'