In [15]:
# importing the requests library 
import requests 
import pandas as pd
from random import sample 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns                                                             
import sys 
from multiprocessing.pool import Pool
import multiprocessing 

# Base URLs of the PhagesDB REST API endpoints, keyed by resource name.
# The name uses an underscore because a hyphen is not valid in a Python
# identifier (``API-Options = ...`` is a SyntaxError).
API_OPTIONS = {
    "phages": "https://phagesdb.org/api/phages/",
    "clusters": "https://phagesdb.org/api/clusters/",
    "subclusters": "https://phagesdb.org/api/subclusters/",
    "institutions": "https://phagesdb.org/api/institutions/",
    "host_strains": "https://phagesdb.org/api/host_strains/",
    "host_species": "https://phagesdb.org/api/host_species/",
    "host_genera": "https://phagesdb.org/api/host_genera/",
    "publications": "https://phagesdb.org/api/publications/",
    "genes": "https://phagesdb.org/api/genes/",
    "pham_phages": "https://phagesdb.org/api/pham_phages/",
}

Download all the genes from a specific page number. Fetching one page per call lets the downloads be parallelized by page number.

In [16]:
def download_all_genes(page_number):
    ''' Download one page of gene records from https://phagesdb.org/api/genes/.

    The number of genes per page is read from the module-level ``page_size``
    variable, which must be set before this function is called.

    Parameters
    ----------
    page_number : int
        Value sent as the ``page`` query parameter.

    Returns
    -------
    list of list
        One ``[GeneID, first pham, lower-cased Notes, translation, Orientation]``
        entry per gene in the page's ``results``. The pham is ``None`` when a
        gene has no phams, and the notes are ``None`` when the API returns
        a null note.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    '''
    complete_url = "https://phagesdb.org/api/genes/?page=" + str(page_number) + "&page_size=" + str(page_size)
    # A timeout keeps a stalled connection from blocking a pool worker forever.
    http_response = requests.get(url = complete_url, timeout = 120)
    # Fail with a clear HTTP error instead of a KeyError on "results" below.
    http_response.raise_for_status()
    response = http_response.json()
    list_of_genes = []
    for gene in response["results"]:
        phams = gene["phams"]
        notes = gene["Notes"]
        # gene["PhageID"]["HostStrain"] and gene["PhageID"]["Cluster"] are
        # deliberately not collected here.
        list_of_genes.append([gene["GeneID"],
                              # Guard against an empty pham list.
                              phams[0] if phams else None,
                              # Keep None as None so downstream cleaning can
                              # tell a missing note from an empty one.
                              notes.lower() if notes is not None else None,
                              gene["translation"],
                              gene["Orientation"]])

    return list_of_genes

In [17]:
pages = "all" # Can download all of genes by using pages = "all"
# Genes requested per page. Use page_size = "optimal" to spread the genes
# evenly over one page per CPU. (No `global` statement is needed: a plain
# assignment in a cell already creates a module-level name.)
page_size = 100

if page_size == "optimal":
    # A one-gene page is enough to read the total gene count.
    complete_url = "https://phagesdb.org/api/genes/?page=1&page_size=1"
    count = int(requests.get(url = complete_url, timeout = 120).json()["count"])
    # Round up so the genes fit in at most one page per CPU, and never go
    # below 1 so a small count cannot produce a page size of zero.
    page_size = max(1, int(np.ceil(count/multiprocessing.cpu_count())))

if pages == "all":
    complete_url = "https://phagesdb.org/api/genes/?page=1&page_size=" + str(page_size)
    count = int(requests.get(url = complete_url, timeout = 120).json()["count"])
    pages = int(np.ceil(count/page_size))

# Download the pages in parallel, one task per page number (pages start at 1).
with Pool(multiprocessing.cpu_count()) as p:
    genes = p.map(download_all_genes, list(range(1,pages+1)))

# Flatten the per-page lists into one list of gene records.
combined_genes = [gene for genes_groups in genes for gene in genes_groups]

print("Finished download! Found ", len(combined_genes))

Finished download! Found  348734


In [18]:
# Peek at the first downloaded gene record.
combined_genes[0]

['20ES_CDS_1',
 '36676',
 '',
 'MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPENLAAWNMPRHREYAAHTGGALHFFLDDYRFETVWSSPERLLDRVKAVGAALTPDFSLWKDMPRAAQVWNTYRSRWCGAYWQSEGIEVIPTVGWGTPDTYDFCFDGLPTGGNVAISCLTLRAKQEDRELFTRGVQELVWRTQPKTLLVYGRLRFCEDIDLPEVREYPTYWDRRRKRLEEQWESAGAAVEAVEPPAPRPETKEPQLQAVDLD',
 'F']

In [19]:
# Split each GeneID (element 0) on "_" and append its first piece (phage name)
# and its last piece (gene number) to the end of that gene's record.
for gene_record in combined_genes:
    id_parts = gene_record[0].split("_")
    phage_name, gene_num = id_parts[0], id_parts[-1]
    gene_record.extend((phage_name, gene_num))
    

In [20]:
# Let's check out one of the genes collected.
combined_genes[0]

['20ES_CDS_1',
 '36676',
 '',
 'MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPENLAAWNMPRHREYAAHTGGALHFFLDDYRFETVWSSPERLLDRVKAVGAALTPDFSLWKDMPRAAQVWNTYRSRWCGAYWQSEGIEVIPTVGWGTPDTYDFCFDGLPTGGNVAISCLTLRAKQEDRELFTRGVQELVWRTQPKTLLVYGRLRFCEDIDLPEVREYPTYWDRRRKRLEEQWESAGAAVEAVEPPAPRPETKEPQLQAVDLD',
 'F',
 '20ES',
 '1']

In [21]:
# Clean Function Names
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load this table
# from a trusted source.
# The context manager closes the file handle once the table is loaded.
with open("data/new_conversion_table.pkl", "rb") as a_file:
    conversion_table = pickle.load(a_file)

df_approved_functions = pd.read_csv("data/Approved_Functions.csv")
df_approved_functions = df_approved_functions.dropna(subset=["Approved Function"])

# Lower-cased approved names, for comparison with the gene notes.
approved_functions = [name.lower() for name in df_approved_functions["Approved Function"]]
# Set copy for constant-time membership tests (one lookup per gene below).
approved_function_set = set(approved_functions)

# NOTE: this is a shallow copy; the per-gene lists are shared with
# combined_genes, so the in-place edits below are visible through `copy` too.
copy = combined_genes.copy()
# NOTE: this loop mutates combined_genes in place and appends one element per
# gene, so re-running the cell appends again.
for i in combined_genes:
    function = i[2]
    # Keep the original annotation as a new trailing element.
    i.append(function)
    if function in approved_function_set:
        continue
    # Anything not approved becomes the table's replacement, or "NKF" when the
    # function is missing from the table, mapped to -1, or None.
    replacement = conversion_table.get(function, -1)
    i[2] = "NKF" if replacement == -1 else replacement

print(combined_genes[:4])

[['20ES_CDS_1', '36676', 'NKF', 'MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPENLAAWNMPRHREYAAHTGGALHFFLDDYRFETVWSSPERLLDRVKAVGAALTPDFSLWKDMPRAAQVWNTYRSRWCGAYWQSEGIEVIPTVGWGTPDTYDFCFDGLPTGGNVAISCLTLRAKQEDRELFTRGVQELVWRTQPKTLLVYGRLRFCEDIDLPEVREYPTYWDRRRKRLEEQWESAGAAVEAVEPPAPRPETKEPQLQAVDLD', 'F', '20ES', '1', ''], ['20ES_CDS_10', '39578', 'lysin b', 'MSLQVGSSGELVNRWIRVMKARFASYAGKLKEDGYFGLDDKAVQQEYETRTHQTPDGIVTDGDLAYLLPRKPWLFTVHGTGMPDPLGPGLPADVARDVLDIYNWQPIGNYPAAAFPMKPSYDKAIAELVLQIDQKLAGNNDEFSMAGYSQGAIAVAYVLKHEILDPKGRLHKYVRRLKKVVMWGNPMRQKGFAHFDEWIHPVAAPDTMGILEDRLENLEWAMQEYGFEVRDYAHDGDMYASIKEDDMHEYEVAIGRIVMTVKGFYGGKDSVVAQLGELAGHPLRESIAMARAIIDAISFLAKSTQGEKWPHLYNRYPAVAFLRQP', 'F', '20ES', '10', 'lysin b'], ['20ES_CDS_11', '34196', 'terminase', 'MSLENHHPELAPSPPHIIGPSWQRTVDGSWHLPDPKMTLGWGVLKWLSDYVNTPGGHDDPARLKFLIELSEAGLLENENMFIPTDEQVRLVLWWYAVDEKGQYVYREGVIRRLKGWGKDPFTAALCLAELCGPVAFSHFDRETGQAIGKRRPAPWVTVAAVSQDQTKNTFSLFPVMISKKLKAEFKLEVNRFIIYAEGGGRIEAATSSPASMEGNRPTFVVQNETQWWGQGPDGKVNEGHSMAETIEGNMT

Create dataframe from data collected above

In [22]:
# Column labels, listed in the same order as the elements of each gene record.
gene_columns = [
    'gene ID',
    'pham',
    'function',
    'translation',
    'orientation',
    'phage',
    'gene number',
    'uncleaned function',
]
df_genes = pd.DataFrame(combined_genes, columns=gene_columns)
df_genes

Unnamed: 0,gene ID,pham,function,translation,orientation,phage,gene number,uncleaned function
0,20ES_CDS_1,36676,NKF,MYGTRSSAFWASQPGKFDVLNLRMTFPSTSAHEIPDLTATDFVPEN...,F,20ES,1,
1,20ES_CDS_10,39578,lysin b,MSLQVGSSGELVNRWIRVMKARFASYAGKLKEDGYFGLDDKAVQQE...,F,20ES,10,lysin b
2,20ES_CDS_11,34196,terminase,MSLENHHPELAPSPPHIIGPSWQRTVDGSWHLPDPKMTLGWGVLKW...,F,20ES,11,terminase
3,20ES_CDS_12,39511,portal protein,MTAPLPGQEEIPDPAIARDEMISAFDDAVKNLKINTSYYEAERRPE...,F,20ES,12,portal protein
4,20ES_CDS_13,21454,capsid maturation protease,MITAAVAAYVQRFASMFTGPALSLGEWARFLQTLFPEVQRRYAQAA...,F,20ES,13,capsid maturation protease
...,...,...,...,...,...,...,...,...
348729,ZygoTaiga_CDS_95,39534,NKF,MSVCANPECGKEFEQPNKYRTTKTCSKECRYAVSASTTKASSGRWE...,F,ZygoTaiga,95,zinc-finger dna binding domain
348730,ZygoTaiga_CDS_96,2891,NKF,MSTYTYPASPAQIRFINTLLAERDVPQASRDYVAHLLDTGISSKRA...,F,ZygoTaiga,96,
348731,ZygoTaiga_CDS_97,38955,NKF,MTLPGPPNTPPGADSPAWPPVVSANGYVPPESIPTRTTVTSVEGRS...,F,ZygoTaiga,97,
348732,ZygoTaiga_CDS_98,22971,NKF,MWIDDFQGEKVNIEDADEVSNEKLVEMEREYFSHLIDPHTARYWGK...,F,ZygoTaiga,98,


In [23]:
# Save the gene table as CSV, leaving out the DataFrame index.
df_genes.to_csv("data/cleaned_gene_list.csv",index=False)


In [24]:
# Fetch the PhagesDB record of every distinct phage in the gene table.
all_responses = []
# A single Session reuses the HTTP connection across the per-phage requests.
with requests.Session() as session:
    for phage in df_genes['phage'].unique():
        complete_url = "https://phagesdb.org/api/phages/"+ str(phage)
        # The timeout stops one stalled request from hanging the whole loop.
        # The status code is deliberately not checked: as before, whatever
        # JSON the server returns is appended.
        response = session.get(url = complete_url, timeout = 60).json()
        all_responses.append(response)

In [25]:
# Number of phage responses collected.
print(len(all_responses))

3513


In [26]:
def _nested_field(record, outer_key, inner_key, default=None):
    """Return record[outer_key][inner_key], or `default` when the outer value is absent or None."""
    outer = record.get(outer_key)
    return default if outer is None else outer[inner_key]


# Build one metadata row per phage response.
phage_meta_data = []
for response in all_responses:
    # Responses with fewer than five keys are skipped -- presumably error
    # payloads rather than full phage records (TODO confirm).
    if len(response.keys())<5:
        continue
    phage_meta_data.append([response['phage_name'],
                            # Missing cluster info gives "" for temperate and
                            # None for cluster / subcluster.
                            _nested_field(response, "pcluster", "temperate", default=""),
                            _nested_field(response, "pcluster", "cluster"),
                            _nested_field(response, "psubcluster", "subcluster"),
                            response["morphotype"],
                            # A null isolation host yields None instead of a TypeError.
                            _nested_field(response, "isolation_host", "genus"),
                            _nested_field(response, "isolation_host", "species"),
                            response["genome_length"],
                            response['is_annotated'],
                            response['is_phamerated'],
                            response["gcpercent"]
                           ])

df_phage = pd.DataFrame(phage_meta_data, columns =['phage',
                                                  'temperate',
                                                  'cluster',
                                                  'subcluster',
                                                  'morphotype',
                                                  'host genus',
                                                  'host species',
                                                  'genome length',
                                                  'is annotated',
                                                  'is phamerated', 
                                                  'gcpercent'
                                               ]) 

df_phage.to_csv("data/phage_metadata.csv",index=False)
df_phage.head()

Unnamed: 0,phage,temperate,cluster,subcluster,morphotype,host genus,host species,genome length,is annotated,is phamerated,gcpercent
0,20ES,True,A,A2,SIPHO,Mycobacterium,smegmatis,53124,False,True,63.4
1,244,True,E,,SIPHO,Mycobacterium,smegmatis,74483,True,True,63.4
2,32HC,True,Z,,SIPHO,Mycobacterium,smegmatis,50781,False,True,65.7
3,39HC,False,B,B6,SIPHO,Mycobacterium,smegmatis,71565,False,True,70.0
4,40AC,True,A,A17,SIPHO,Mycobacterium,smegmatis,53396,False,True,63.3


In [None]:
# Number of rows in the phage metadata table.
len(df_phage["phage"])

In [None]:
# Number of phages whose "is annotated" flag is True.
len(df_phage[df_phage["is annotated"]])

In [None]:
# Number of phages whose "is annotated" flag is False.
len(df_phage[df_phage["is annotated"]==False])

In [None]:
# Inspect the raw API response stored at index 229.
all_responses[229]

In [None]:
# `df` is not defined in this notebook (NameError on a fresh kernel);
# df_phage is the table that carries the cluster column.
df_phage["cluster"].unique()

In [None]:
# `df` is not defined in this notebook (NameError on a fresh kernel);
# df_phage is the table that carries the cluster column.
df_phage["cluster"].value_counts()

In [None]:
# `df` is not defined in this notebook (NameError on a fresh kernel);
# NOTE(review): df_phage is assumed to be the intended table -- confirm.
df_phage.describe()

Trying out multiprocessing (a process pool) for data download

In [None]:
# Placeholder for the pool results. No `global` statement is needed here:
# it is a no-op at module (cell) level.
x = []


def download(i):
    """Placeholder worker: return the argument unchanged."""
    return i

# Map the worker over two small lists using one process per CPU; x ends up
# holding one result per input item.
with Pool(multiprocessing.cpu_count()) as p:
    x = p.map(download, ([1,2], [2,3]))
x

In [None]:
# Query the number of processors available.
# (multiprocessing is already imported in the notebook's first cell.)
print(multiprocessing.cpu_count())