In [54]:
###IMPORT AND CLEAN-UP UNIPROT PROTEIN NAMES FOR QUERY
import json
import urllib.parse
import urllib.request

import pandas as pd

#Reads in dataframe with Epitope and UniProt Protein Name
#(tab-separated file without a header row)
df = pd.read_csv(r'Data/unique_names.tsv', header=None, delimiter="\t")

#Appends column names to dataframe
df.columns=['Epitope', 'UniProt_Name']

#Makes Protein Names into List (unique values, still including any NaN entry)
uniprot=list(df.UniProt_Name.unique())

#Removes all unmatched (NaN) values from list and converts non-alphanumeric
#characters in each UniProt Name to their percent-encoded (hex) equivalent
#so the name is URL compatible
cleanedList = [urllib.parse.quote(name) for name in uniprot if pd.notna(name)]

#Can only query a certain max amount of entries at a time, so the list of
#protein names is divided into sublists of at most MAX_NAMES_PER_QUERY names.
#Slicing (instead of repeatedly trimming cleanedList) leaves cleanedList intact,
#so it still holds every cleaned name after this cell has run.
#An empty cleanedList produces no sublists at all, so no empty query is built.
MAX_NAMES_PER_QUERY = 50
uniprotList = [
    cleanedList[start:start + MAX_NAMES_PER_QUERY]
    for start in range(0, len(cleanedList), MAX_NAMES_PER_QUERY)
]


In [55]:
###db2db QUERY (Iterates through each SubList)

#Parameters for db2db Query (Change if necessary)
#They do not change between sublists, so they are set once outside the loop
method='db2db'
format_type='row'
input_type='uniprotproteinname'
outputs='genesymbol'
taxonId='9606'

#Query URL template. Every parameter must be written as key=value; without the
#"=" after "method" the URL contains "methoddb2db" and the method is never passed.
url_template = (
    "https://biodbnet-abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json"
    "?method={method}&format={format_type}&input={input_type}"
    "&inputValues={inputValues}&outputs={outputs}&taxonId={taxonId}"
)

#pd.io.json.json_normalize is deprecated in favour of pd.json_normalize;
#use the top-level function when available, otherwise fall back to the old location
normalize_json = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

#Collects one dataframe per sub-query; they are concatenated once after the loop
#instead of re-copying the growing result on every iteration
query_frames=[]

for uniprotSubList in uniprotList:
    #Nothing to look up for an empty sublist, so no request is sent
    if not uniprotSubList:
        continue

    #Formats UniProt Names list so it is compliant with db2db API Query
    uniprotNames=','.join(uniprotSubList)
    inputValues=uniprotNames

    #json_url stays a module-level name: after the loop it holds the last URL queried
    json_url = url_template.format(method=method, format_type=format_type, input_type=input_type, inputValues=inputValues, outputs=outputs, taxonId=taxonId)

    #Results imported as JSON
    with urllib.request.urlopen(json_url) as response:
        data = json.loads(response.read().decode())

    ###POST-PROCESSING AND CLEAN-UP OF QUERY RESULTS

    #Converts JSON to Pandas Dataframe
    json_df = normalize_json(data)

    #Sets Column Headers
    #NOTE(review): assumes the API returns each queried name under the key
    #'InputValue' - TODO confirm against a live response. Renaming by key keeps the
    #labels correct whatever column order json_normalize produces; when that key is
    #not present the two columns are labelled by position.
    if 'InputValue' in json_df.columns and len(json_df.columns) == 2:
        symbol_column = next(col for col in json_df.columns if col != 'InputValue')
        json_df = json_df.rename(columns={'InputValue': 'UniProt_Name', symbol_column: 'Gene_Symbol'})
        json_df = json_df[['Gene_Symbol', 'UniProt_Name']]
    else:
        json_df.columns=['Gene_Symbol', 'UniProt_Name']

    query_frames.append(json_df)

#Merges all sub-query results and drops duplicate entries in one pass
if query_frames:
    sub_df=pd.concat(query_frames, axis=0, ignore_index=True).drop_duplicates()
else:
    #No query returned anything: keep the expected columns so the merge below still works
    sub_df=pd.DataFrame(columns=['Gene_Symbol', 'UniProt_Name'])

#Merges query input from above with all query results
#(inner join: only names that received a result are kept)
output=pd.merge(df, sub_df, how='inner', on='UniProt_Name')

#Optional: drops duplicate entries in the merged dataframe
#output=output.drop_duplicates()

output

#Writes output to csv file
# output.to_csv(r'protname_gene.csv', sep=',', encoding='utf-8', index=False, header=True)

Unnamed: 0,Epitope,UniProt_Name,Gene_Symbol
0,VPRAQGFL,Collagen alpha-1(VII) chain,COL7A1
1,VRVSWSPV,Collagen alpha-1(VII) chain,COL7A1
2,GTLHVVQR,Collagen alpha-1(VII) chain,COL7A1
3,LGTLHVVQ,Collagen alpha-1(VII) chain,COL7A1
4,LRWEPVPR,Collagen alpha-1(VII) chain,COL7A1
5,RVRVSWSP,Collagen alpha-1(VII) chain,COL7A1
6,RVSWSPVP,Collagen alpha-1(VII) chain,COL7A1
7,TLHVVQRG,Collagen alpha-1(VII) chain,COL7A1
8,VRVSWSPV,Collagen alpha-1(VII) chain,COL7A1
9,VPRAQGFL,Collagen alpha-1(VII) chain,COL7A1


In [11]:
#Runs the db2db query for one sublist only and displays the labelled result.
#The index 10 is hard-coded; indexing raises IndexError when uniprotList
#holds fewer than 11 sublists.

#Formats UniProt Names list so it is compliant with db2db API Query
uniprotNames=','.join(uniprotList[10])

#Parameters for db2db Query (Change if necessary)
method='db2db'
format_type='row'
input_type='uniprotproteinname'
inputValues=uniprotNames
outputs='genesymbol'
taxonId='9606'

#Every parameter must be written as key=value; without the "=" after "method"
#the URL contains "methoddb2db" and the method is never passed.
json_url = "https://biodbnet-abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json?method={method}&format={format_type}&input={input_type}&inputValues={inputValues}&outputs={outputs}&taxonId={taxonId}".format(method=method, format_type=format_type, input_type=input_type, inputValues=inputValues, outputs=outputs, taxonId=taxonId)


#Results imported as JSON
with urllib.request.urlopen(json_url) as url:
    data = json.loads(url.read().decode())


###POST-PROCESSING AND CLEAN-UP OF QUERY RESULTS

#Converts JSON to Pandas Dataframe
#pd.io.json.json_normalize is deprecated in favour of pd.json_normalize;
#use the top-level function when available, otherwise fall back to the old location
normalize_json = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
json_df = normalize_json(data)

#Sets Column Headers
#NOTE(review): assumes the API returns each queried name under the key
#'InputValue' - TODO confirm against a live response. Renaming by key keeps the
#labels correct whatever column order json_normalize produces; when that key is
#not present the two columns are labelled by position.
if 'InputValue' in json_df.columns and len(json_df.columns) == 2:
    symbol_column = next(col for col in json_df.columns if col != 'InputValue')
    json_df = json_df.rename(columns={'InputValue': 'UniProt_Name', symbol_column: 'Gene_Symbol'})
    json_df = json_df[['Gene_Symbol', 'UniProt_Name']]
else:
    json_df.columns=['Gene_Symbol', 'UniProt_Name']

json_df

Unnamed: 0,Gene_Symbol,UniProt_Name
0,FOXK2,Forkhead box protein K2
1,EPB41L5,Band 4.1-like protein 5
2,SDCCAG8,Serologically defined colon cancer antigen 8
3,NUP155,Nuclear pore complex protein Nup155
4,THPO,Thrombopoietin
5,THPO,Thrombopoietin
6,SLC30A8,Zinc transporter 8
7,AQP4,Aquaporin-4
8,AQP4,Aquaporin-4
9,SLC30A8,Zinc transporter 8


In [58]:
#Prints the character count of json_url, then displays the URL string as the cell result.
#NOTE(review): json_url is not defined in this cell; it holds whatever value an earlier
#cell last assigned (the query loop reassigns it on every iteration), so those cells
#must have been run first.
print(len(json_url))
json_url

1092


'https://biodbnet-abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json?methoddb2db&format=row&input=uniprotproteinname&inputValues=Caspase-8,Tumor%20necrosis%20factor%20receptor%20superfamily%20member%206,DNA%20topoisomerase%203-beta,Probable%20monogalactosyldiacylglycerol%20synthase%2C%20chloroplastic,Probable%20DNA-directed%20RNA%20polymerase%20subunit%20delta,Proteasome%20activator%20complex%20subunit%203,SET%20domain-containing%20protein%204,DNA%20repair%20protein%20XRCC4,Immunoglobulin%20heavy%20variable%204-34,Ig%20heavy%20chain%20V%20region%20102,Unconventional%20myosin-VI,60S%20acidic%20ribosomal%20protein%20P2-B,60S%20acidic%20ribosomal%20protein%20P2-A,Zinc-type%20alcohol%20dehydrogenase-like%20protein%20YogA,Uromodulin,Noggin,Sclerostin,Envelope%20glycoprotein%20D,Gag-Pol%20polyprotein,Membrane%20protein,Jacalin-related%20lectin%2022,SusD-like%20protein%20BACOVA_02651,Casein%20kinase%20II%20subunit%20alpha,Isoleucine--tRNA%20ligase,Melanin-concentrating%20hormone%20rec

In [57]:
#Saves the merged table as a comma-separated, UTF-8 encoded file
#with a header row and without the dataframe index
output.to_csv(
    r'protname_gene.csv',
    sep=',',
    encoding='utf-8',
    index=False,
    header=True,
)