## 1. Unify gene sets

In [1]:
import pandas as pd

In [2]:
disgenet_df = pd.read_csv("../data/diseases/DisGeNET/tables/geneAttributes.csv", sep="\t")
drugbank_tgt_df = pd.read_excel("../parser/temp/drug_target_edges.xlsx")
string_df = pd.read_csv("../data/genes/9606.protein.info.v11.5.txt", sep="\t")

In [3]:
disgenet_gene_set = set(disgenet_df["geneName"].to_list())
drugbank_tgt_gene_set = set(drugbank_tgt_df["gene-name"])
string_gene_set = set(string_df["preferred_name"].to_list())

In [4]:
print(len(disgenet_gene_set), len(drugbank_tgt_gene_set), len(string_gene_set))

26132 2169 19563


In [5]:
complete_gene_set = string_gene_set.union(disgenet_gene_set.union(drugbank_tgt_gene_set))
print(len(complete_gene_set))

cgs = []
for g in complete_gene_set:
    if type(g) == str:
        cgs.append(g.upper())
len(set(cgs))

complete_gene_set = set(cgs)

29730


In [6]:
with open("./temp/gene_names.dat", "w+") as fp:
    for item in complete_gene_set:
        fp.write(item + "\n")

# 2. map gene names with uniprotkb wherever possible

In [7]:
disgenet_df_map = pd.read_csv("../data/diseases/DisGeNET/mapa_geneid_4_uniprot_crossref.tsv", sep="\t")
disgenet_df_map.columns = ["uniprotkb-id","geneId"]
disgenet_df_map

Unnamed: 0,uniprotkb-id,geneId
0,P04217,1
1,P11245,10
2,P00813,100
3,P19022,1000
4,Q9Y243,10000
...,...,...
17035,O95758,9991
17036,Q9Y6J6,9992
17037,P98153,9993
17038,Q9UKL3,9994


In [8]:
disgenet_df2 = pd.merge(disgenet_df, disgenet_df_map, on="geneId")
disgenet_df2

Unnamed: 0,geneNID,geneId,geneName,geneDescription,pLI,DSI,DPI,uniprotkb-id
0,1,1,A1BG,alpha-1-B glycoprotein,4.991700e-09,0.700,0.538,P04217
1,2,2,A2M,alpha-2-macroglobulin,4.522900e-11,0.529,0.769,P01023
2,4,9,NAT1,N-acetyltransferase 1,1.929400e-14,0.536,0.846,P18440
3,5,10,NAT2,N-acetyltransferase 2,3.274400e-06,0.451,0.885,P11245
4,6,12,SERPINA3,serpin family A member 3,8.833000e-14,0.486,0.846,P01011
...,...,...,...,...,...,...,...,...
17035,25907,110599564,EEF1AKMT4,EEF1A lysine methyltransferase 4,,,,P0DPD7
17036,25910,110599583,EEF1AKMT4-ECE2,EEF1AKMT4-ECE2 readthrough,,0.792,0.385,P0DPD6
17037,25910,110599583,EEF1AKMT4-ECE2,EEF1AKMT4-ECE2 readthrough,,0.792,0.385,P0DPD8
17038,25931,111188157,LYNX1-SLURP2,LYNX1-SLURP2 readthrough,,0.839,0.115,P0DP58


In [9]:
disgenet_df3 = disgenet_df2[["geneId","geneName","uniprotkb-id"]]
disgenet_df3

Unnamed: 0,geneId,geneName,uniprotkb-id
0,1,A1BG,P04217
1,2,A2M,P01023
2,9,NAT1,P18440
3,10,NAT2,P11245
4,12,SERPINA3,P01011
...,...,...,...
17035,110599564,EEF1AKMT4,P0DPD7
17036,110599583,EEF1AKMT4-ECE2,P0DPD6
17037,110599583,EEF1AKMT4-ECE2,P0DPD8
17038,111188157,LYNX1-SLURP2,P0DP58


In [17]:
non_annot_genes = string_gene_set - disgenet_gene_set.union(drugbank_tgt_gene_set)

In [18]:
with open("./temp/gene_names.dat","w+") as fp:
    for item in non_annot_genes:
        fp.write(item + "\n")


In [44]:
import requests
from tqdm import tqdm
import time
base_url="https://rest.uniprot.org/uniprotkb/"

with open("non_annot_genes.dat","w+") as fp:
    for idx,item in tqdm(enumerate(list(non_annot_genes))):
        url = base_url + "search?query=gene:" + item + "+AND+organism_id:9606&format=tsv&fields=accession,id,gene_names"
        resp = requests.get(url).text
        fp.write(resp)
        if idx%100==0:
            time.sleep(60)



400it [08:46,  1.41it/s]

In [42]:
list(non_annot_genes).index(item)

2717