## 1. Unify gene sets

In [2]:
import pandas as pd

### Load up all data sets and create an unified list of gene names

In [3]:
disgenet_df = pd.read_csv("../data/diseases/DisGeNET/tables/geneAttributes.csv", sep="\t")
drugbank_tgt_df = pd.read_excel("../parser/temp/drug_target_edges.xlsx")
string_df = pd.read_csv("../data/genes/string-db_11.5/9606.protein.info.v11.5.txt", sep="\t")

In [4]:
disgenet_gene_set = set(disgenet_df["geneName"].to_list())
drugbank_tgt_gene_set = set(drugbank_tgt_df["gene-name"].to_list())
string_df_gene_set = set(string_df["preferred_name"].to_list())

In [5]:
len(disgenet_gene_set), len(drugbank_tgt_gene_set), len(string_df_gene_set), len(set(string_df["#string_protein_id"].to_list()))

(26132, 2169, 19563, 19566)

In [6]:
disgenet_gene_set = [i.upper() for i in disgenet_gene_set]
drugbank_tgt_gene_set = [str(i).upper() for i in drugbank_tgt_gene_set]
string_df_gene_set = [i.upper() for i in string_df_gene_set]

complete_gene_set = set(disgenet_gene_set).union(set(drugbank_tgt_gene_set).union(set(string_df_gene_set)))
print(len(complete_gene_set))

29714


In [7]:
with open("./temp/gene_names.dat", "w+") as fp:
    for item in complete_gene_set:
        fp.write(item + "\n")

# 2. Map gene names with uniprotkb wherever possible

In [8]:
disgenet_df_map = pd.read_csv("../data/diseases/DisGeNET/mapa_geneid_4_uniprot_crossref.tsv", sep="\t")
disgenet_df_map.columns = ["uniprotkb-id","geneId"]
disgenet_df_map

Unnamed: 0,uniprotkb-id,geneId
0,P04217,1
1,P11245,10
2,P00813,100
3,P19022,1000
4,Q9Y243,10000
...,...,...
17035,O95758,9991
17036,Q9Y6J6,9992
17037,P98153,9993
17038,Q9UKL3,9994


In [9]:
disgenet_df2 = pd.merge(disgenet_df, disgenet_df_map, on="geneId")
disgenet_df2

Unnamed: 0,geneNID,geneId,geneName,geneDescription,pLI,DSI,DPI,uniprotkb-id
0,1,1,A1BG,alpha-1-B glycoprotein,4.991700e-09,0.700,0.538,P04217
1,2,2,A2M,alpha-2-macroglobulin,4.522900e-11,0.529,0.769,P01023
2,4,9,NAT1,N-acetyltransferase 1,1.929400e-14,0.536,0.846,P18440
3,5,10,NAT2,N-acetyltransferase 2,3.274400e-06,0.451,0.885,P11245
4,6,12,SERPINA3,serpin family A member 3,8.833000e-14,0.486,0.846,P01011
...,...,...,...,...,...,...,...,...
17035,25907,110599564,EEF1AKMT4,EEF1A lysine methyltransferase 4,,,,P0DPD7
17036,25910,110599583,EEF1AKMT4-ECE2,EEF1AKMT4-ECE2 readthrough,,0.792,0.385,P0DPD6
17037,25910,110599583,EEF1AKMT4-ECE2,EEF1AKMT4-ECE2 readthrough,,0.792,0.385,P0DPD8
17038,25931,111188157,LYNX1-SLURP2,LYNX1-SLURP2 readthrough,,0.839,0.115,P0DP58


In [10]:
disgenet_df3 = disgenet_df2[["geneId","geneName","uniprotkb-id"]]
disgenet_df3

Unnamed: 0,geneId,geneName,uniprotkb-id
0,1,A1BG,P04217
1,2,A2M,P01023
2,9,NAT1,P18440
3,10,NAT2,P11245
4,12,SERPINA3,P01011
...,...,...,...
17035,110599564,EEF1AKMT4,P0DPD7
17036,110599583,EEF1AKMT4-ECE2,P0DPD6
17037,110599583,EEF1AKMT4-ECE2,P0DPD8
17038,111188157,LYNX1-SLURP2,P0DP58


In [11]:
drugbank_prot_df = pd.read_csv(r"..\data\drugs\drugbank_5.1.10\protein_identifiers.csv")
drugbank_prot_df = drugbank_prot_df[['Gene Name','UniProt ID']]
drugbank_prot_df

Unnamed: 0,Gene Name,UniProt ID
0,ftsI,P45059
1,HDC,P19113
2,GLS2,Q9UI32
3,F13A1,P00488
4,NOS2,P35228
...,...,...
5341,CD247,P20963
5342,PMX,Q8IAS0
5343,PMX,W7JWW5
5344,F,P03420


### Get uniprot for string proteins externally from uniprot.
String uses Ensembl Protein ID

In [12]:
ensembl_prot_ids = [i for i in string_df['#string_protein_id'].to_list()]
with open("temp/string_ensembl.txt","w") as fp:
    for item in ensembl_prot_ids:
        fp.write(item + "\n")

ensembl_prot_ids = [i.split(".")[1] for i in string_df['#string_protein_id'].to_list()]

In [13]:
ensembl2uniprot_df = pd.read_excel("../data/uniprot id map/ensembl2uniprot.xlsx")
string2uniprot_df = pd.read_excel("../data/uniprot id map/string2uniprot.xlsx")
string2uniprot_df['From'] = [i.split(".")[1] for i in string2uniprot_df['From'].to_list()]
ensembl2uniprot_df = pd.concat([ensembl2uniprot_df, string2uniprot_df])
ensembl2uniprot_df

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,From,Entry,Entry Name,Gene Names,Protein names
0,ENSP00000419539,Q8IX94,CTGE4_HUMAN,CTAGE4,
1,ENSP00000474279,S4R3F8,S4R3F8_HUMAN,,
2,ENSP00000347168,P62805,H4_HUMAN,H4C1 H4/A H4FA HIST1H4A; H4C2 H4/I H4FI HIST1H...,
3,ENSP00000305847,Q96QH8,LYZL5_HUMAN,SPACA5 LYZL5 SPACA5A UNQ6288/PRO20753; SPACA5B,
4,ENSP00000431814,E9PJI5,NPIA7_HUMAN,NPIPA7,
...,...,...,...,...,...
18870,ENSP00000485629,A0A096LPI5,GVQW2_HUMAN,CCDC28A-AS1 GVQW2,Putative uncharacterized protein CCDC28A-AS1 (...
18871,ENSP00000485659,P98088,MUC5A_HUMAN,MUC5AC MUC5,Mucin-5AC (MUC-5AC) (Gastric mucin) (Major air...
18872,ENSP00000485663,Q9Y262,EIF3L_HUMAN,EIF3L EIF3EIP EIF3S6IP HSPC021 HSPC025 MSTP005,Eukaryotic translation initiation factor 3 sub...
18873,ENSP00000485668,A0A096LPK9,ORN4C_HUMAN,OR4N4C,Olfactory receptor 4N4C


In [14]:
non_mapped_ensmbl = set(ensembl_prot_ids) - set(ensembl2uniprot_df["From"].to_list())
non_mapped_ensmbl = list(non_mapped_ensmbl)

In [15]:
# import requests
# resp_dict = {}

# for i, id in enumerate(non_mapped_ensmbl):
#     reqUrl = "https://rest.ensembl.org/xrefs/id/" + str(id)

#     headersList = {
#     "Accept": "*/*",
#     "Content-Type": "application/json" 
#     }

#     payload = ""
#     response = requests.request("GET", reqUrl, data=payload,  headers=headersList)
#     if response.status_code == 200:
#         print(i, id, response.text)
#         resp_dict[id] = response.text
#     else:
#         print(i, id, response.status_code)
    
# # gets mostly 400s. Not worth my time, so just moving on. 
# import json
# with open("temp/resp.json", "w") as fp:
#     json.dump(resp_dict , fp)

In [16]:
ensembl2uniprot_df['Gene Names'] = ensembl2uniprot_df['Gene Names'].fillna(ensembl2uniprot_df['Entry'])
# ensembl2uniprot_df['Gene Names'] = [i.replace("/"," ") for i in ensembl2uniprot_df['Gene Names'].to_list()]
ensembl2uniprot_df['Gene Names'] = [i.replace(";","") for i in ensembl2uniprot_df['Gene Names'].to_list()]
ensembl2uniprot_df['Gene Names'] = [i.split(" ") for i in ensembl2uniprot_df['Gene Names'].to_list()]
ensembl2uniprot_df = ensembl2uniprot_df.explode(['Gene Names'])

In [17]:
ensembl2uniprot_df = ensembl2uniprot_df[["From","Entry","Gene Names"]]
ensembl2uniprot_df.columns = ['ensembl_id', 'uniprotkb-id', 'geneName']
ensembl2uniprot_df

Unnamed: 0,ensembl_id,uniprotkb-id,geneName
0,ENSP00000419539,Q8IX94,CTAGE4
1,ENSP00000474279,S4R3F8,S4R3F8
2,ENSP00000347168,P62805,H4C1
2,ENSP00000347168,P62805,H4/A
2,ENSP00000347168,P62805,H4FA
...,...,...,...
18872,ENSP00000485663,Q9Y262,HSPC021
18872,ENSP00000485663,Q9Y262,HSPC025
18872,ENSP00000485663,Q9Y262,MSTP005
18873,ENSP00000485668,A0A096LPK9,OR4N4C


In [18]:
# sanity check
len(set(ensembl2uniprot_df['ensembl_id'].to_list())), len(string_df_gene_set)

(19254, 19563)

## 3. Add uniprot ids against unified gene sets
Build up the string database column since that will define the network

In [19]:
ensembl2uniprot_df

Unnamed: 0,ensembl_id,uniprotkb-id,geneName
0,ENSP00000419539,Q8IX94,CTAGE4
1,ENSP00000474279,S4R3F8,S4R3F8
2,ENSP00000347168,P62805,H4C1
2,ENSP00000347168,P62805,H4/A
2,ENSP00000347168,P62805,H4FA
...,...,...,...
18872,ENSP00000485663,Q9Y262,HSPC021
18872,ENSP00000485663,Q9Y262,HSPC025
18872,ENSP00000485663,Q9Y262,MSTP005
18873,ENSP00000485668,A0A096LPK9,OR4N4C


#### Fix columns for rest of the dataframes and prepare for merge

In [20]:
disgenet_df_subset = disgenet_df3[["geneName","uniprotkb-id"]]
disgenet_df_subset

Unnamed: 0,geneName,uniprotkb-id
0,A1BG,P04217
1,A2M,P01023
2,NAT1,P18440
3,NAT2,P11245
4,SERPINA3,P01011
...,...,...
17035,EEF1AKMT4,P0DPD7
17036,EEF1AKMT4-ECE2,P0DPD6
17037,EEF1AKMT4-ECE2,P0DPD8
17038,LYNX1-SLURP2,P0DP58


In [21]:
temp_df = pd.merge(left=ensembl2uniprot_df, right=disgenet_df_subset, on='uniprotkb-id', how='left')

In [22]:
temp_df['geneName_y'] = temp_df['geneName_y'].fillna(temp_df['geneName_x'])
temp_df['gene_names'] = list(zip(temp_df['geneName_x'], temp_df['geneName_y']))
temp_df = temp_df.explode(['gene_names'])
temp_df = temp_df.drop_duplicates()
temp_df = temp_df[["ensembl_id","uniprotkb-id","gene_names"]]
temp_df

Unnamed: 0,ensembl_id,uniprotkb-id,gene_names
0,ENSP00000419539,Q8IX94,CTAGE4
1,ENSP00000474279,S4R3F8,S4R3F8
2,ENSP00000347168,P62805,H4C1
2,ENSP00000347168,P62805,H4C9
3,ENSP00000347168,P62805,H4C1
...,...,...,...
62616,ENSP00000485663,Q9Y262,EIF3L
62617,ENSP00000485663,Q9Y262,MSTP005
62617,ENSP00000485663,Q9Y262,EIF3L
62618,ENSP00000485668,A0A096LPK9,OR4N4C


In [23]:
drugbank_prot_df.columns = ["uniprotkb-id","gene_names"]
drugbank_prot_df

Unnamed: 0,uniprotkb-id,gene_names
0,ftsI,P45059
1,HDC,P19113
2,GLS2,Q9UI32
3,F13A1,P00488
4,NOS2,P35228
...,...,...
5341,CD247,P20963
5342,PMX,Q8IAS0
5343,PMX,W7JWW5
5344,F,P03420


In [24]:
temp_df2 = pd.merge(temp_df,drugbank_prot_df,on='uniprotkb-id',how='left')
temp_df2 = temp_df2[['ensembl_id','uniprotkb-id','gene_names_x']]
temp_df2.columns = ['ensembl_id','uniprotkb_id','gene_names']
temp_df2 = temp_df2.drop_duplicates()
temp_df2

Unnamed: 0,ensembl_id,uniprotkb_id,gene_names
0,ENSP00000419539,Q8IX94,CTAGE4
1,ENSP00000474279,S4R3F8,S4R3F8
2,ENSP00000347168,P62805,H4C1
3,ENSP00000347168,P62805,H4C9
6,ENSP00000347168,P62805,H4C4
...,...,...,...
103440,ENSP00000485663,Q9Y262,HSPC021
103442,ENSP00000485663,Q9Y262,HSPC025
103444,ENSP00000485663,Q9Y262,MSTP005
103446,ENSP00000485668,A0A096LPK9,OR4N4C


In [37]:
temp_df2['is_known_drug_tgt'] = [i in drugbank_tgt_df['uniprotkb-id'].to_list() for i in temp_df2['uniprotkb_id'].to_list()]
temp_df2['is_disease_associated'] = [i in disgenet_df3['uniprotkb-id'].to_list() for i in temp_df2['uniprotkb_id'].to_list()]
temp_df2

Unnamed: 0,ensembl_id,uniprotkb_id,gene_names,is_known_drug_tgt,is_disease_associated
0,ENSP00000419539,Q8IX94,CTAGE4,False,False
1,ENSP00000474279,S4R3F8,S4R3F8,False,False
2,ENSP00000347168,P62805,H4C1,False,True
3,ENSP00000347168,P62805,H4C9,False,True
6,ENSP00000347168,P62805,H4C4,False,True
...,...,...,...,...,...
103440,ENSP00000485663,Q9Y262,HSPC021,False,True
103442,ENSP00000485663,Q9Y262,HSPC025,False,True
103444,ENSP00000485663,Q9Y262,MSTP005,False,True
103446,ENSP00000485668,A0A096LPK9,OR4N4C,False,False


In [38]:
temp_df2.to_csv("temp/gene_nodes_identifiers.tsv", sep="\t", index=False)

## 4. Filter edges

In [39]:
gene_edges = pd.read_csv("../data/genes/string-db_11.5/9606.protein.links.detailed.v11.5.txt", sep=" ")
gene_edges['protein1'] = [i.split(".")[1] for i in gene_edges['protein1'].to_list()]
gene_edges['protein2'] = [i.split(".")[1] for i in gene_edges['protein2'].to_list()]
gene_edges

Unnamed: 0,protein1,protein2,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,ENSP00000000233,ENSP00000379496,0,0,0,54,0,0,144,155
1,ENSP00000000233,ENSP00000314067,0,0,0,0,180,0,61,197
2,ENSP00000000233,ENSP00000263116,0,0,0,62,152,0,101,222
3,ENSP00000000233,ENSP00000361263,0,0,0,0,161,0,64,181
4,ENSP00000000233,ENSP00000409666,0,0,0,82,213,0,72,270
...,...,...,...,...,...,...,...,...,...,...
11938493,ENSP00000485678,ENSP00000354800,0,0,0,213,0,0,0,213
11938494,ENSP00000485678,ENSP00000308270,0,0,0,152,0,0,0,151
11938495,ENSP00000485678,ENSP00000335660,0,0,0,182,0,0,0,181
11938496,ENSP00000485678,ENSP00000300127,0,0,0,155,0,0,0,154


In [43]:
ensembl_prot_ids_final = list(set(temp_df2['ensembl_id'].to_list()))

In [44]:
temp_e_df = gene_edges[gene_edges['protein1'].isin(ensembl_prot_ids_final)]
temp_e_df = temp_e_df[temp_e_df['protein2'].isin(ensembl_prot_ids_final)]
temp_e_df.columns = ['src', 'dst', 'neighborhood', 'fusion', 'cooccurence',
       'coexpression', 'experimental', 'database', 'textmining',
       'combined_score']
temp_e_df

Unnamed: 0,src,dst,neighborhood,fusion,cooccurence,coexpression,experimental,database,textmining,combined_score
0,ENSP00000000233,ENSP00000379496,0,0,0,54,0,0,144,155
1,ENSP00000000233,ENSP00000314067,0,0,0,0,180,0,61,197
2,ENSP00000000233,ENSP00000263116,0,0,0,62,152,0,101,222
3,ENSP00000000233,ENSP00000361263,0,0,0,0,161,0,64,181
4,ENSP00000000233,ENSP00000409666,0,0,0,82,213,0,72,270
...,...,...,...,...,...,...,...,...,...,...
11938493,ENSP00000485678,ENSP00000354800,0,0,0,213,0,0,0,213
11938494,ENSP00000485678,ENSP00000308270,0,0,0,152,0,0,0,151
11938495,ENSP00000485678,ENSP00000335660,0,0,0,182,0,0,0,181
11938496,ENSP00000485678,ENSP00000300127,0,0,0,155,0,0,0,154


In [45]:
temp_e_df.to_csv("temp/gene_edges.tsv", sep="\t", index=False)