In [48]:
import json
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm

# Target

In [52]:
folder_ = "../data/ot/targets/"

In [60]:
def read_json(infile):
    with open(infile, encoding="utf8") as f:
        doc = json.loads(f.read())
    return doc

def get_input_files(infolder, keyword = "part-"):
    return sorted(glob(infolder+keyword+"*"))

In [51]:
columns = ["id", "approvedSymbol", "functionDescriptions","synonyms", "nameSynonyms", "subcellularLocations", "targetClass", "pathways"]
targets_lol = [] # targets list of lists

infiles = get_input_files(infolder)

for infile in tqdm(infiles):
    with open(infile, encoding="utf8") as f:
        for line in f.readlines()[:10]:
            ent = json.loads(line)
            l = []
            for c in columns:

                if c in ent:
                    if c in ["synonyms", "nameSynonyms", "targetClass"] and len(ent[c])>0:
                        l.append("\t".join([item["label"] for item in ent[c]]))
                    elif c == "subcellularLocations" and len(ent[c])>0:
                        l.append("\t".join([item["location"] for item in ent[c]]))
                    elif c == "functionDescriptions" and len(ent[c])>0:
                        l.append(". ".join(ent[c]))
                    else:
                        l.append(ent[c])
                else:
                    l.append("")
            targets_lol.append(l)
                
df_targets = pd.DataFrame(targets_lol, columns=columns)
df_targets.to_csv("../data/ot/processed_raw_files/targets_processed.tsv", sep="\t", index=None, header=0)
df_targets


  0%|          | 0/200 [00:00<?, ?it/s]

Unnamed: 0,id,approvedSymbol,functionDescriptions,synonyms,nameSynonyms,subcellularLocations,targetClass,pathways
0,ENSG00000020219,CCT8L1P,Possible molecular chaperone; assists the fold...,Putative T-complex protein 1 subunit theta-lik...,Putative T-complex protein 1 subunit theta-lik...,Cytoplasm,,
1,ENSG00000059588,TARBP1,Probable S-adenosyl-L-methionine-dependent met...,Probable methyltransferase TARBP1\tTARBP1\tTRM...,Probable methyltransferase TARBP1\tTAR RNA-bin...,Nuclear speckles,Other nuclear protein,
2,ENSG00000070182,SPTB,Spectrin is the major constituent of the cytos...,"Spectrin beta chain, erythrocytic\tSPTB\tSPTB1...","Spectrin beta chain, erythrocytic\tBeta-I spec...",Cytoplasm\tCytosol,,"[{'pathwayId': 'R-HSA-6807878', 'pathway': 'CO..."
3,ENSG00000070366,SMG6,Component of the telomerase ribonucleoprotein ...,Telomerase-binding protein EST1A\tSMG6\tC17orf...,Telomerase-binding protein EST1A\tEver shorter...,Nucleus\tChromosome\tCytoplasm\tNucleoli\tCytosol,,"[{'pathwayId': 'R-HSA-975957', 'pathway': 'Non..."
4,ENSG00000072071,ADGRL1,Calcium-independent receptor of high affinity ...,Adhesion G protein-coupled receptor L1\tADGRL1...,Adhesion G protein-coupled receptor L1\tCalciu...,Cell membrane\tCell projection\tPresynaptic ce...,,
...,...,...,...,...,...,...,...,...
1995,ENSG00000070669,ASNS,[],Asparagine synthetase [glutamine-hydrolyzing]\...,Asparagine synthetase [glutamine-hydrolyzing]\...,Cytosol,Enzyme,"[{'pathwayId': 'R-HSA-8963693', 'pathway': 'As..."
1996,ENSG00000077327,SPAG6,Important for structural integrity of the cent...,Sperm-associated antigen 6\tSPAG6\tPF16\tRepro...,Sperm-associated antigen 6\tProtein PF16 homol...,Cytoplasm\tCell projection,,
1997,ENSG00000081248,CACNA1S,"Pore-forming, alpha-1S subunit of the voltage-...",Voltage-dependent L-type calcium channel subun...,Voltage-dependent L-type calcium channel subun...,Cell membrane,Auxiliary transport protein\tVoltage-gated cal...,"[{'pathwayId': 'R-HSA-419037', 'pathway': 'NCA..."
1998,ENSG00000085415,SEH1L,Component of the Nup107-160 subcomplex of the ...,Nucleoporin SEH1\tSEH1L\tSEC13L\tSEH1\tSEH1A\t...,Nucleoporin SEH1\tGATOR complex protein SEH1\t...,Chromosome\tNucleus\tLysosome membrane,,"[{'pathwayId': 'R-HSA-168325', 'pathway': 'Vir..."


# Disease

In [82]:
infolder_dis = "../data/ot/diseases/"
infiles_dis = get_input_files(infolder_dis)

columns = ["id", "approvedSymbol", "dbXRefs","description","name","synonyms", "directLocationIds", "therapeuticAreas","parents", "ancestors", "descendants", "children"]
synonyms = ["hasBroadSynonym", "hasExactSynonym", "hasNarrowSynonym", "hasRelatedSynonym"]
columns=columns[:4]+synonyms+columns[6:]

dis_lol = [] #list of lists

for infile in tqdm(infiles_dis):
    with open(infile, encoding="utf8") as f:
        for line in f.readlines()[:10]:
            ent = json.loads(line)
            l = []
            for c in columns:

                if c in ent:
#                     if c in ["synonyms", "nameSynonyms", "targetClass"] and len(ent[c])>0:
#                         l.append("\t".join([item["label"] for item in ent[c]]))
#                     elif c == "subcellularLocations" and len(ent[c])>0:
#                         l.append("\t".join([item["location"] for item in ent[c]]))
#                     elif c == "functionDescriptions" and len(ent[c])>0:
#                         l.append(". ".join(ent[c]))
                    if c == "synonyms" and len(ent[c])>0:
                        for s in synonyms:
                            if s in ent[c]:
                                l.append(ent[c][s])
                            else:
                                l.append("")
                    else:
                        l.append(ent[c])
                else:
                    l.append("")
            
            dis_lol.append(l)
                
df_dis = pd.DataFrame(dis_lol, columns=columns)
df_dis.to_csv("../data/ot/processed_raw_files/disease_processed.tsv", sep="\t", index=None, header=0)
df_dis

  0%|          | 0/200 [00:00<?, ?it/s]

Unnamed: 0,id,approvedSymbol,dbXRefs,description,hasBroadSynonym,hasExactSynonym,hasNarrowSynonym,hasRelatedSynonym,directLocationIds,therapeuticAreas,parents,ancestors,descendants,children
0,DOID_7551,,"[ICD9:098.89, MeSH:D006069, NCIt:C92950, DOID:...",A primary bacterial infectious disease that is...,,,,,,"[OTAR_0000017, EFO_0005741]","[EFO_0003955, MONDO_0000314]","[EFO_0000512, OTAR_0000017, EFO_0003955, EFO_0...",[],[]
1,EFO_0004254,,"[NCIt:C34645, NCIT:C34645, MeSH:D015433, MONDO...",A slowly progressive inflammation of the glome...,,,,,,[EFO_0009690],[MONDO_0002462],"[EFO_1002050, EFO_0009690, EFO_0003086, MONDO_...",[MONDO_0013860],[MONDO_0013860]
2,EFO_0005189,,[SNOMEDCT:74427007],The respiratory quotient (or RQ or respiratory...,,,,,,[EFO_0001444],[EFO_0005115],"[EFO_0001444, EFO_0005115]",[],[]
3,EFO_0005853,,[],short or long term physiological response of a...,,,,,,[GO_0008150],[GO_0050896],"[GO_0050896, GO_0008150]",[],[]
4,EFO_0006317,,[],Any process that results in a change in state ...,,,,,,[GO_0008150],[GO_0009410],"[GO_0050896, GO_0008150, GO_0009410]",[EFO_0007853],[EFO_0007853]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,EFO_0007307,,"[MeSH:D013283, SCTID:57920007, DOID:9059, MOND...",A Simplexvirus infectious disease that results...,,,,,,"[EFO_0005741, EFO_0010282]","[EFO_0009688, EFO_1002022]","[EFO_1002022, EFO_0009688, EFO_0000763, EFO_00...",[],[]
1996,EFO_0007422,,"[MESH:D010305, DOID:10302, MeSH:NoID, MONDO:00...",A disease involving the parotid gland.,,,,,,[EFO_0010282],[EFO_0008581],"[EFO_0008581, EFO_1001047, EFO_0010282]","[MONDO_0004700, EFO_1000459, EFO_1000461, EFO_...","[EFO_0003873, EFO_0007423]"
1997,EFO_0007763,,[],The determination of the amount of dihomo-gamm...,,,,,,[EFO_0001444],[EFO_0005680],"[EFO_0005680, EFO_0004529, EFO_0001444, EFO_00...",[],[]
1998,EFO_0007852,,[],quantification of the activity of the enzyme t...,,,,,,[EFO_0001444],[EFO_0004747],"[EFO_0004747, EFO_0001444]",[],[]


# Interactions