# Using the Prediction Model

## Environment

In [1]:
import getpass
import json
import os
import sys
import time

import pandas as pd
from tqdm import tqdm_notebook as tqdm

from seffnet.constants import (
    DEFAULT_EMBEDDINGS_PATH, DEFAULT_GRAPH_PATH,
    DEFAULT_MAPPING_PATH, DEFAULT_PREDICTIVE_MODEL_PATH,
    RESOURCES
)
from seffnet.literature import query_europe_pmc

In [2]:
print(sys.version)

3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]


In [3]:
print(time.asctime())

Fri Jan  3 12:12:03 2020


In [4]:
print(getpass.getuser())

aldis


# Loading the Data

In [5]:
from seffnet.default_predictor import predictor

In [6]:
print(f"""Loaded default predictor using paths:

embeddings: {DEFAULT_EMBEDDINGS_PATH}
graph:      {DEFAULT_GRAPH_PATH}
model:      {DEFAULT_PREDICTIVE_MODEL_PATH}
mapping:    {DEFAULT_MAPPING_PATH}
""")

Loaded default predictor using paths:

embeddings: c:\users\aldis\documents\github\seffnet\resources\embeddings\0812_weighted_node2vec_emb.embeddings
graph:      c:\users\aldis\documents\github\seffnet\resources\basic_graphs\fullgraph_with_chemsim.edgelist
model:      c:\users\aldis\documents\github\seffnet\resources\predictive_models\0812_weighted_node2vec_predictive_model.pkl
mapping:    c:\users\aldis\documents\github\seffnet\resources\mapping\fullgraph_nodes_mapping.tsv



# Examples of different kinds of predictions with literature evidence

## side effect - target association

In [7]:
r = predictor.find_new_relation(
    source_name='EGFR_HUMAN',
    target_name='Papulopustular rash',
)
print(json.dumps(r, indent=2))
#PMID: 18165622

{
  "source": {
    "node_id": "9587",
    "namespace": "uniprot",
    "identifier": "P00533",
    "name": "EGFR_HUMAN",
    "entity_type": "target"
  },
  "target": {
    "node_id": "6791",
    "namespace": "umls",
    "identifier": "C2609319",
    "name": "Papulopustular rash",
    "entity_type": "phenotype"
  },
  "lor": 0.62433
}


In [8]:
r = predictor.find_new_relation(
    source_id='9451', # Histamine receptor H1
    target_id='331', # Drowsiness
)
print(json.dumps(r, indent=2))
#PMID: 26626077

{
  "source": {
    "node_id": "9451",
    "namespace": "uniprot",
    "identifier": "P35367",
    "name": "HRH1_HUMAN",
    "entity_type": "target"
  },
  "target": {
    "node_id": "331",
    "namespace": "umls",
    "identifier": "C0013144",
    "name": "Drowsiness",
    "entity_type": "phenotype"
  },
  "lor": 0.08185
}


In [9]:
r = predictor.find_new_relation(
    source_id='9325', # SC6A2
    target_id='56', # Tachycardia
)
print(json.dumps(r, indent=2))
#PMID: 30952858

{
  "source": {
    "node_id": "9325",
    "namespace": "uniprot",
    "identifier": "P23975",
    "name": "SC6A2_HUMAN",
    "entity_type": "target"
  },
  "target": {
    "node_id": "56",
    "namespace": "umls",
    "identifier": "C0039231",
    "name": "Tachycardia",
    "entity_type": "phenotype"
  },
  "lor": 0.52279
}


In [10]:
r = predictor.find_new_relation(
    source_id='8670', # ACES_HUMAN
    target_id='309', # Bradycardia
)
print(json.dumps(r, indent=2))
#PMID: 30952858

{
  "source": {
    "node_id": "8670",
    "namespace": "uniprot",
    "identifier": "P22303",
    "name": "ACES_HUMAN",
    "entity_type": "target"
  },
  "target": {
    "node_id": "309",
    "namespace": "umls",
    "identifier": "C0428977",
    "name": "Bradycardia",
    "entity_type": "phenotype"
  },
  "lor": 0.85649
}


## drug- side effect association

In [11]:
r = predictor.find_new_relation(
    source_id='3534',  # diazepam
    target_id='670',  # Libido decreased
)
print(json.dumps(r, indent=2))
#PMID: 29888057

{
  "source": {
    "node_id": "3534",
    "namespace": "pubchem.compound",
    "identifier": "3016",
    "name": "Diazepam",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "670",
    "namespace": "umls",
    "identifier": "C0011124",
    "name": "Libido decreased",
    "entity_type": "phenotype"
  },
  "lor": 0.00453
}


In [12]:
r = predictor.find_new_relation(
    source_id='1148',  # Cytarabine 
    target_id='1149',  # Anaemia megaloblastic
)
print(json.dumps(r, indent=2))
# PMID: 23157436

{
  "source": {
    "node_id": "1148",
    "namespace": "pubchem.compound",
    "identifier": "6175",
    "name": "Cytidine",
    "entity_type": "experimental drug"
  },
  "target": {
    "node_id": "1149",
    "namespace": "umls",
    "identifier": "C0002888",
    "name": "Anaemia megaloblastic",
    "entity_type": "phenotype"
  },
  "lor": 0.22571
}


## drug-target association

In [13]:
r = predictor.find_new_relation(
    source_id='14672',  # Sertindole 
    target_id='9350',   # CHRM1 receptor
)
print(json.dumps(r, indent=2))
# PMID: 29942259 

{
  "source": {
    "node_id": "14672",
    "namespace": "pubchem.compound",
    "identifier": "60149",
    "name": "Sertindole",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "9350",
    "namespace": "uniprot",
    "identifier": "P11229",
    "name": "ACM1_HUMAN",
    "entity_type": "target"
  },
  "lor": 0.01543
}


# Example of predicting relations using node2vec model and embeddings

In [7]:
def get_predictions_df(curie, results_type=None):
    results = predictor.find_new_relations(
        node_curie=curie,
        results_type=results_type,
        k=50,
    )
    results_df = pd.DataFrame(results['predictions'])
    results_df = results_df[['node_id', 'namespace', 'identifier', 'name', 'lor', 'novel']]
    return results['query'], results_df

In [8]:
query, df = get_predictions_df('pubchem.compound:2159', 'phenotype')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "2173",
    "namespace": "pubchem.compound",
    "identifier": "2159",
    "name": "Amisulpride",
    "entity_type": "approved drug"
  },
  "k": 50,
  "type": "phenotype"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,816,umls,C0232461,Increased appetite,6e-05,True
1,655,umls,C0006145,Breast disorder,9e-05,True
2,678,umls,C0012569,Diplopia,0.00019,True
3,1385,umls,C0015732,Faecal incontinence,0.00023,True
4,797,umls,C0338831,Mania,0.00028,True
5,351,umls,C0085633,Mood swings,0.00032,True
6,539,umls,C0042024,Urinary incontinence,0.00036,True
7,652,umls,C0853193,Bipolar I disorder,0.00036,True
8,1732,umls,C0037822,Speech disorder,0.00038,True
9,2876,umls,C0041747,Unintended pregnancy,0.00043,True


In [9]:
query, df = get_predictions_df('pubchem.compound:4585', 'phenotype')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "4915",
    "namespace": "pubchem.compound",
    "identifier": "4585",
    "name": "Olanzapine",
    "entity_type": "approved drug"
  },
  "k": 50,
  "type": "phenotype"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,238,umls,C0344232,Vision blurred,0.00015,True
1,1391,umls,C0019270,Hernia,0.00017,True
2,1892,umls,C0041105,Trismus,0.00024,True
3,1156,umls,C0018991,Hemiplegia,0.00024,True
4,774,umls,C0043387,Yawning,0.00028,True
5,1822,umls,C0525045,Affective disorder,0.00032,True
6,2460,umls,C0162316,Iron deficiency anaemia,0.00038,True
7,538,umls,C0040435,Tooth disorder,0.00041,True
8,1029,umls,C0234238,Ache,0.00042,True
9,1657,umls,C0151754,Lactic dehydrogenase activity increased,0.00042,True


In [10]:
query, df = get_predictions_df('uniprot:P08172', 'phenotype')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "9429",
    "namespace": "uniprot",
    "identifier": "P08172",
    "name": "ACM2_HUMAN",
    "entity_type": "target"
  },
  "k": 50,
  "type": "phenotype"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,302,umls,C1269683,Major depression,0.00162,True
1,2146,umls,C0234133,Extrapyramidal symptoms,0.00218,True
2,1141,umls,C3257803,Tearing eyes,0.00258,True
3,238,umls,C0344232,Vision blurred,0.00372,True
4,1062,umls,C0497327,Dementia,0.00374,True
5,2879,umls,C0026650,Movement disorder,0.00438,True
6,1836,umls,C0013415,Dysthymic disorder,0.00472,True
7,352,umls,C0233472,Affect lability,0.0064,True
8,817,umls,C0233414,Disturbance in attention,0.00675,True
9,816,umls,C0232461,Increased appetite,0.00678,True


In [11]:
query, df = get_predictions_df('uniprot:P08588', 'phenotype')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "8733",
    "namespace": "uniprot",
    "identifier": "P08588",
    "name": "ADRB1_HUMAN",
    "entity_type": "target"
  },
  "k": 50,
  "type": "phenotype"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,4348,umls,C0232201,Sinus rhythm,0.00886,True
1,1952,umls,C0232197,Cardiac fibrillation,0.01189,True
2,93,umls,C0018799,Cardiac disorder,0.01216,True
3,206,umls,C0018790,Cardiac arrest,0.01395,True
4,3494,umls,C0028840,Ocular hypertension,0.01452,True
5,266,umls,C0018801,Cardiac failure,0.01567,True
6,354,umls,C0151636,Ventricular extrasystoles,0.01721,True
7,1714,umls,C0004239,Atrial flutter,0.0201,True
8,1325,umls,C0857121,Hypertensive,0.02139,True
9,238,umls,C0344232,Vision blurred,0.02171,True


In [12]:
query, df = get_predictions_df('uniprot:P22303', 'phenotype')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "8670",
    "namespace": "uniprot",
    "identifier": "P22303",
    "name": "ACES_HUMAN",
    "entity_type": "target"
  },
  "k": 50,
  "type": "phenotype"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,3284,umls,C0151500,Anticholinergic syndrome,0.12628,True
1,294,umls,C0232487,Abdominal discomfort,0.14619,True
2,539,umls,C0042024,Urinary incontinence,0.18802,True
3,822,umls,C1306341,Mental disability,0.19613,True
4,894,umls,C0547030,Visual disturbance,0.20704,True
5,732,umls,C0027796,Neuralgia,0.21594,True
6,248,umls,C0236075,Menopausal symptoms,0.24063,True
7,987,umls,C0025323,Menorrhagia,0.25108,True
8,42,umls,C0037763,Muscle spasms,0.25592,True
9,2896,umls,C0085635,Photopsia,0.25932,True


In [13]:
query, df = get_predictions_df('uniprot:Q9UBN7', 'chemical')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "12164",
    "namespace": "uniprot",
    "identifier": "Q9UBN7",
    "name": "HDAC6_HUMAN",
    "entity_type": "target"
  },
  "k": 50,
  "type": "chemical"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,5829,pubchem.compound,5978,Vincristine,0.0204,True
1,10931,pubchem.compound,440210,Bis(Adenosine)-5'-Pentaphosphate,0.03094,True
2,1465,pubchem.compound,1054,Pyridoxine,0.03736,True
3,17357,pubchem.compound,9865515,Mocetinostat,0.03816,True
4,6856,pubchem.compound,216326,Lenalidomide,0.03886,True
5,589,pubchem.compound,221493,Cholic Acid,0.04168,True
6,13540,pubchem.compound,656948,Phosphorylated Dihydropteroate,0.05937,True
7,1616,pubchem.compound,1690,AC1L1C0O,0.06055,True
8,6213,pubchem.compound,60668,ETOPOSIDE PHOSPHATE,0.06134,True
9,12447,pubchem.compound,445555,"2,5-Anhydroglucitol-1,6-Biphosphate",0.06145,True


In [8]:
query, df = get_predictions_df("umls:C0030567", 'chemical')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "2248",
    "namespace": "umls",
    "identifier": "C0030567",
    "name": "Parkinson's disease",
    "entity_type": "phenotype"
  },
  "k": 50,
  "type": "chemical"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,5590,pubchem.compound,60648,Tiagabine,0.00087,True
1,3674,pubchem.compound,1150567,"1-BENZYL-4-[(5,6-DIMETHOXY-1-INDANON-2-YL)METH...",0.00104,True
2,6158,pubchem.compound,55480,AC1L1J67,0.00181,True
3,6417,pubchem.compound,175805,Ropivacaine,0.00183,True
4,4946,pubchem.compound,3034010,Orlistat,0.00204,True
5,3547,pubchem.compound,3032,"{2-[(2,6-dichlorophenyl)amino]phenyl}acetate",0.00229,True
6,7107,pubchem.compound,9838022,Tapentadol,0.00232,True
7,6656,pubchem.compound,129228,Rufinamide,0.00294,True
8,5931,pubchem.compound,16362,Pimozide,0.00313,True
9,6316,pubchem.compound,60853,138982-67-9,0.00358,True


In [9]:
results = []
for ind, row in df.iterrows():
    pmcid = []
    lit = query_europe_pmc(
        query_entity=row['name'],
        target_entities=[
            'umls:C0030567'
        ],
    )
    i = 0
    for x in lit:
        if i > 7:
            pmcid.append('... ect')
            lit.close()
            break
        pmcid.append(x['pmcid'])
        i+=1
    results.append((len(pmcid), pmcid))
df['co-occurance'] = results

In [10]:
df

Unnamed: 0,node_id,namespace,identifier,name,lor,novel,co-occurance
0,5590,pubchem.compound,60648,Tiagabine,0.00087,True,"(9, [PMC5318686, PMC6141625, PMC4931873, PMC34..."
1,3674,pubchem.compound,1150567,"1-BENZYL-4-[(5,6-DIMETHOXY-1-INDANON-2-YL)METH...",0.00104,True,"(0, [])"
2,6158,pubchem.compound,55480,AC1L1J67,0.00181,True,"(0, [])"
3,6417,pubchem.compound,175805,Ropivacaine,0.00183,True,"(9, [PMC2386465, PMC3172332, PMC3168353, PMC64..."
4,4946,pubchem.compound,3034010,Orlistat,0.00204,True,"(3, [PMC3283822, PMC3006492, PMC5274590])"
5,3547,pubchem.compound,3032,"{2-[(2,6-dichlorophenyl)amino]phenyl}acetate",0.00229,True,"(0, [])"
6,7107,pubchem.compound,9838022,Tapentadol,0.00232,True,"(1, [PMC6145352])"
7,6656,pubchem.compound,129228,Rufinamide,0.00294,True,"(0, [])"
8,5931,pubchem.compound,16362,Pimozide,0.00313,True,"(9, [PMC2772366, PMC2483942, PMC2947884, PMC65..."
9,6316,pubchem.compound,60853,138982-67-9,0.00358,True,"(0, [])"


In [13]:
df.to_csv(os.path.join(RESOURCES, 'parkinsons-chemicals.tsv'), sep='\t')

In [8]:
query, df = get_predictions_df('umls:C0242422', 'chemical')
print(json.dumps(query, indent=2))
df

{
  "entity": {
    "node_id": "852",
    "namespace": "umls",
    "identifier": "C0242422",
    "name": "Parkinsonism",
    "entity_type": "phenotype"
  },
  "k": 50,
  "type": "chemical"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,6105,pubchem.compound,5311128,Goserelin,0.00073,True
1,5617,pubchem.compound,443879,Tolterodine,0.00105,True
2,2573,pubchem.compound,644073,Buprenorphine,0.00141,True
3,6474,pubchem.compound,82146,Bexarotene,0.00185,True
4,2199,pubchem.compound,2170,Amoxapine,0.00188,True
5,3697,pubchem.compound,667468,Cidoxepin,0.00215,True
6,3547,pubchem.compound,3032,"{2-[(2,6-dichlorophenyl)amino]phenyl}acetate",0.00243,True
7,4506,pubchem.compound,4011,Maprotiline,0.00253,True
8,2044,pubchem.compound,2118,Alprazolam,0.00283,True
9,5101,pubchem.compound,5910,Pilocarpine,0.00296,True


In [31]:
query, df = get_predictions_df('pubchem.compound:5095', 'phenotype')
print(json.dumps(query, indent=2))
df
#PMID: 29241812

{
  "entity": {
    "node_id": "5346",
    "namespace": "pubchem.compound",
    "identifier": "5095",
    "name": "Ropinirole",
    "entity_type": "approved drug"
  },
  "k": 30,
  "type": "phenotype"
}


Unnamed: 0,node_id,namespace,identifier,name,lor,novel
0,291,umls,C0085605,Hepatic failure,0.00023,True
1,1139,umls,C0853034,Blood creatine phosphokinase increased,0.0003,True
2,1484,umls,C0014518,Toxic epidermal necrolysis,0.00037,True
3,763,umls,C0038325,Stevens-Johnson syndrome,0.00045,True
4,536,umls,C0037317,Sleep disturbance,0.00045,True
5,273,umls,C0026858,Musculoskeletal pain,0.00051,True
6,870,umls,C0424000,Suicidal ideation,0.00052,True
7,1317,umls,C0022658,Nephropathy,0.00057,True
8,862,umls,C0392156,Akathisia,0.00066,True
9,585,umls,C0687133,Drug interaction,0.00068,True


In [32]:
r = predictor.find_new_relation(
    source_id='2071', #Amantadine
    target_id='2248', #Parkinson's disease
)
print(json.dumps(r, indent=2))
#PMID: 21654146

{
  "source": {
    "node_id": "2071",
    "namespace": "pubchem.compound",
    "identifier": "2130",
    "name": "Amantadine",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "2248",
    "namespace": "umls",
    "identifier": "C0030567",
    "name": "Parkinson's disease",
    "entity_type": "phenotype"
  },
  "lor": 0.00478
}


In [33]:
r = predictor.find_new_relation(
    source_id='5346', #Ropinirole
    target_id='1348', #Restless legs syndrome
)
print(json.dumps(r, indent=2))
#PMID: 21654146

{
  "source": {
    "node_id": "5346",
    "namespace": "pubchem.compound",
    "identifier": "5095",
    "name": "Ropinirole",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "1348",
    "namespace": "umls",
    "identifier": "C0035258",
    "name": "Restless legs syndrome",
    "entity_type": "phenotype"
  },
  "lor": 0.00667
}


In [34]:
r = predictor.find_new_relation(
    source_id='3627', #Disulfiram
    target_id='2318', #Malignant melanoma
)
print(json.dumps(r, indent=2))
#PMID: 21654146

{
  "source": {
    "node_id": "3627",
    "namespace": "pubchem.compound",
    "identifier": "3117",
    "name": "Disulfiram",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "2318",
    "namespace": "umls",
    "identifier": "C0025202",
    "name": "Malignant melanoma",
    "entity_type": "phenotype"
  },
  "lor": 0.51121
}


In [35]:
r = predictor.find_new_relation(
    source_id='17528', #Brigatinib
    target_id='5148', #Colorectal cancer
)
print(json.dumps(r, indent=2))
#PMID: 31410188

{
  "source": {
    "node_id": "17528",
    "namespace": "uniprot",
    "identifier": "Q99640",
    "name": "PMYT1_HUMAN",
    "entity_type": "target"
  },
  "target": {
    "node_id": "5148",
    "namespace": "umls",
    "identifier": "C1527249",
    "name": "Colorectal cancer",
    "entity_type": "phenotype"
  },
  "lor": 0.8214
}


In [36]:
r = predictor.find_new_relation(
    source_id='6995', #dasatinib
    target_id='1179', #Diffuse large B-cell lymphoma
)
print(json.dumps(r, indent=2))
#PMID: 31383760

{
  "source": {
    "node_id": "6995",
    "namespace": "pubchem.compound",
    "identifier": "3062316",
    "name": "Dasatinib",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "1179",
    "namespace": "umls",
    "identifier": "C0079744",
    "name": "Diffuse large B-cell lymphoma",
    "entity_type": "phenotype"
  },
  "lor": 0.83577
}


In [37]:
r = predictor.find_new_relation(
    source_id='5265', #ribavirin
    target_id='947', #Candida infection
)
print(json.dumps(r, indent=2))
#PMID: 31307986

{
  "source": {
    "node_id": "5265",
    "namespace": "pubchem.compound",
    "identifier": "37542",
    "name": "Ribavirin",
    "entity_type": "approved drug"
  },
  "target": {
    "node_id": "947",
    "namespace": "umls",
    "identifier": "C0006840",
    "name": "Candida infection",
    "entity_type": "phenotype"
  },
  "lor": 0.12888
}
