In [1]:
# Define Neo4j connections
from neo4j import GraphDatabase
import pandas as pd
import numpy

In [2]:
host = 'bolt://localhost:7687'
user = 'neo4j'
password = '12345678'
database = 'dbms2'


In [3]:
driver = GraphDatabase.driver(host,auth=(user, password), database=database)
                                         

def run_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [4]:
data = run_query("""
MATCH (s)-[r]->(t)
RETURN toString(id(s)) as source, toString(id(t)) AS target, type(r) as type
""")

In [5]:
data.head()

Unnamed: 0,source,target,type
0,0,12590,interacts
1,0,8752,interacts
2,0,7915,interacts
3,0,21711,interacts
4,0,6447,interacts


In [6]:
from pykeen.triples import TriplesFactory


tf = TriplesFactory.from_labeled_triples(
  data[["source", "type", "target"]].values,
  create_inverse_triples=False,
  entity_to_id=None,
  relation_to_id=None,
  compact_id=False,
  filter_out_candidate_inverse_relations=True,
  metadata=None,
)

In [7]:
training, testing, validation = tf.split([.8, .1, .1])


using automatically assigned random_state=4156377184


# Running ComplEx,TransE and ConvE models

In [200]:
from pykeen.pipeline import pipeline

result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='ComplEx',
    stopper='early',
    epochs=3,
    dimensions=512,
    random_seed=420

)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: C:\Users\shubh\.data\pykeen\checkpoints\best-model-weights-a46415ec-4c39-4630-8f2d-6a8645046df1.pt


Training epochs on cpu:   0%|          | 0/3 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1759 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1759 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1759 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.


Evaluating on cpu:   0%|          | 0.00/56.3k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 847.32s seconds


In [9]:
result2 = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='TransE',
    stopper='early',
    epochs=3,
    dimensions=512,
    random_seed=420

)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: C:\Users\shubh\.data\pykeen\checkpoints\best-model-weights-b07b9ea3-2fad-4879-87f1-de8a1c4d830a.pt


Training epochs on cpu:   0%|          | 0/3 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1759 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1759 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1759 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.


Evaluating on cpu:   0%|          | 0.00/56.3k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 595.62s seconds


In [10]:
result3 = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='ConvE',
    stopper='early',
    epochs=3,
    dimensions=512,
    random_seed=420

)

INFO:pykeen.pipeline.api:Using device: None
The ConvE model should be trained with inverse triples.
This can be done by defining the TriplesFactory class with the _create_inverse_triples_ parameter set to true.
INFO:pykeen.nn.modules:Resolving None * None * None = 200.
INFO:pykeen.nn.modules:Resolved to 1 * 20 * 10 = 200.
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: C:\Users\shubh\.data\pykeen\checkpoints\best-model-weights-f6947946-1abf-4e66-9b43-e7984e2592ae.pt


Training epochs on cpu:   0%|          | 0/3 [00:00<?, ?epoch/s]

INFO:pykeen.training.training_loop:Dropping last (incomplete) batch each epoch (1/1758 (0.06%) batches).


Training batches on cpu:   0%|          | 0/1758 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1758 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1758 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.


Evaluating on cpu:   0%|          | 0.00/56.3k [00:00<?, ?triple/s]

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 26449182720 bytes.

In [11]:
result.save_to_directory('result_complex')
result2.save_to_directory('result_transe')


INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172) to file:///C:/Users/shubh/Downloads/result_complex/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///C:/Users/shubh/Downloads/result_complex
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172) to file:///C:/Users/shubh/Downloads/result_transe/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///C:/Users/shubh/Downloads/result_transe


In [16]:
from pykeen.predict import predict_target

# USECASE 1 : Upregulated genes prediction for Breast Cancer

In [272]:
disease_id = run_query("""
MATCH (s:Disease)
WHERE s.name = "breast cancer"
RETURN toString(id(s)) as id
""")['id'][0]



In [273]:
disease_id

'3099'

### Analysis using ComplEx model

In [274]:
pred = predict_target(
    model=result.model,
    head=disease_id,
    relation="upregulates",
    triples_factory=result.training,
)

In [275]:
pred

TargetPredictions(df=       tail_id      score tail_label
2758      2758  59.933502      12834
18403    18403  59.775108       8448
10648    10648  56.862106      20901
14218    14218  55.041542       4178
8279      8279  54.862183       1849
...        ...        ...        ...
10300    10300 -58.386963      20547
2636      2636 -58.652962      12707
4389      4389 -59.824844        145
3433      3433 -61.008022      13535
5323      5323 -61.487949      15467

[19930 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='tail', other_columns_fixed_ids=(13167, 9))

In [276]:
pred_filtered = pred.filter_triples(training)
pred_filtered

TargetPredictions(df=       tail_id      score tail_label
2758      2758  59.933502      12834
18403    18403  59.775108       8448
10648    10648  56.862106      20901
14218    14218  55.041542       4178
8279      8279  54.862183       1849
...        ...        ...        ...
10300    10300 -58.386963      20547
2636      2636 -58.652962      12707
4389      4389 -59.824844        145
3433      3433 -61.008022      13535
5323      5323 -61.487949      15467

[19724 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='tail', other_columns_fixed_ids=(13167, 9))

In [277]:
pred_annotated = pred_filtered.add_membership_columns(training=training, validation=validation, testing=testing)
pred_annotated

TargetPredictions(df=       tail_id      score tail_label  in_training  in_validation  in_testing
2758      2758  59.933502      12834        False          False       False
18403    18403  59.775108       8448        False          False       False
10648    10648  56.862106      20901        False          False       False
14218    14218  55.041542       4178        False          False       False
8279      8279  54.862183       1849        False          False       False
...        ...        ...        ...          ...            ...         ...
10300    10300 -58.386963      20547        False          False       False
2636      2636 -58.652962      12707        False          False       False
4389      4389 -59.824844        145        False          False       False
3433      3433 -61.008022      13535        False          False       False
5323      5323 -61.487949      15467        False          False       False

[19724 rows x 6 columns], factory=TriplesFactory(num_e

In [279]:
df = pred_annotated.df
df

Unnamed: 0,tail_id,score,tail_label,in_training,in_validation,in_testing
2758,2758,59.933502,12834,False,False,False
18403,18403,59.775108,8448,False,False,False
10648,10648,56.862106,20901,False,False,False
14218,14218,55.041542,4178,False,False,False
8279,8279,54.862183,1849,False,False,False
...,...,...,...,...,...,...
10300,10300,-58.386963,20547,False,False,False
2636,2636,-58.652962,12707,False,False,False
4389,4389,-59.824844,145,False,False,False
3433,3433,-61.008022,13535,False,False,False


In [280]:
candidate_nodes = df[df['in_training'] == False].head(50)['tail_label'].to_list()
print(candidate_nodes)

run_query("""
MATCH (n)
WHERE id(n) = toInteger($disease_id)
UNWIND $candidates as ca
MATCH (c)
WHERE id(c) = toInteger(ca)
MERGE (n)-[:upregulates_C1]->(c)
""", {'disease_id':disease_id, 'candidates': candidate_nodes})


['12834', '8448', '20901', '4178', '1849', '20929', '3750', '14295', '6213', '5033', '10884', '16716', '7467', '22414', '1553', '13515', '13850', '47', '19972', '13176', '19846', '6672', '14914', '6098', '16141', '10331', '17094', '16839', '5353', '7931', '8264', '655', '10953', '17914', '10828', '7491', '16488', '10506', '11053', '1241', '11333', '13751', '19393', '8340', '15008', '4252', '19739', '7688', '22252', '12429']


In [281]:
run_query("""
MATCH (c:Disease)-[:upregulates_C1]->(d:Gene)
RETURN c.name as Disease, d.name as Gene
""")

Unnamed: 0,Disease,Gene
0,breast cancer,ZBTB22
1,breast cancer,COL11A2
2,breast cancer,TBX19
3,breast cancer,GRPEL2
4,breast cancer,OR51E1
5,breast cancer,SEC23A
6,breast cancer,PRG3
7,breast cancer,SMOX
8,breast cancer,IGHMBP2
9,breast cancer,PPM1A


In [282]:
run_query("""
MATCH (c:Disease {name: "breast cancer"}),(d:Gene {name:"ZBTB22"})
WITH c,d
MATCH p=AllShortestPaths((c)-[r:binds|regulates|interacts|upregulates|downregulates|associates*1..4]-(d))
RETURN [n in nodes(p) | n.name] LIMIT 25
""")

Unnamed: 0,[n in nodes(p) | n.name]
0,"[breast cancer, PIN1, ZBTB22]"


In [288]:
run_query("""
MATCH (c:Disease {name: "breast cancer"}),(d:Gene {name:"COL11A2"})
WITH c,d
MATCH p=AllShortestPaths((c)-[r:binds|regulates|interacts|upregulates|downregulates|associates*1..4]-(d))
RETURN [n in nodes(p) | n.name] LIMIT 25
""")

Unnamed: 0,[n in nodes(p) | n.name]
0,"[breast cancer, HSP90AA1, DDR2, COL11A2]"
1,"[breast cancer, SRC, DDR2, COL11A2]"


In [289]:
run_query("""
MATCH (c:Disease {name: "breast cancer"}),(d:Gene {name:"TBX19"})
WITH c,d
MATCH p=AllShortestPaths((c)-[r:binds|regulates|interacts|upregulates|downregulates|associates*1..4]-(d))
RETURN [n in nodes(p) | n.name] LIMIT 25
""")

Unnamed: 0,[n in nodes(p) | n.name]
0,"[breast cancer, AR, NR5A1, TBX19]"
1,"[breast cancer, BARD1, NR5A1, TBX19]"
2,"[breast cancer, NCOA1, NR5A1, TBX19]"
3,"[breast cancer, NRIP1, NR5A1, TBX19]"
4,"[breast cancer, TRERF1, NR5A1, TBX19]"
5,"[breast cancer, JUN, NR5A1, TBX19]"
6,"[breast cancer, CTNNB1, NR5A1, TBX19]"


### Analysis using TransE model

In [290]:
pred2 = predict_target(
    model=result2.model,
    head=disease_id,
    relation="upregulates",
    triples_factory=result2.training,
)

In [291]:
pred2

TargetPredictions(df=       tail_id      score tail_label
4236      4236  -7.233153      14346
8449      8449  -7.405909      18659
11541    11541  -7.448671      21807
10104    10104  -7.582721      20354
17244    17244  -7.628639       7273
...        ...        ...        ...
14747    14747 -14.162211       4719
6959      6959 -14.169616      17133
12908    12908 -14.182336        283
12123    12123 -14.255861      22395
19079    19079 -14.690176        913

[19930 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='tail', other_columns_fixed_ids=(13167, 9))

In [292]:
pred_filtered2 = pred2.filter_triples(training)
pred_filtered2

TargetPredictions(df=       tail_id      score tail_label
4236      4236  -7.233153      14346
8449      8449  -7.405909      18659
11541    11541  -7.448671      21807
10104    10104  -7.582721      20354
17244    17244  -7.628639       7273
...        ...        ...        ...
14747    14747 -14.162211       4719
6959      6959 -14.169616      17133
12908    12908 -14.182336        283
12123    12123 -14.255861      22395
19079    19079 -14.690176        913

[19724 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='tail', other_columns_fixed_ids=(13167, 9))

In [293]:
pred_annotated2 = pred_filtered2.add_membership_columns(training=training, validation=validation, testing=testing)
pred_annotated2

TargetPredictions(df=       tail_id      score tail_label  in_training  in_validation  in_testing
4236      4236  -7.233153      14346        False          False       False
8449      8449  -7.405909      18659        False          False       False
11541    11541  -7.448671      21807        False          False       False
10104    10104  -7.582721      20354        False          False       False
17244    17244  -7.628639       7273        False          False       False
...        ...        ...        ...          ...            ...         ...
14747    14747 -14.162211       4719        False          False       False
6959      6959 -14.169616      17133        False          False       False
12908    12908 -14.182336        283        False          False       False
12123    12123 -14.255861      22395        False          False       False
19079    19079 -14.690176        913        False          False       False

[19724 rows x 6 columns], factory=TriplesFactory(num_e

In [294]:
df2 = pred_annotated2.df

candidate_nodes2 = df2[df2['in_training'] == False].head(50)['tail_label'].to_list()

run_query("""
MATCH (n)
WHERE id(n) = toInteger($disease_id)
UNWIND $candidates as ca
MATCH (c)
WHERE id(c) = toInteger(ca)
MERGE (n)-[:upregulates_T1]->(c)
""", {'disease_id':disease_id, 'candidates': candidate_nodes2})


In [295]:
run_query("""
MATCH (c:Disease)-[:upregulates_T1]->(d:Gene)
RETURN c.name as Disease, d.name as Gene
""")

Unnamed: 0,Disease,Gene
0,breast cancer,TSC22D3
1,breast cancer,LBR
2,breast cancer,SLC35B1
3,breast cancer,STUB1
4,breast cancer,DYNLT3
5,breast cancer,HACD3
6,breast cancer,ANKRA2
7,breast cancer,PPP4C
8,breast cancer,TRIB3
9,breast cancer,CLPX


In [297]:
run_query("""
MATCH (c:Disease {name: "breast cancer"}),(d:Gene {name:"STUB1"})
WITH c,d
MATCH p=AllShortestPaths((c)-[r:binds|regulates|interacts|upregulates|downregulates|associates*1..4]-(d))
RETURN [n in nodes(p) | n.name] LIMIT 25
""")

Unnamed: 0,[n in nodes(p) | n.name]
0,"[breast cancer, GPI, STUB1]"
1,"[breast cancer, HSPA1B, STUB1]"
2,"[breast cancer, KCNN4, STUB1]"
3,"[breast cancer, TFRC, STUB1]"
4,"[breast cancer, HIST1H2BK, STUB1]"
5,"[breast cancer, FOXQ1, STUB1]"
6,"[breast cancer, KRT8, STUB1]"
7,"[breast cancer, AHR, STUB1]"
8,"[breast cancer, SPRY1, STUB1]"
9,"[breast cancer, RARB, STUB1]"


In [298]:
run_query("""
MATCH (c:Disease {name: "breast cancer"}),(d:Gene {name:"DYNLT3"})
WITH c,d
MATCH p=AllShortestPaths((c)-[r:binds|regulates|interacts|upregulates|downregulates|associates*1..4]-(d))
RETURN [n in nodes(p) | n.name] LIMIT 25
""")

Unnamed: 0,[n in nodes(p) | n.name]
0,"[breast cancer, KIF2C, DYNLT3]"
1,"[breast cancer, CCNB2, DYNLT3]"
2,"[breast cancer, GLI2, DYNLT3]"
3,"[breast cancer, RAD54L, DYNLT3]"
4,"[breast cancer, CCT5, DYNLT3]"
5,"[breast cancer, PCNA, DYNLT3]"
6,"[breast cancer, CDKN1B, DYNLT3]"
7,"[breast cancer, LDHB, DYNLT3]"
8,"[breast cancer, AURKA, DYNLT3]"
9,"[breast cancer, CDC20, DYNLT3]"


# USECASE 2 : Finding compound for one of the genes found in usecase 1

In [308]:
gene_id = run_query("""
MATCH (s:Gene)
WHERE s.name = "SMC4"
RETURN toString(id(s)) as id
""")['id'][0]

gene_id

'345'

### Analysis using Complex model

In [309]:
pred_d = predict_target(
    model=result.model,
    tail=gene_id,
    relation="downregulates",
    triples_factory=result.training,
)


In [310]:
pred_d

TargetPredictions(df=       head_id      score head_label
15772    15772  58.163986        577
17108    17108  54.876755       7141
17098    17098  54.188519        713
7173      7173  50.922840      17348
17709    17709  50.907528        775
...        ...        ...        ...
12028    12028 -50.278778      22303
17948    17948 -51.419437        799
4936      4936 -53.656712      15067
19476    19476 -53.952469       9538
9131      9131 -58.113907      19362

[19930 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='head', other_columns_fixed_ids=(4, 13507))

In [311]:
pred_filtered_d = pred_d.filter_triples(training)
pred_filtered_d

TargetPredictions(df=       head_id      score head_label
15772    15772  58.163986        577
17108    17108  54.876755       7141
17098    17098  54.188519        713
7173      7173  50.922840      17348
17709    17709  50.907528        775
...        ...        ...        ...
12028    12028 -50.278778      22303
17948    17948 -51.419437        799
4936      4936 -53.656712      15067
19476    19476 -53.952469       9538
9131      9131 -58.113907      19362

[19903 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='head', other_columns_fixed_ids=(4, 13507))

In [312]:
pred_annotated_d = pred_filtered_d.add_membership_columns(training=training, validation=validation, testing=testing)
pred_annotated_d

TargetPredictions(df=       head_id      score head_label  in_training  in_validation  in_testing
15772    15772  58.163986        577        False          False       False
17108    17108  54.876755       7141        False          False       False
17098    17098  54.188519        713        False          False       False
7173      7173  50.922840      17348        False          False       False
17709    17709  50.907528        775        False          False       False
...        ...        ...        ...          ...            ...         ...
12028    12028 -50.278778      22303        False          False       False
17948    17948 -51.419437        799        False          False       False
4936      4936 -53.656712      15067        False          False       False
19476    19476 -53.952469       9538        False          False       False
9131      9131 -58.113907      19362        False          False       False

[19903 rows x 6 columns], factory=TriplesFactory(num_e

In [313]:
df_d = pred_annotated_d.df

candidate_nodes_d = df_d[df_d['in_training'] == False].head(50)['head_label'].to_list()

candidate_nodes_d


['577',
 '7141',
 '713',
 '17348',
 '775',
 '11449',
 '12071',
 '17355',
 '5193',
 '9257',
 '11379',
 '21727',
 '10982',
 '16427',
 '917',
 '15790',
 '8973',
 '22400',
 '18623',
 '1888',
 '18026',
 '16346',
 '13416',
 '1754',
 '18365',
 '10434',
 '6207',
 '8783',
 '13802',
 '21577',
 '898',
 '2921',
 '16026',
 '888',
 '2907',
 '2977',
 '4478',
 '10619',
 '17246',
 '22502',
 '12141',
 '9284',
 '8336',
 '6889',
 '16525',
 '10784',
 '21113',
 '20699',
 '16516',
 '2461']

In [314]:
run_query("""
MATCH (n)
WHERE id(n) = toInteger($gene_id)
UNWIND $candidates as ca
MATCH (c)
WHERE id(c) = toInteger(ca) and "Compound" in labels(c)
MERGE (c)-[:downreg_c_usecase2]->(n)
""", {'gene_id':gene_id, 'candidates': candidate_nodes_d})

In [315]:
run_query("""
MATCH (c:Compound)-[:downreg_c_usecase2]->(d:Gene)
RETURN c.name as Compound, d.name as Gene
""")

Unnamed: 0,Compound,Gene
0,Cyclothiazide,SMC4
1,Prednisone,SMC4
2,Thiotepa,SMC4


### Analysis using TransE model

In [316]:
pred_d = predict_target(
    model=result2.model,
    tail=gene_id,
    relation="downregulates",
    triples_factory=result2.training,
)
pred_d

TargetPredictions(df=       head_id      score head_label
4040      4040  -6.987144      14149
7709      7709  -7.176988      17897
14711    14711  -7.318759       4685
285        285  -7.319709      10290
2539      2539  -7.341794      12608
...        ...        ...        ...
12490    12490 -14.326984       2395
12025    12025 -14.328463       2230
13991    13991 -14.413631       3949
16919    16919 -14.427435       6947
17496    17496 -14.738400       7532

[19930 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='head', other_columns_fixed_ids=(4, 13507))

In [317]:
pred_filtered_d = pred_d.filter_triples(training)
pred_filtered_d

TargetPredictions(df=       head_id      score head_label
4040      4040  -6.987144      14149
7709      7709  -7.176988      17897
2539      2539  -7.341794      12608
754        754  -7.447429      10779
11483    11483  -7.452220      21750
...        ...        ...        ...
12490    12490 -14.326984       2395
12025    12025 -14.328463       2230
13991    13991 -14.413631       3949
16919    16919 -14.427435       6947
17496    17496 -14.738400       7532

[19903 rows x 3 columns], factory=TriplesFactory(num_entities=19930, num_relations=10, create_inverse_triples=False, num_triples=450172), target='head', other_columns_fixed_ids=(4, 13507))

In [318]:
pred_annotated_d = pred_filtered_d.add_membership_columns(training=training, validation=validation, testing=testing)
pred_annotated_d

TargetPredictions(df=       head_id      score head_label  in_training  in_validation  in_testing
4040      4040  -6.987144      14149        False          False       False
7709      7709  -7.176988      17897        False          False       False
2539      2539  -7.341794      12608        False          False        True
754        754  -7.447429      10779        False          False       False
11483    11483  -7.452220      21750        False          False       False
...        ...        ...        ...          ...            ...         ...
12490    12490 -14.326984       2395        False          False       False
12025    12025 -14.328463       2230        False          False       False
13991    13991 -14.413631       3949        False          False       False
16919    16919 -14.427435       6947        False          False       False
17496    17496 -14.738400       7532        False          False       False

[19903 rows x 6 columns], factory=TriplesFactory(num_e

In [321]:
df_d = pred_annotated_d.df

candidate_nodes_d = df_d[df_d['in_training'] == False].head(50)['head_label'].to_list()

candidate_nodes_d


['14149',
 '17897',
 '12608',
 '10779',
 '21750',
 '18131',
 '9349',
 '17133',
 '13726',
 '16565',
 '11388',
 '1278',
 '5972',
 '913',
 '22096',
 '3099',
 '6832',
 '19130',
 '11213',
 '283',
 '17046',
 '13667',
 '19473',
 '1796',
 '20343',
 '9052',
 '15522',
 '11912',
 '14874',
 '10838',
 '322',
 '10808',
 '7714',
 '15833',
 '5070',
 '4407',
 '13047',
 '18634',
 '9825',
 '19270',
 '6113',
 '6216',
 '1303',
 '22395',
 '4360',
 '1595',
 '10144',
 '4437',
 '9836',
 '20460']

In [322]:
run_query("""
MATCH (n)
WHERE id(n) = toInteger($gene_id)
UNWIND $candidates as ca
MATCH (c)
WHERE id(c) = toInteger(ca) and "Compound" in labels(c)
MERGE (c)-[:downreg_t_usecase2]->(n)
""", {'gene_id':gene_id, 'candidates': candidate_nodes_d})

In [323]:
run_query("""
MATCH (c:Compound)-[:downreg_t_usecase2]->(d:Gene)
RETURN c.name as Compound, d.name as Gene
""")

Unnamed: 0,Compound,Gene
0,Vemurafenib,SMC4
1,Pitavastatin,SMC4
2,Azacitidine,SMC4
3,Digitoxin,SMC4
4,Bisacodyl,SMC4
5,Clofarabine,SMC4
6,Digoxin,SMC4
7,Thioridazine,SMC4
8,Treprostinil,SMC4
9,Sirolimus,SMC4


In [325]:
run_query("""
MATCH (c:Gene {name: "SMC4"}),(d:Compound {name:"Vemurafenib"})
WITH c,d
MATCH p=AllShortestPaths((c)-[r:binds|regulates|interacts|upregulates|downregulates|associates*1..4]-(d))
RETURN [n in nodes(p) | n.name] LIMIT 25
""")

Unnamed: 0,[n in nodes(p) | n.name]
0,"[SMC4, SLC2A6, Vemurafenib]"
1,"[SMC4, INPP4B, Vemurafenib]"
2,"[SMC4, CREG1, Vemurafenib]"
3,"[SMC4, UGDH, Vemurafenib]"
4,"[SMC4, NBR1, Vemurafenib]"
5,"[SMC4, CBS, Vemurafenib]"
6,"[SMC4, PAN2, Vemurafenib]"
7,"[SMC4, HSPA8, Vemurafenib]"
8,"[SMC4, TUBB3, Vemurafenib]"
9,"[SMC4, KAT6B, Vemurafenib]"
