In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import os
import sys
import json
import torch
import pickle
import itertools

import networkx as nx
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt 
from tqdm import tqdm
from collections import Counter 
from copy import deepcopy
from torch.utils.data import TensorDataset

sys.path.insert(0, "../")

## Load PlaNet knowledge graph

In [5]:
import knowledge_graph
from knowledge_graph.kg import *

In [6]:
def load_kg_vocab():
    kgid2x = {}
    x2kgid = {}
    for line in open('../data/graph/entities.dict'):
        x, kgid = line.split()
        kgid2x[kgid] = int(x)
        x2kgid[int(x)] = kgid
    relname2etype = {}
    etype2relname = {}
    for line in open('../data/graph/relations.dict'):
        etype, name = line.split()
        name = name[name.find('rel-name-'):][len('rel-name-'):]
        relname2etype[name] = int(etype)
        etype2relname[int(etype)] = name 
    return kgid2x, x2kgid, relname2etype, etype2relname

kgid2x, x2kgid, relname2etype, etype2relname = load_kg_vocab()

## Run PlaNet models for new clinical trial

In [7]:
from utils.demo_utils import load_model, load_model_and_data, model_inference, prepare_runner
from gcn_models.utils import set_seed
from gcn_models.evaluator import Evaluator
set_seed(24)



In [9]:
#Load BERT embedder
from utils.text_bert_features import TextBertFeatures
bert_model = TextBertFeatures(
    bert_model='microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext',
    device='cuda:1'
)

### Load the new clinical trial data

In [10]:
new_trial_data = pkl.load(open('small_data/trial_data_NCT02370680.pkl', 'rb'))

In [11]:
new_trial_1, new_trial_2 = new_trial_data
new_trial_1, new_trial_2

({'nct_id': 'NCT02370680',
  'arm_label': 'durlaza™, 1 capsule',
  'arm_idx': 0,
  'trial_arm_edges': [{'kg_id': 'KG00000863',
    'relation': 'study-disease',
    'key': Relation(name='study-disease', source=<Source.CLINICAL_TRIAL: (9,)>),
    'data': {'relation': 'study-disease',
     'source': <Source.CLINICAL_TRIAL: (9,)>,
     'attrs': {}}},
   {'kg_id': 'KG00122148',
    'relation': 'primary_outcome',
    'key': Relation(name='primary_outcome', source=<Source.CLINICAL_TRIAL: (9,)>),
    'data': {'relation': 'primary_outcome',
     'source': <Source.CLINICAL_TRIAL: (9,)>,
     'attrs': {}}},
   {'kg_id': 'KG00020807',
    'relation': 'arm_tests_drug',
    'key': Relation(name='arm_tests_drug', source=<Source.CLINICAL_TRIAL: (9,)>),
    'data': {'relation': 'arm_tests_drug',
     'source': <Source.CLINICAL_TRIAL: (9,)>,
     'attrs': {}}},
   {'kg_id': 'KG00096684',
    'relation': 'eligibility-exclusion',
    'key': Relation(name='eligibility-exclusion', source=<Source.CLINICAL_TR

#### Get the feature of the new trials

In [13]:
def get_trial_feature(new_trial):
    _new_emb = bert_model._embed(new_trial['arm_text'])
    return np.concatenate([_new_emb, new_trial['trial_attribute_feats_vec']])

new_emb_1 = get_trial_feature(new_trial_1)
new_emb_2 = get_trial_feature(new_trial_2)
print(new_emb_1.shape, new_emb_2.shape)

(786,) (786,)


#### Get the KG info of the new trials

In [14]:
def get_new_edges(new_trial, the_x):
    new_edges = []; new_etypes = []; seen = set()
    for edge in new_trial_1['trial_arm_edges']:
        h = the_x
        t = kgid2x[edge['kg_id']]
        r = relname2etype[edge['relation']]
        new_edges.append([h,t])
        new_etypes.append(r)
        new_edges.append([t,h])
        new_etypes.append(r+26)
        seen.add(r)
    for r in [21,22,23,24,25]:
        if r not in seen:
            new_edges.append([the_x,0])
            new_etypes.append(r)
            new_edges.append([0,the_x])
            new_etypes.append(r+26)
    new_edges = torch.tensor(new_edges).t()
    new_etypes = torch.tensor(new_etypes)
    print(new_edges.shape)
    print(new_etypes.shape)
    return new_edges, new_etypes

### Predict for the AE task

#### Load the AE data and model

In [15]:
aeidx2kgid = pkl.load(open('small_data/ae1017_idx2aename.pkl', 'rb'))
aekgid2name = pkl.load(open('small_data/ae_kgid2name.pkl', 'rb'))
aeidx2name = [aekgid2name[kgid] for kgid in aeidx2kgid]

def predict_positive_ae(pred, threshold):
    #pred: a list of length 1017
    out = []
    for idx, val in enumerate(pred):
        if val > threshold:
            out.append(aeidx2name[idx])
    return out

In [16]:
(ae_dataset, ae_tasks), \
    ae_encoder, ae_bert_encoder, \
    ae_model, ae_args, ae_runner \
    = load_model_and_data('../data/models/ae_model_shxo9bgw/ckpt.pt')


args.data_path ../data/graph
Adding infer nodes and edges
Reading train triples....
False
Finished. Read 3388945 train triples.
Reading valid triples....
False
Finished. Read 188264 valid triples.
Reading test triples....
False
Finished. Read 188297 test triples.
(205809, 786)
TRIAL_ARM          205809
PROTEIN             15787
DRUG                14299
UMLS                 7651
DISEASE              5751
PRIMARY_OUTCOME      3048
Name: etype, dtype: int64

Final:
TRIAL_ARM          86899
PROTEIN            15787
DRUG               12133
UMLS                7645
DISEASE             5751
PRIMARY_OUTCOME     2960
Name: etype, dtype: int64
running _load_graph..
running _load_trial_features..
         trial        kgid
0  NCT00000134  KG00251195
1  NCT00000134  KG00251197
2  NCT00000134  KG00251196
3  NCT00000136  KG00249524
4  NCT00000142  KG00250283
Total AE idx: 1
*****[NOTE] Using OR threshold 2 *****
Number of dropped examples:  3211
Task: ae_clf_or, Subtasks: 1017
task_y.shape torch.S

0it [00:00, ?it/s]

batch_size: 128
gradient_accumulation_steps (for training): 1
effective batch_size (for training): 128
layer_sizes [-1, -1, -1]


#### Add the new trial to the inference dataset

In [17]:
ae_dataset_bk = deepcopy(ae_dataset)
ae_tasks_bk = deepcopy(ae_tasks)
ae_args_bk = deepcopy(ae_args)

In [20]:
def add_new_trial_to_ae_dataset(dataset):
    # Update the df
    dataset.df = dataset.df[dataset.df['split']=='test'].head(1)
    the_x = dataset.df.iloc[0]['x']
    the_kgid = dataset.df.iloc[0]['kgid']
    print('the_x', the_x, 'the_kgid', the_kgid)
    
    # Update the node features
    _node_feats = deepcopy(dataset.node_feats)
    pos = _node_feats[_node_feats['node_id'] == the_kgid].index[0]
    _node_feats.at[pos, 'emb'] = new_emb_1
    dataset.node_feats = _node_feats
    
    # Update the graph edges
    _graph = deepcopy(dataset.graph)
    new_edges, new_etypes = get_new_edges(new_trial_1, the_x)
    
    _edge_index = _graph.data.edge_index.clone()
    columns_with_the_x = _edge_index.eq(the_x).any(dim=0)
    _edge_index = _edge_index[:, ~columns_with_the_x]
    _edge_index = torch.cat([_edge_index, new_edges], dim=1)
    _graph.data.edge_index = _edge_index
    
    _edge_type = _graph.data.edge_type.clone()
    _edge_type = _edge_type[~columns_with_the_x]
    _edge_type = torch.cat([_edge_type, new_etypes], dim=0)
    _graph.data.edge_type = _edge_type
    
    dataset.graph = _graph
    
    # Make the dataset ready for inference
    x = dataset._get_data_x(dataset.df)
    print('x', x)
    tensors = [x]
    tensors.extend([dataset.task_ys[0][0].repeat(len(x), 1)])
    tensors.extend([dataset.sample_weight_masks[0][0].repeat(len(x), 1)])
    dataset.datasets['test'] = TensorDataset(*tensors)
    
    return dataset

#### Run inference for the new trial

In [21]:
ae_dataset = add_new_trial_to_ae_dataset(ae_dataset)
ae_args, ae_runner, ae_encoder \
    = prepare_runner(ae_args, ae_dataset, ae_encoder, ae_bert_encoder, ae_model)

_, y_test_pred, _ = model_inference(ae_runner, mode='test') 
print ('y_test_pred.size()', y_test_pred.size())

the_x 98231 the_kgid KG00136002
torch.Size([2, 60])
torch.Size([60])
x tensor([98231])
self.fixed_encoder False
self.args.bert_unfreeze_epoch 1
to_optimze [{'params': <generator object Module.parameters at 0x7f316d671ac0>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f316d671b30>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f316d671cf0>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f316d671d60>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f316d671dd0>, 'lr': 0}]




0it [00:00, ?it/s][A[A

batch_size: 128
gradient_accumulation_steps (for training): 1
effective batch_size (for training): 128
layer_sizes [-1, -1, -1]


0it [08:29, ?it/s]


#### Predicted AEs:

In [23]:
print(predict_positive_ae(y_test_pred[0], threshold=0.05))

['diarrhoea']


### Predict for the Safety task

#### Load the safety data and model

In [18]:
(sf_dataset, sf_tasks), \
    sf_encoder, sf_bert_encoder, \
    sf_model, sf_args, sf_runner \
    = load_model_and_data('../data/models/safety_model_1xekl810/ckpt.pt')

args.data_path ../data/graph
Adding infer nodes and edges
Reading train triples....
False
Finished. Read 3388945 train triples.
Reading valid triples....
False
Finished. Read 188264 valid triples.
Reading test triples....
False
Finished. Read 188297 test triples.
(205809, 786)
TRIAL_ARM          205809
PROTEIN             15787
DRUG                14299
UMLS                 7651
DISEASE              5751
PRIMARY_OUTCOME      3048
Name: etype, dtype: int64

Final:
TRIAL_ARM          86899
PROTEIN            15787
DRUG               12133
UMLS                7645
DISEASE             5751
PRIMARY_OUTCOME     2960
Name: etype, dtype: int64
running _load_graph..
running _load_trial_features..
         trial        kgid
0  NCT00000134  KG00251195
1  NCT00000134  KG00251197
2  NCT00000134  KG00251196
3  NCT00000136  KG00249524
4  NCT00000142  KG00250283
Total AE idx: 1
Labels:  (array([False,  True]), array([20448,  6166]))
Task: binary_or, Subtasks: 1
task_y.shape torch.Size([26614, 1])
enro


0it [00:00, ?it/s][A

batch_size: 128
gradient_accumulation_steps (for training): 1
effective batch_size (for training): 128
layer_sizes [-1, -1, -1]


In [24]:
sf_dataset_bk = deepcopy(sf_dataset)
sf_tasks_bk = deepcopy(sf_tasks)
sf_args_bk = deepcopy(sf_args)

In [34]:
def add_new_trial_to_safety_dataset(dataset):
    # Update the df
    dataset.df = dataset.df[dataset.df['split']=='test'].head(1)
    the_x = dataset.df.iloc[0]['x']
    the_kgid = dataset.df.iloc[0]['kgid']
    print('the_x', the_x, 'the_kgid', the_kgid)
    
    # Update the node features
    _node_feats = deepcopy(dataset.node_feats)
    pos = _node_feats[_node_feats['node_id'] == the_kgid].index[0]
    _node_feats.at[pos, 'emb'] = new_emb_1
    dataset.node_feats = _node_feats
    
    # Update the graph edges
    _graph = deepcopy(dataset.graph)
    new_edges, new_etypes = get_new_edges(new_trial_1, the_x)
    
    _edge_index = _graph.data.edge_index.clone()
    columns_with_the_x = _edge_index.eq(the_x).any(dim=0)
    _edge_index = _edge_index[:, ~columns_with_the_x]
    _edge_index = torch.cat([_edge_index, new_edges], dim=1)
    _graph.data.edge_index = _edge_index
    
    _edge_type = _graph.data.edge_type.clone()
    _edge_type = _edge_type[~columns_with_the_x]
    _edge_type = torch.cat([_edge_type, new_etypes], dim=0)
    _graph.data.edge_type = _edge_type
    
    dataset.graph = _graph
    
    # Make the dataset ready for inference
    x = dataset._get_data_x(dataset.df)
    print('x', x)
    tensors = [x]
    tensors.extend([dataset.task_ys[0][0].repeat(len(x), 1)])
    tensors.extend([dataset.sample_weight_masks[0][0].repeat(len(x), 1)])
    dataset.datasets['test'] = TensorDataset(*tensors)
    
    return dataset

#### Run inference for the new trial

In [27]:
sf_dataset = add_new_trial_to_safety_dataset(sf_dataset)
sf_args, sf_runner, sf_encoder \
    = prepare_runner(sf_args, sf_dataset, sf_encoder, sf_bert_encoder, sf_model)

_, y_test_pred, _ = model_inference(sf_runner, mode='test') 
print ('y_test_pred.size()', y_test_pred.size())

the_x 98231 the_kgid KG00136002
torch.Size([2, 60])
torch.Size([60])
x tensor([98231])
self.fixed_encoder False
self.args.bert_unfreeze_epoch 1
to_optimze [{'params': <generator object Module.parameters at 0x7f314d186350>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f314d186580>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f314d186e40>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f314d186d60>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f314d186eb0>, 'lr': 0}]


0it [00:00, ?it/s]

batch_size: 128
gradient_accumulation_steps (for training): 1
effective batch_size (for training): 128
layer_sizes [-1, -1, -1]


0it [13:08, ?it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.04it/s]

y_test_pred.size() torch.Size([1, 1])


#### Predicted probablity of safety concern:

In [31]:
print(y_test_pred[0].item())

0.019659381359815598


### Predict for the Efficacy task

In [19]:
(ef_dataset, ef_tasks), ef_encoder, ef_bert_encoder, ef_model, ef_args \
    = load_model('../data/models/efficacy_model_34l5ms9m/ckpt.pt')

args.data_path ../data/graph
Adding infer nodes and edges
Reading train triples....
False
Finished. Read 3388945 train triples.
Reading valid triples....
False
Finished. Read 188264 valid triples.
Reading test triples....
False
Finished. Read 188297 test triples.
(205809, 786)
TRIAL_ARM          205809
PROTEIN             15787
DRUG                14299
UMLS                 7651
DISEASE              5751
PRIMARY_OUTCOME      3048
Name: etype, dtype: int64

Final:
TRIAL_ARM          86899
PROTEIN            15787
DRUG               12133
UMLS                7645
DISEASE             5751
PRIMARY_OUTCOME     2960
Name: etype, dtype: int64
running _load_graph..
running _load_trial_features..
         trial        kgid
0  NCT00000134  KG00251195
1  NCT00000134  KG00251197
2  NCT00000134  KG00251196
3  NCT00000136  KG00249524
4  NCT00000142  KG00250283
Total AE idx: 1
building efficacy data...
Task: binary_pair_efficacy, Subtasks: 1
task_y.shape torch.Size([1486, 1])
efficacy split: setting 

In [32]:
ef_dataset_bk = deepcopy(ef_dataset)
ef_tasks_bk = deepcopy(ef_tasks)
ef_args_bk = deepcopy(ef_args)

#### Add new trial to inference dataset

In [35]:
def add_new_trial_to_efficacy_dataset(dataset):
    # Update the df
    _df = dataset.efficacy_df[dataset.efficacy_df['split']=='test'].head(1)
    dataset.efficacy_df = _df
    the_x1, the_x2 = _df.iloc[0]['x1'], _df.iloc[0]['x2']
    the_kgid1, the_kgid2 = _df.iloc[0]['kgid1'], _df.iloc[0]['kgid2']
    print('the_x1', the_x1, 'the_kgid1', the_kgid1, 'the_x2', the_x2, 'the_kgid2', the_kgid2)
    
    # Update the node features
    _node_feats = deepcopy(dataset.node_feats)
    pos = _node_feats[_node_feats['node_id'] == the_kgid1].index[0]
    _node_feats.at[pos, 'emb'] = new_emb_1
    pos = _node_feats[_node_feats['node_id'] == the_kgid2].index[0]
    _node_feats.at[pos, 'emb'] = new_emb_2
    dataset.node_feats = _node_feats
    
    # Update the graph edges
    _graph = deepcopy(dataset.graph)
    new_edges_1, new_etypes_1 = get_new_edges(new_trial_1, the_x1)
    new_edges_2, new_etypes_2 = get_new_edges(new_trial_2, the_x2)
    
    _edge_index = _graph.data.edge_index.clone()
    columns_with_the_x1 = _edge_index.eq(the_x1).any(dim=0)
    _edge_index = _edge_index[:, ~columns_with_the_x1]
    columns_with_the_x2 = _edge_index.eq(the_x2).any(dim=0)
    _edge_index = _edge_index[:, ~columns_with_the_x2]
    _edge_index = torch.cat([_edge_index, new_edges_1, new_edges_2], dim=1)
    _graph.data.edge_index = _edge_index
    
    _edge_type = _graph.data.edge_type.clone()
    _edge_type = _edge_type[~columns_with_the_x1]
    _edge_type = _edge_type[~columns_with_the_x2]
    _edge_type = torch.cat([_edge_type, new_etypes_1, new_etypes_2], dim=0)
    _graph.data.edge_type = _edge_type
    
    dataset.graph = _graph
    
    # Make the dataset ready for inference
    x = dataset._get_data_x(_df)
    print('x', x)
    tensors = [x]
    tensors.extend([dataset.task_ys[0][0].repeat(len(x), 1)])
    tensors.extend([dataset.sample_weight_masks[0][0].repeat(len(x), 1)])
    dataset.datasets['test'] = TensorDataset(*tensors)
    
    return dataset

In [36]:
ef_dataset = add_new_trial_to_efficacy_dataset(ef_dataset)
ef_args, ef_runner, ef_encoder \
    = prepare_runner(ef_args, ef_dataset, ef_encoder, ef_bert_encoder, ef_model)

_, y_test_pred, _ = model_inference(ef_runner, mode='test') 
print ('y_test_pred.size()', y_test_pred.size())

the_x1 119661 the_kgid1 KG00180752 the_x2 119660 the_kgid2 KG00180751
torch.Size([2, 60])
torch.Size([60])
torch.Size([2, 60])
torch.Size([60])
x tensor([[119661, 119660]])
self.fixed_encoder False
self.args.bert_unfreeze_epoch 1
to_optimze [{'params': <generator object Module.parameters at 0x7f31844aa580>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f31844aa0b0>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f31844aa430>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f31844aa190>, 'lr': 0}, {'params': <generator object Module.parameters at 0x7f31844aa200>, 'lr': 0}]



0it [00:00, ?it/s][A

batch_size: 128
gradient_accumulation_steps (for training): 1
effective batch_size (for training): 128
layer_sizes [-1, -1, -1]



  0%|                                                     | 0/1 [00:00<?, ?it/s][A
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.74it/s][A

y_test_pred.size() torch.Size([1, 1]) y_test_true.size() torch.Size([1, 1])


#### Predicted probability that trial arm 1 has better efficacy than trial arm 2:

In [37]:
print(y_test_pred[0].item())

0.47162944078445435
