# **Initialization**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd '/content/drive/MyDrive/DSML_LAB_PROJECT/Project_Files'

/content/drive/MyDrive/DSML_LAB_PROJECT/Project_Files


In [3]:
!pip uninstall -y torch torchvision 
!pip install torch==1.1.0   torchvision==0.3.0
!pip install torch-cluster==1.4.2
!pip install torch-geometric==1.3.0
!pip install torch-scatter==1.3.1
!pip install torch-sparse==0.4.0 
!pip install scikit-learn==0.20.3
!pip install pytorch-memlab==0.0.3
%load_ext google.colab.data_table

Uninstalling torch-1.7.0+cu101:
  Successfully uninstalled torch-1.7.0+cu101
Uninstalling torchvision-0.8.1+cu101:
  Successfully uninstalled torchvision-0.8.1+cu101
Collecting torch==1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/69/60/f685fb2cfb3088736bafbc9bdbb455327bdc8906b606da9c9a81bae1c81e/torch-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (676.9MB)
[K     |████████████████████████████████| 676.9MB 22kB/s 
[?25hCollecting torchvision==0.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/2e/45/0f2f3062c92d9cf1d5d7eabd3cae88cea9affbd2b17fb1c043627838cb0a/torchvision-0.3.0-cp36-cp36m-manylinux1_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 6.9MB/s 
Installing collected packages: torch, torchvision
Successfully installed torch-1.1.0 torchvision-0.3.0
Collecting torch-cluster==1.4.2
[?25l  Downloading https://files.pythonhosted.org/packages/33/38/60ad2fcb735123429b3e0b165a19c80c6273d679b01d6550782abcb314e2/torch_cluster-1.4.2.tar.gz 

In [4]:
from data.utils import load_data_torch, process_prot_edge
from torch_geometric.data import Data
from src.utils import process_edges
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import numpy as np
import csv
import torch
import os

torch.manual_seed(1111)
np.random.seed(1111)

!!! Note: make sure the scipy and torch packages have also been installed properly to run this notebook.

# **Load index, data and results**

### Id-index map loading

In [5]:
# drug id - index
with open('data/index_map/drug-map.pkl', 'rb') as f:
    drug_map = pickle.load(f)
inv_drug_map = {v: k for k, v in drug_map.items()}

# combo id - index
with open('data/index_map/combo_map.pkl', 'rb') as f:
    combo_map = pickle.load(f)
inv_combo_map = {v: k for k, v in combo_map.items()}

### Generate / use polypharmacy side effect id-name map

In [6]:
######################################################
# generate polypharmacy side effect id - name map
# combo_name_map = {}
# with open('../data/index_map/bio-decagon-combo.csv', 'r') as f:
#     reader = csv.reader(f)
#     next(reader)
#     for _, _, id, name in reader:
#         id = int(id.split('C')[-1])
#         combo_name_map[id] = name

# # save map
# with open('../data/index_map/combo-name-map.pkl', 'wb') as f:
#     pickle.dump(combo_name_map, f)

# use map
with open('data/index_map/combo-name-map.pkl', 'rb') as f:
    combo_name_map = pickle.load(f)
inv_combo_name_map = {v: k for k, v in combo_name_map.items()}

### Data loading with selected d-d edge labels

In [7]:
# selected-drug idx - drug idx
with open('data/decagon_et.pkl', 'rb') as f:   # the whole dataset
    et_list = pickle.load(f)
inv_et_list = {et_list[i]: i for i in range(len(et_list))}

# load training data
feed_dict = load_data_torch("data/", et_list, mono=True)
data = Data.from_dict(feed_dict)

loading data
remove  0  isolated drugs:  []
remove finished
1097  polypharmacy side effects
data has been loaded


In [8]:
import itertools
from src.utils import * 

number_of_drugs = 645
number_of_side_effects = 137
number_of_drug_combos = number_of_drugs*(number_of_drugs-1)/2

iteration = 8
initial = (iteration - 1) * number_of_side_effects
final = (iteration) * number_of_side_effects

if iteration == 8:
  final = final + 1
  number_of_side_effects = number_of_side_effects + 1

data.test_et = torch.zeros([int(number_of_side_effects * number_of_drug_combos)], dtype=torch.int64)
data.test_idx = torch.zeros([2, int(number_of_side_effects * number_of_drug_combos)], dtype=torch.int64)

data.test_idx[0], data.test_idx[1], data.test_et = compute_side_effect_drug_pair_combinations(number_of_drugs, number_of_side_effects, number_of_drug_combos, initial, final)

print(data.test_et.shape)

torch.Size([28661220])


# **Performance Comparison**

In [9]:
# model loading
models = {  'DistMult' : 'df_distmult',
            'R-GCN': 'ddm-df_rgcn', 
            'dTIP-D': 'ddm-nn', 
            'dTIP-p': 'ppm-ggm-nn', 
            'TIP-cat': 'tip-cat',
            'TIP-sum': 'tip-add'    }

# loading function for recorded test scores druging training
def get_test_out(model_name):
    with open('output/'+ models[model_name] +'/test_out.pkl', 'rb') as f:
        record = pickle.load(f)
    return record

In [10]:
# get averaged auprc scores for each epoches
scores = dict()
prc_final = dict()
for model in models.keys():
    out = get_test_out(model)
    scores[model] = out
    prc_final[model] = out[99][0]

In [11]:
# print final AUPRC scores  for each models
lines = '---------------------------------------------'
print(lines)
print('|{:10s}|'.format('AUPRC scores for all the model variants'))
print(lines)
formats = '|{:14s}|{:28.3f}|'
for model, scores in prc_final.items():
    print(formats.format(model, scores))
print(lines)

---------------------------------------------
|AUPRC scores for all the model variants|
---------------------------------------------
|DistMult      |                       0.844|
|R-GCN         |                       0.888|
|dTIP-D        |                       0.885|
|dTIP-p        |                       0.778|
|TIP-cat       |                       0.889|
|TIP-sum       |                       0.889|
---------------------------------------------


# **Model Characteristics and Relablility** 

### The side effects evaluated in Zitnik et al. (2018)
The side effects with the top 10 best performance and 10 best performance, according to averaged auprc scores

In [12]:
# ######################################################
# side effect name - original index reported in decagon
decagon_best_name = ["Mumps", "Carbuncle", "Coccydynia", "Tympanic membrane perfor", "Dyshidrosis", "Spondylosis", "Schizoaffective disorder", "Breast dysplasia", "Ganglion", "Uterine polyp"]
decagon_worst_name = ["Bleeding", "Body temperature increased",  "Emesis", "Renal disorder", "Leucopenia", "Diarrhea", "Icterus", "Nausea", "Itch", "Anaemia"]
decagon_best_org_id = [26780, 7078, 9193, 206504, 32633, 38019, 36337, 16034, 1258666, 156369]
decagon_worst_org_id = [19080, 15967, 42963, 22658, 23530, 11991, 22346, 27497, 33774, 2871]

# get index
decagon_best_idx = [inv_et_list[combo_map[i]] for i in decagon_best_org_id]
decagon_worst_idx = [inv_et_list[combo_map[i]] for i in decagon_worst_org_id]

### The side effects evaluated in our work
The side effects with the top 20 best and 20 worst performance, according to averaged auprc scores.

In [14]:
# ######################################################
# Evaluation
name = 'TIP-sum (PPM-GGM-DDM-DF-Sum)'
lines = '-------------------------------------------------------------------------------------------------------'

with open('output/tip-add/test_record.pkl', 'rb') as f:
    dist_record = pickle.load(f)
auprc = np.array(dist_record[len(dist_record)-1])[0, :]
sorted_idx = np.argsort(auprc, kind='quicksort')

print(lines)
print(' {:37s}   {:6s}| {:45s}  {:6s}'.format('The Highest AUPRC Score', '  Edge', 'The Lowest AUPRC Score', '   Edge'))
print(lines)

for i in range(20):
    print(' {:30s} {:7.4f}  {:6d}| {:38s} {:7.4f}  {:6d}'.format(
        combo_name_map[inv_combo_map[et_list[sorted_idx[-(i+1)]]]], auprc[sorted_idx[-(i+1)]], feed_dict['dd_adj_list'][-(i+1)].nnz,
        combo_name_map[inv_combo_map[et_list[sorted_idx[i]]]], auprc[sorted_idx[i]], feed_dict['dd_adj_list'][i].nnz))
print(lines)

decag_best_in_us = [962 - np.where(sorted_idx == i)[0] for i in decagon_best_idx]
decag_worst_in_us = [np.where(sorted_idx == i)[0] for i in decagon_worst_idx]

-------------------------------------------------------------------------------------------------------
 The Highest AUPRC Score                   Edge| The Lowest AUPRC Score                            Edge
-------------------------------------------------------------------------------------------------------
 Corneal dystrophy               0.9843     293| enterocolitis                           0.7578     387
 heat rash                       0.9812     361| neonatal respiratory distress syndrome  0.7655   12062
 prostatism                      0.9805     339| dyspareunia                             0.7747     356
 periostitis                     0.9795     303| Feeling unwell                          0.7806   19930
 cystic acne                     0.9780     259| hypertrichosis                          0.7810   12309
 Legionella                      0.9766     355| thrombocytopenia                        0.7872   14192
 arterial insufficiency          0.9753     327| acute kidney fa

# **Compiling predicitions across the models - Tabulating the results**

## Loading predictions from pickle, and creating a Dataframe from it 

In [None]:
import numpy as np

head = ['df_distmult', 'ddm-df_rgcn','ddm-nn','ppm-ggm-nn','tip-cat', 'tip-add']

start = initial * int(number_of_drug_combos)
end = final * int(number_of_drug_combos)

In [None]:
with open('output/evaluation/df_distmult.pkl', 'rb') as f:
  score1 = np.array(pickle.load(f)[start:end])

In [None]:
with open('output/evaluation/ddm-df_rgcn.pkl', 'rb') as f:
  score2 = np.array(pickle.load(f)[start:end])

In [None]:
with open('output/evaluation/ddm-nn.pkl', 'rb') as f:
  score3 = np.array(pickle.load(f)[start:end])

In [None]:
with open('output/evaluation/ppm-ggm-nn.pkl', 'rb') as f:
  score4 = np.array(pickle.load(f)[start:end])

In [None]:
with open('output/evaluation/tip-cat.pkl', 'rb') as f:
  score5 = np.array(pickle.load(f)[start:end])

In [None]:
with open('output/evaluation/tip-add.pkl', 'rb') as f:
  score6 = np.array(pickle.load(f)[start:end])

In [None]:
# side effect info - index and name
side_effect_idx = [inv_combo_map[i] for i in data.test_et.tolist()]
side_effect_name = [combo_name_map[i] for i in side_effect_idx]

In [None]:
# drug info - PubChem index
drug1_cid = [inv_drug_map[i] for i in data.test_idx[0].tolist()]
drug2_cid = [inv_drug_map[i] for i in data.test_idx[1].tolist()]

In [None]:
# sub-table construction
df = pd.DataFrame({'Side_Effect_Index': data.test_et, 
                  'Side_Effect_Unique_ID': side_effect_idx,
                  'Side_Effect_Name': side_effect_name,
                  'Drug1_Index': data.test_idx[0],
                  'Drug1_Unique_ID': drug1_cid,
                  'Drug2_Index': data.test_idx[1], 
                  'Drug2_Unique_ID': drug2_cid,
                  'Prob_DF': score1,
                  'Prob_DDM_DF': score2,
                  'Prob_DDM_NN': score3,
                  'Prob_PPM_GGM_NN': score4,
                  'Prob_PPM_GGM_DDM_DF_Cat': score5,
                  'Prob_PPM_GGM_DDM_DF_Sum': score6
                })
df

Unnamed: 0,Side_Effect_Index,Side_Effect_Unique_ID,Side_Effect_Name,Drug1_Index,Drug1_Unique_ID,Drug2_Index,Drug2_Unique_ID,Prob_DF,Prob_DDM_DF,Prob_DDM_NN,Prob_PPM_GGM_NN,Prob_PPM_GGM_DDM_DF_Cat,Prob_PPM_GGM_DDM_DF_Sum
0,959,8924,cleft lip,0,2173,1,3345,0.500000,4.654672e-01,0.523101,5.000000e-01,4.735912e-01,5.403015e-01
1,959,8924,cleft lip,0,2173,2,5206,0.500000,3.349866e-02,0.075150,5.000000e-01,3.993516e-02,5.440357e-02
2,959,8924,cleft lip,0,2173,3,9433,0.043054,3.334496e-03,0.060729,5.000000e-01,3.314328e-03,6.755470e-03
3,959,8924,cleft lip,0,2173,4,3929,0.506104,5.733842e-02,0.189698,5.000000e-01,6.859240e-02,9.965137e-02
4,959,8924,cleft lip,0,2173,5,150610,0.500000,2.421660e-03,0.035472,5.000000e-01,8.205515e-04,2.305365e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28661215,1096,9952,febrile convulsion,641,9571074,643,2182,0.106131,7.552808e-11,0.000027,1.586276e-03,2.192506e-08,6.095405e-11
28661216,1096,9952,febrile convulsion,641,9571074,644,5482,0.053796,7.156607e-24,0.000027,5.000000e-01,1.866621e-08,5.165802e-13
28661217,1096,9952,febrile convulsion,642,4011,643,2182,0.069542,6.588929e-12,0.000006,3.954704e-09,1.060029e-08,1.278976e-10
28661218,1096,9952,febrile convulsion,642,4011,644,5482,0.286108,2.617764e-26,0.000006,2.489113e-06,2.096570e-08,5.809854e-12


## Writing results to CSV files in 2 chunks based on the iteration 

In [None]:
if iteration == 1: # use this for the first iteration - save directly to csv file
  df.to_csv('analysis/evaluation_table_1.csv', index=False)

else if iteration <= 6: #use this otherwise - append to existing csv 
  df.to_csv('analysis/evaluation_table_1.csv', mode='a', header=False, index=False)

else if iteration==7:
  df.to_csv('analysis/evaluation_table_2.csv', index=False)

else:
  df.to_csv('analysis/evaluation_table_2.csv', mode='a', header=False, index=False)

## Combining the CSV chunk files 

In [None]:
import pandas as pd

In [None]:
df2 = pd.read_csv('analysis/evaluation_table_2.csv')

In [None]:
df2.describe()

Unnamed: 0,Side_Effect_Index,Side_Effect_Unique_ID,Drug1_Index,Drug1_Unique_ID,Drug2_Index,Drug2_Unique_ID,Prob_DF,Prob_DDM_DF,Prob_DDM_NN,Prob_PPM_GGM_NN,Prob_PPM_GGM_DDM_DF_Cat,Prob_PPM_GGM_DDM_DF_Sum
count,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0,57114750.0
mean,959.0,117608.3,214.3333,332075.2,429.6667,330682.3,0.3086848,0.2006216,0.2368004,0.424271,0.1963766,0.2003365
std,79.38514,226777.8,151.9097,1236875.0,151.9097,1271347.0,0.2620838,0.2729678,0.2783737,0.2439187,0.2717136,0.2772972
min,822.0,768.0,0.0,85.0,1.0,85.0,2.363076e-12,0.0,8.496277e-10,0.0,2.228922e-36,2.215905e-32
25%,890.0,19284.0,86.0,3222.0,322.0,3008.0,0.0582528,0.003004166,0.01824277,0.2475053,0.003303863,0.002585272
50%,959.0,32580.0,188.0,4506.0,456.0,4440.0,0.2601454,0.05108992,0.1033927,0.5,0.04716669,0.04548503
75%,1028.0,151463.0,322.0,9433.0,558.0,27991.0,0.5,0.3226005,0.391423,0.5,0.3090489,0.3213737
max,1096.0,1527411.0,643.0,9571074.0,644.0,9571074.0,0.9995364,0.9874603,0.9978272,0.999529,0.989592,0.995514


In [None]:
df2.to_csv('analysis/evaluation_table_1.csv', mode='a', index=False, header=False)