In [1]:
import csv
import numpy as np
import sys
import pandas as pd
import itertools
import math
import time
import os

from sklearn import svm, linear_model, neighbors
from sklearn import tree, ensemble
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

import networkx as nx
import random
import numbers

from sklearn.model_selection import StratifiedKFold

from src import disjoint_cv

  from numpy.core.umath_tests import inner1d


# KEGG DDIs

In [2]:
# Load the KEGG drug-drug interaction pairs (tab-separated file) and preview the first rows.
ddi_df = pd.read_csv("data/input/ddi_kegg.txt", delimiter='\t')
ddi_df.head(5)

Unnamed: 0,Drug1,Drug2
0,DB00114,DB01235
1,DB00114,DB00413
2,DB00114,DB00810
3,DB00114,DB00494
4,DB00114,DB00323


In [3]:
# Read the tab-separated embedding table, cut every `Entity` value down to the
# seven characters that precede its final character, and expose that column as `Drug`.
# NOTE(review): presumably this strips a URI wrapper and leaves the bare DrugBank ID
# (e.g. "DB00114") — confirm against the vector file.
featureFilename = "vectors/DB/RDF2Vec_sg_200_5_5_15_2_500_d5_uniform.txt"
embedding_df = (
    pd.read_csv(featureFilename, sep='\t')
    .assign(Entity=lambda frame: frame.Entity.str[-8:-1])
    .rename(columns={'Entity': 'Drug'})
)

In [4]:
# Derive the drug list and the positive pairs from the KEGG DDI table and the embeddings.
# The helper prints a drug count and a positive-pair count (see the cell output).
# NOTE(review): presumably only drugs present in both inputs are kept — confirm in src/disjoint_cv.
commonDrugs, all_positives = disjoint_cv.getPositivePairs(ddi_df, embedding_df)

Drugs 816
Postive size: 19424


In [5]:
# findspark.init() has to run before `pyspark` is imported: it locates the local
# Spark installation and puts its Python bindings on sys.path.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

In [6]:
# Configure and start the local Spark context used by the cross-validation runs.
#
# SparkContext.getOrCreate keeps this cell safe to re-run: calling
# SparkContext(conf=...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". An already running context is
# returned unchanged, so to apply modified settings call `sc.stop()` first and
# then re-run this cell.
#
# NOTE(review): with a local[...] master the tasks run inside the driver JVM, so
# `spark.executor.memory` is presumably not what bounds memory here — confirm
# the intended sizing.
config = (
    SparkConf()
    .setMaster("local[10]")  # local mode with 10 worker threads
    .set("spark.executor.memory", "70g")
    .set("spark.driver.memory", "90g")
    .set("spark.memory.offHeap.enabled", True)
    .set("spark.memory.offHeap.size", "50g")
)
sc = SparkContext.getOrCreate(conf=config)
print(sc)

<SparkContext master=local[10] appName=pyspark-shell>


In [7]:
# Classifiers to evaluate, collected as (display name, estimator) pairs.
# NOTE(review): this section trains 200 trees while the other dataset sections
# use 100 — confirm the difference is intentional.
nb_model, lr_model = GaussianNB(), linear_model.LogisticRegression()
rf_model = ensemble.RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    n_jobs=-1,
)

clfs = list(zip(
    ('Naive Bayes', 'Logistic Regression', 'Random Forest'),
    (nb_model, lr_model, rf_model),
))

In [18]:
# Cross-validation settings; they are handed positionally to disjoint_cv.run_cv10.
n_fold, n_run, n_proportion, n_seed = 10, 10, 1, 100

# Yields two result tables: one for the drug-wise and one for the pair-wise evaluation.
drugwise_runs, pairwise_runs = disjoint_cv.run_cv10(
    sc, clfs, embedding_df, commonDrugs, all_positives,
    n_fold, n_run, n_proportion, n_seed,
)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 9


In [21]:
# Persist the KEGG cross-validation results as CSV files.
folder = 'results/disjoint/'
# makedirs(..., exist_ok=True) also creates a missing parent `results/` directory
# (os.mkdir raises FileNotFoundError in that case) and does nothing when the
# folder already exists, so no separate isdir check is needed.
os.makedirs(folder, exist_ok=True)
drugwise_runs.to_csv(os.path.join(folder, 'KEGG_DCV_runs_sg.csv'))
pairwise_runs.to_csv(os.path.join(folder, 'KEGG_PCV_runs_sg.csv'))

In [24]:
# Two-stage average of the drug-wise results: first over the folds of each
# (method, run) pair, then over the runs of each method.
# The `fold` column is averaged along with the metrics, so its value in the
# resulting table is not a score.
(
    drugwise_runs
    .groupby(['method', 'run']).mean()
    .groupby('method').mean()
)

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.645756,0.689302,0.628234,4.5,0.660263,0.601109,0.701222
Naive Bayes,0.650408,0.705184,0.63354,4.5,0.665007,0.606703,0.709116
Random Forest,0.709666,0.778186,0.677101,4.5,0.761322,0.610617,0.792456


# TWOSIDES DDIs

In [25]:
# Load the TWOSIDES drug-drug interaction pairs (tab-separated file) and preview the first rows.
ddi_df = pd.read_csv("data/input/ddi_twosides.txt", delimiter='\t')
ddi_df.head(5)

Unnamed: 0,Drug1,Drug2
0,DB00945,DB00526
1,DB00945,DB00458
2,DB00945,DB00996
3,DB00945,DB00454
4,DB00945,DB00213


In [26]:
# Derive the drug list and the positive pairs from the TWOSIDES DDI table and the embeddings.
# The helper prints a drug count and a positive-pair count (see the cell output).
# NOTE(review): presumably only drugs present in both inputs are kept — confirm in src/disjoint_cv.
commonDrugs, all_positives = disjoint_cv.getPositivePairs(ddi_df, embedding_df)

Drugs 533
Postive size: 47301


In [27]:
# Classifiers to evaluate, collected as (display name, estimator) pairs.
nb_model, lr_model = GaussianNB(), linear_model.LogisticRegression()
rf_model = ensemble.RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    n_jobs=-1,
)

clfs = list(zip(
    ('Naive Bayes', 'Logistic Regression', 'Random Forest'),
    (nb_model, lr_model, rf_model),
))

In [None]:
# Cross-validation settings; they are handed positionally to disjoint_cv.run_cv10.
n_fold, n_run, n_proportion, n_seed = 10, 10, 1, 100

# Yields two result tables: one for the drug-wise and one for the pair-wise evaluation.
drugwise_runs, pairwise_runs = disjoint_cv.run_cv10(
    sc, clfs, embedding_df, commonDrugs, all_positives,
    n_fold, n_run, n_proportion, n_seed,
)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9


In [None]:
# Persist the TWOSIDES cross-validation results as CSV files.
folder = 'results/disjoint/'
# makedirs(..., exist_ok=True) also creates a missing parent `results/` directory
# (os.mkdir raises FileNotFoundError in that case) and does nothing when the
# folder already exists, so no separate isdir check is needed.
os.makedirs(folder, exist_ok=True)
drugwise_runs.to_csv(os.path.join(folder, 'TWOSIDES_DCV_runs_sg.csv'))
pairwise_runs.to_csv(os.path.join(folder, 'TWOSIDES_PCV_runs_sg.csv'))

In [31]:
# Two-stage average of the drug-wise results: first over the folds of each
# (method, run) pair, then over the runs of each method.
# The `fold` column is averaged along with the metrics, so its value in the
# resulting table is not a score.
(
    drugwise_runs
    .groupby(['method', 'run']).mean()
    .groupby('method').mean()
)

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.733388,0.809773,0.727515,4.5,0.743591,0.714057,0.811636
Naive Bayes,0.735015,0.817105,0.730533,4.5,0.74301,0.719495,0.810523
Random Forest,0.748564,0.81376,0.730065,4.5,0.787507,0.681465,0.827556


In [32]:
# Two-stage average of the pair-wise results: first over the folds of each
# (method, run) pair, then over the runs of each method.
# The `fold` column is averaged along with the metrics, so its value in the
# resulting table is not a score.
(
    pairwise_runs
    .groupby(['method', 'run']).mean()
    .groupby('method').mean()
)

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.672144,0.737015,0.656859,4.5,0.687525,0.635981,0.73877
Naive Bayes,0.708591,0.789882,0.699357,4.5,0.721602,0.684167,0.783451
Random Forest,0.664078,0.750148,0.589833,4.5,0.753335,0.491136,0.750145


# DrugBank v4 DDIs

In [33]:
# Load the DrugBank v4 drug-drug interaction pairs (tab-separated file) and preview the first rows.
ddi_df = pd.read_csv("data/input/ddi_v4.txt", delimiter='\t')
ddi_df.head(5)

Unnamed: 0,Drug1,Drug2
0,DB00005,DB00026
1,DB00026,DB00005
2,DB00005,DB00065
3,DB00065,DB00005
4,DB00005,DB00072


In [34]:
# Derive the drug list and the positive pairs from the DrugBank v4 DDI table and the embeddings.
# The helper prints a drug count and a positive-pair count (see the cell output).
# NOTE(review): presumably only drugs present in both inputs are kept — confirm in src/disjoint_cv.
commonDrugs, all_positives = disjoint_cv.getPositivePairs(ddi_df, embedding_df)

Drugs 1458
Postive size: 47077


In [35]:
# Classifiers to evaluate, collected as (display name, estimator) pairs.
nb_model, lr_model = GaussianNB(), linear_model.LogisticRegression()
rf_model = ensemble.RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    n_jobs=-1,
)

clfs = list(zip(
    ('Naive Bayes', 'Logistic Regression', 'Random Forest'),
    (nb_model, lr_model, rf_model),
))

In [36]:
# Cross-validation settings; they are handed positionally to disjoint_cv.run_cv10.
n_fold, n_run, n_proportion, n_seed = 10, 10, 1, 100

# Yields two result tables: one for the drug-wise and one for the pair-wise evaluation.
drugwise_runs, pairwise_runs = disjoint_cv.run_cv10(
    sc, clfs, embedding_df, commonDrugs, all_positives,
    n_fold, n_run, n_proportion, n_seed,
)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9


In [37]:
# Persist the DrugBank v4 cross-validation results as CSV files.
folder = 'results/disjoint/'
# makedirs(..., exist_ok=True) also creates a missing parent `results/` directory
# (os.mkdir raises FileNotFoundError in that case) and does nothing when the
# folder already exists, so no separate isdir check is needed.
os.makedirs(folder, exist_ok=True)
drugwise_runs.to_csv(os.path.join(folder, 'DB4_DCV_runs_sg.csv'))
pairwise_runs.to_csv(os.path.join(folder, 'DB4_PCV_runs_sg.csv'))

In [38]:
# Two-stage average of the drug-wise results: first over the folds of each
# (method, run) pair, then over the runs of each method.
# The `fold` column is averaged along with the metrics, so its value in the
# resulting table is not a score.
(
    drugwise_runs
    .groupby(['method', 'run']).mean()
    .groupby('method').mean()
)

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.660592,0.704436,0.651452,4.5,0.66917,0.635764,0.718731
Naive Bayes,0.650573,0.699039,0.650479,4.5,0.650402,0.65156,0.705249
Random Forest,0.709124,0.776082,0.688135,4.5,0.740989,0.643173,0.784752


In [39]:
# Two-stage average of the pair-wise results: first over the folds of each
# (method, run) pair, then over the runs of each method.
# The `fold` column is averaged along with the metrics, so its value in the
# resulting table is not a score.
(
    pairwise_runs
    .groupby(['method', 'run']).mean()
    .groupby('method').mean()
)

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.622942,0.659199,0.595135,4.5,0.640842,0.559842,0.672081
Naive Bayes,0.635789,0.677935,0.6295,4.5,0.639159,0.624792,0.687501
Random Forest,0.62228,0.686462,0.521107,4.5,0.708418,0.416456,0.696672


# DrugBank v5 DDIs

In [40]:
# Load the DrugBank v5 drug-drug interaction pairs (tab-separated file) and preview the first rows.
ddi_df = pd.read_csv("data/input/ddi_v5.txt", delimiter='\t')
ddi_df.head(5)

Unnamed: 0,Drug1,Drug2
0,DB00001,DB01048
1,DB00001,DB00054
2,DB00001,DB06736
3,DB00001,DB01418
4,DB00001,DB00945


In [41]:
# Derive the drug list and the positive pairs from the DrugBank v5 DDI table and the embeddings.
# The helper prints a drug count and a positive-pair count (see the cell output).
# NOTE(review): presumably only drugs present in both inputs are kept — confirm in src/disjoint_cv.
commonDrugs, all_positives = disjoint_cv.getPositivePairs(ddi_df, embedding_df)

Drugs 2124
Postive size: 253449


In [42]:
# Classifiers to evaluate, collected as (display name, estimator) pairs.
nb_model, lr_model = GaussianNB(), linear_model.LogisticRegression()
rf_model = ensemble.RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    n_jobs=-1,
)

clfs = list(zip(
    ('Naive Bayes', 'Logistic Regression', 'Random Forest'),
    (nb_model, lr_model, rf_model),
))

In [None]:
# Cross-validation settings; they are handed positionally to disjoint_cv.run_cv10.
n_fold, n_run, n_proportion, n_seed = 10, 10, 1, 100

# Yields two result tables: one for the drug-wise and one for the pair-wise evaluation.
drugwise_runs, pairwise_runs = disjoint_cv.run_cv10(
    sc, clfs, embedding_df, commonDrugs, all_positives,
    n_fold, n_run, n_proportion, n_seed,
)

run 0
run 1


In [None]:
# Persist the DrugBank v5 cross-validation results as CSV files.
folder = 'results/disjoint/'
# makedirs(..., exist_ok=True) also creates a missing parent `results/` directory
# (os.mkdir raises FileNotFoundError in that case) and does nothing when the
# folder already exists, so no separate isdir check is needed.
os.makedirs(folder, exist_ok=True)
drugwise_runs.to_csv(os.path.join(folder, 'DB5_DCV_runs_sg.csv'))
pairwise_runs.to_csv(os.path.join(folder, 'DB5_PCV_runs_sg.csv'))