In [1]:
import csv
import numpy as np
import sys
import pandas as pd
import itertools
import math
import time

from sklearn import svm, linear_model, neighbors
from sklearn import tree, ensemble
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

import networkx as nx
import random
import numbers

from sklearn.model_selection import StratifiedKFold

from src import ml

  from numpy.core.umath_tests import inner1d


In [2]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext

In [3]:
if False: 
    sc.stop()

config = SparkConf()
config.setMaster("local[10]")
config.set("spark.executor.memory", "70g")
config.set('spark.driver.memory', '90g')
config.set("spark.memory.offHeap.enabled",True)
config.set("spark.memory.offHeap.size","50g") 
sc = SparkContext(conf=config)
print (sc)

<SparkContext master=local[10] appName=pyspark-shell>


# DRUGBANK 4.0

In [4]:
ddi_df = pd.read_csv("data/input/ddi_v4.txt", sep='\t')
ddi_df.head()

Unnamed: 0,Drug1,Drug2
0,DB00005,DB00026
1,DB00026,DB00005
2,DB00005,DB00065
3,DB00065,DB00005
4,DB00005,DB00072


In [5]:
featureFilename = "vectors/DB/RDF2Vec_sg_200_5_5_15_2_500_d5_uniform.txt"
embedding_df = pd.read_csv(featureFilename, delimiter='\t') 
embedding_df.Entity =embedding_df.Entity.str[-8:-1]
embedding_df.rename(columns={'Entity':'Drug'}, inplace=True)

In [6]:
pairs, classes = ml.generatePairs(ddi_df, embedding_df)

In [7]:
nb_model = GaussianNB()
lr_model = linear_model.LogisticRegression()
rf_model = ensemble.RandomForestClassifier(n_estimators=200, max_depth=8, n_jobs=-1)

clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]

In [8]:
n_seed =100
n_fold =10 
n_run =10 
n_proportion = 1,
all_scores_df = ml.kfoldCV(sc, pairs, classes, embedding_df, clfs, n_run, n_fold, n_proportion, n_seed)


+/-: 47077 47077 1015076
run 0
+/-: 47077 47077 1015076
run 1
+/-: 47077 47077 1015076
run 2
+/-: 47077 47077 1015076
run 3
+/-: 47077 47077 1015076
run 4
+/-: 47077 47077 1015076
run 5
+/-: 47077 47077 1015076
run 6
+/-: 47077 47077 1015076
run 7
+/-: 47077 47077 1015076
run 8
+/-: 47077 47077 1015076
run 9


In [10]:
all_scores_df.groupby(['method','run']).mean().groupby('method').mean()

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.696858,0.748784,0.699954,4.5,0.692892,0.707192,0.76272
Naive Bayes,0.659448,0.711984,0.663262,4.5,0.65592,0.670797,0.716468
Random Forest,0.76847,0.836724,0.772115,4.5,0.760148,0.784502,0.843221


In [11]:
all_scores_df.to_csv('results/traditional/DB4_TCV_run_sg.csv',sep=',', index=False)

# TWOSIDES 

In [12]:
ddi_df = pd.read_csv("data/input/ddi_twosides.txt", sep='\t')
ddi_df.head()

Unnamed: 0,Drug1,Drug2
0,DB00945,DB00526
1,DB00945,DB00458
2,DB00945,DB00996
3,DB00945,DB00454
4,DB00945,DB00213


In [13]:
len(ddi_df)

97168

In [14]:
len( set(ddi_df.Drug1.unique()).union(ddi_df.Drug2.unique()) )

548

In [15]:
featureFilename = "vectors/DB/RDF2Vec_sg_200_5_5_15_2_500_d5_uniform.txt"
embedding_df = pd.read_csv(featureFilename, delimiter='\t') 
embedding_df.Entity =embedding_df.Entity.str[-8:-1]
embedding_df.rename(columns={'Entity':'Drug'}, inplace=True)

In [16]:
pairs, classes = ml.generatePairs(ddi_df, embedding_df)

In [17]:
n_seed =100
n_fold =10 
n_run =10 
n_proportion = 1,
all_scores_df = ml.kfoldCV(sc, pairs, classes, embedding_df, clfs, n_run, n_fold, n_proportion, n_seed)

+/-: 47301 47301 94477
run 0
+/-: 47301 47301 94477
run 1
+/-: 47301 47301 94477
run 2
+/-: 47301 47301 94477
run 3
+/-: 47301 47301 94477
run 4
+/-: 47301 47301 94477
run 5
+/-: 47301 47301 94477
run 6
+/-: 47301 47301 94477
run 7
+/-: 47301 47301 94477
run 8
+/-: 47301 47301 94477
run 9


In [18]:
all_scores_df.groupby(['method','run']).mean().groupby('method').mean()

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.805453,0.893241,0.806192,4.5,0.803147,0.809281,0.888721
Naive Bayes,0.752422,0.838159,0.751185,4.5,0.75498,0.74746,0.829959
Random Forest,0.798184,0.883046,0.798963,4.5,0.795914,0.802074,0.882323


In [19]:
all_scores_df.to_csv('results/traditional/TWOSIDES_TCV_run_sg.csv',sep=',', index=False)

# KEGG

In [20]:
ddi_df = pd.read_csv("data/input/ddi_kegg.txt", sep='\t')
ddi_df.head()

Unnamed: 0,Drug1,Drug2
0,DB00114,DB01235
1,DB00114,DB00413
2,DB00114,DB00810
3,DB00114,DB00494
4,DB00114,DB00323


In [21]:
len(ddi_df)

40540

In [22]:
len( set(ddi_df.Drug1.unique()).union(ddi_df.Drug2.unique()) )

864

In [23]:
featureFilename = "vectors/DB/RDF2Vec_sg_200_5_5_15_2_500_d5_uniform.txt"
embedding_df = pd.read_csv(featureFilename, delimiter='\t') 
embedding_df.Entity =embedding_df.Entity.str[-8:-1]
embedding_df.rename(columns={'Entity':'Drug'}, inplace=True)

In [24]:
pairs, classes = ml.generatePairs(ddi_df, embedding_df)

In [25]:
nb_model = GaussianNB()
logistic_model = linear_model.LogisticRegression(C=0.01)
rf_model = ensemble.RandomForestClassifier(n_estimators=200,  max_depth=8, n_jobs=-1)
clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]

In [26]:
n_seed =100
n_fold =10 
n_run =10 
n_proportion = 1,
all_scores_df = ml.kfoldCV(sc, pairs, classes, embedding_df, clfs, n_run, n_fold, n_proportion, n_seed)

+/-: 19424 19424 313096
run 0
+/-: 19424 19424 313096
run 1
+/-: 19424 19424 313096
run 2
+/-: 19424 19424 313096
run 3
+/-: 19424 19424 313096
run 4
+/-: 19424 19424 313096
run 5
+/-: 19424 19424 313096
run 6
+/-: 19424 19424 313096
run 7
+/-: 19424 19424 313096
run 8
+/-: 19424 19424 313096
run 9


In [27]:
all_scores_df.groupby(['method','run']).mean().groupby('method').mean()

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.711738,0.769883,0.713828,4.5,0.708712,0.7191,0.782795
Naive Bayes,0.671322,0.732875,0.665936,4.5,0.677043,0.655292,0.735794
Random Forest,0.784519,0.851562,0.786984,4.5,0.778127,0.796128,0.863305


In [28]:
all_scores_df.to_csv('results/traditional/KEGG_TCV_run_sg.csv',sep=',', index=False)

# Drugbank 5

In [29]:
ddi_df = pd.read_csv("data/input/ddi_v5.txt", sep='\t')
ddi_df.head()

Unnamed: 0,Drug1,Drug2
0,DB00001,DB01048
1,DB00001,DB00054
2,DB00001,DB06736
3,DB00001,DB01418
4,DB00001,DB00945


In [30]:
featureFilename = "vectors/DB/RDF2Vec_sg_200_5_5_15_2_500_d5_uniform.txt"
embedding_df = pd.read_csv(featureFilename, delimiter='\t') 
embedding_df.Entity =embedding_df.Entity.str[-8:-1]
embedding_df.rename(columns={'Entity':'Drug'}, inplace=True)

In [31]:
len(ddi_df)

577712

In [32]:
len( set(ddi_df.Drug1.unique()).union(ddi_df.Drug2.unique()) )

2551

In [33]:
pairs, classes = ml.generatePairs(ddi_df, embedding_df)

In [34]:
nb_model = GaussianNB()
logistic_model = linear_model.LogisticRegression(C=0.01)
rf_model = ensemble.RandomForestClassifier(n_estimators=200,  max_depth=8, n_jobs=-1)
clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]

In [35]:
n_seed =100
n_fold =10 
n_run =10 
n_proportion = 1,
all_scores_df = ml.kfoldCV(sc, pairs, classes, embedding_df, clfs, n_run, n_fold, n_proportion, n_seed)

+/-: 253449 253449 2001177
run 0
+/-: 253449 253449 2001177
run 1
+/-: 253449 253449 2001177
run 2
+/-: 253449 253449 2001177
run 3
+/-: 253449 253449 2001177
run 4
+/-: 253449 253449 2001177
run 5
+/-: 253449 253449 2001177
run 6
+/-: 253449 253449 2001177
run 7
+/-: 253449 253449 2001177
run 8
+/-: 253449 253449 2001177
run 9


In [36]:
all_scores_df.groupby(['method','run']).mean().groupby('method').mean()

Unnamed: 0_level_0,accuracy,average_precision,f1,fold,precision,recall,roc_auc
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Logistic Regression,0.717088,0.78879,0.718784,4.5,0.714502,0.723125,0.78958
Naive Bayes,0.679526,0.756393,0.682259,4.5,0.67649,0.688134,0.74314
Random Forest,0.765631,0.845983,0.763037,4.5,0.771574,0.754696,0.843604


In [37]:
all_scores_df.to_csv('results/traditional/DBv5_TCV_run_sg.csv',sep=',', index=False)

# Drugbankv5 - RDF2Vec CBOW 

In [39]:
featureFilename = "vectors/DB/RDF2Vec_cbow_200_5_5_2_500_d5_uniform.txt"
embedding_df = pd.read_csv(featureFilename, delimiter='\t') 
embedding_df.Entity =embedding_df.Entity.str[-8:-1]
embedding_df.rename(columns={'Entity':'Drug'}, inplace=True)

In [40]:
pairs, classes = ml.generatePairs(ddi_df, embedding_df)

In [41]:
nb_model = GaussianNB()
logistic_model = linear_model.LogisticRegression(C=0.01)
rf_model = ensemble.RandomForestClassifier(n_estimators=200,  max_depth=8, n_jobs=-1)
clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]

In [None]:
n_seed =100
n_fold =10 
n_run =10 
n_proportion = 1,
all_scores_df = ml.kfoldCV(sc, pairs, classes, embedding_df, clfs, n_run, n_fold, n_proportion, n_seed)

+/-: 253449 253449 2001177
run 0
+/-: 253449 253449 2001177
run 1
+/-: 253449 253449 2001177
run 2
+/-: 253449 253449 2001177
run 3
+/-: 253449 253449 2001177
run 4
+/-: 253449 253449 2001177
run 5


In [None]:
all_scores_df.groupby(['method','run']).mean().groupby('method').mean()

In [None]:
all_scores_df.to_csv('results/traditional/DBv5_TCV_run_cbow.csv',sep=',', index=False)