# Prediction of drug-drug interaction using RDF2Vec

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_curve, auc,average_precision_score
import numpy
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import cross_validate
from scipy import interp
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import argparse
import gc
import os

## Read RDF2Vec features

In [3]:
def crossvalidation(model, X, y, nsplits=10):
    """Score ``model`` with shuffled, stratified k-fold cross-validation.

    Args:
        model: Estimator handed to ``cross_validate``.
        X: Feature matrix; its ``shape`` is printed before evaluation.
        y: Label vector; its ``shape`` is printed before evaluation.
            The requested metrics (precision, recall, f1, ...) are used
            with their default settings -- presumably binary labels;
            verify against the caller.
        nsplits: Number of stratified folds to use.

    Returns:
        A DataFrame built from the dict returned by ``cross_validate``
        (timings plus one ``test_<metric>`` column per scoring entry).
    """
    print(X.shape)
    print(y.shape)
    scoring = ['precision', 'recall', 'accuracy', 'roc_auc', 'f1', 'average_precision']

    # Honour the caller's fold count; it used to be hard-coded to 10, which
    # silently ignored ``nsplits``.
    # NOTE: no random_state is given, so the shuffled folds differ per call.
    skf = StratifiedKFold(n_splits=nsplits, shuffle=True)
    scores = cross_validate(model, X, y, scoring=scoring, n_jobs=1, cv=skf)
    scores_df = pd.DataFrame.from_dict(scores)
    # Release fold-level temporaries before the next (large) model is fitted.
    gc.collect()
    return scores_df


def main(drugfeat, ddifile, random_state):
    """Build a balanced drug-pair dataset and cross-validate NB, LR and RF.

    Every unordered pair of drugs that occurs both in the DDI file and in
    the embedding file is labelled 1 when the pair (in either order) is
    listed in the DDI file and 0 otherwise. The negatives are down-sampled
    to the number of positives, each pair is joined with the embedding rows
    of both drugs, and three classifiers are evaluated with
    ``crossvalidation``.

    Args:
        drugfeat: Path to a tab-separated embedding file with a ``Drug``
            column; all remaining columns are used as features.
        ddifile: Path to a tab-separated DDI file with ``Drug1`` and
            ``Drug2`` columns.
        random_state: Seed forwarded to ``DataFrame.sample`` when the
            negative pairs are down-sampled. It is not forwarded to the
            classifiers or to the fold splitter.

    Returns:
        Tuple ``(nb_scores_df, lr_scores_df, rf_scores_df)`` holding the
        frames returned by ``crossvalidation`` for Gaussian naive Bayes,
        logistic regression and random forest.
    """
    import itertools

    print ("Processing file :",drugfeat)

    # ### Reading the RDF2Vec features
    embedding_df = pd.read_csv(drugfeat, delimiter='\t')

    # ### Reading the DrugBank v5.0 dataset
    drugbank_ddi = pd.read_csv(ddifile, delimiter='\t')

    print("Dataset size: %d" % len(drugbank_ddi))

    # Only drugs present in both sources can be paired and given features.
    drugsInDrugbankDDI = set(drugbank_ddi['Drug1'].unique()).union(drugbank_ddi['Drug2'].unique())
    commonDrugs = drugsInDrugbankDDI.intersection(embedding_df.Drug.unique())

    print("Size of the drugs that appear in both DrugBank DDI dataset and drug2vec dataset: ", len(commonDrugs))

    ddiKnown = set(zip(drugbank_ddi['Drug1'], drugbank_ddi['Drug2']))

    # Sorting keeps the pair order (and therefore the seeded negative
    # sample) independent of set iteration order.
    pairs = list(itertools.combinations(sorted(commonDrugs), 2))
    # The DDI list is treated as undirected: either orientation counts.
    classes = [
        int((dr1, dr2) in ddiKnown or (dr2, dr1) in ddiKnown)
        for dr1, dr2 in pairs
    ]

    pair_df = pd.DataFrame(pairs, columns=['Drug1', 'Drug2'])
    pair_df['Class'] = classes

    positives = pair_df[pair_df['Class'] == 1]
    negatives = pair_df[pair_df['Class'] == 0]

    print("DDI size: ", len(positives))
    print("non-DDI size: ",len(negatives))

    negatives = negatives.sample(len(positives), random_state=random_state) # for balanced class

    train_df = pd.concat([positives,negatives], ignore_index=True)
    print("Train size: ", len(train_df))

    # Attach the embedding of each drug in the pair; the shared column names
    # receive pandas' default '_x' (Drug1) and '_y' (Drug2) suffixes.
    train_df_emd = train_df.merge(embedding_df, left_on='Drug1', right_on='Drug').merge(embedding_df, left_on='Drug2', right_on='Drug')

    # Free the memory by deleting variables that are not needed anymore
    del drugbank_ddi
    del positives, negatives
    del pair_df, pairs, classes, ddiKnown, drugsInDrugbankDDI, commonDrugs

    gc.collect()

    # Everything except the identifier and label columns is a feature.
    features = train_df_emd.columns.difference(['Drug1','Drug2' ,'Class', 'Drug_x', 'Drug_y'])

    X = train_df_emd[features].values
    y = train_df_emd['Class'].values

    del train_df_emd
    gc.collect()

    print("Feature size: ", X.shape[1])

    # Number of distinct drugs that ended up in the balanced training set.
    drugs = set(train_df['Drug1'].unique()).union(train_df['Drug2'].unique())
    print (len(drugs))

    # ### Naive Bayes
    # There is no hyper-parameter to tune.
    nb_model = GaussianNB()
    print ("Naive Bayes")
    nb_scores_df = crossvalidation(nb_model, X, y, nsplits=10)

    # ### Logistic Regression
    # **Training with best parameters:**
    # Value for C parameter was selected as 0.01
    logistic_model = LogisticRegression(C=0.01)

    print ("Logistic Regression")
    lr_scores_df = crossvalidation(logistic_model, X, y, nsplits=10)

    # ### Random Forest
    # **Training with the best parameters:**
    rf_model = RandomForestClassifier(n_estimators=200, n_jobs=-1)

    print ("Random Forest")
    rf_scores_df = crossvalidation(rf_model, X, y, nsplits=10)

    return nb_scores_df, lr_scores_df, rf_scores_df

### Train with each set of embedding features and output the classifier results (NB, LR, RF)

 ### DRUGBANK DATASET

In [None]:
# Evaluate every embedding file in the DRUGBANK folder: repeat the experiment
# n_runs times (one negative-sampling seed per run) and store, per classifier,
# one row of fold-averaged scores for each run.
foldername = 'vectors/DRUGBANK/' 
ddi_file ='data/input/ddi_v5.txt'
outfolder='Results/DRUGBANK/'

n_runs=10

# DataFrame.to_csv does not create missing directories.
os.makedirs(outfolder, exist_ok=True)

# Sorted so the processing order does not depend on the file system.
for fn in sorted(os.listdir(foldername)):
    emdfile = os.path.join(foldername, fn)
    # Skip sub-directories (e.g. notebook checkpoint folders).
    if not os.path.isfile(emdfile):
        continue

    nb_rows, lr_rows, rf_rows = [], [], []
    for k in range(n_runs): 
        nb_scores, lr_scores, rf_scores = main(emdfile, ddi_file, random_state=k)
        # Collapse the per-fold scores of this run into a single mean row.
        nb_rows.append(nb_scores.mean())
        lr_rows.append(lr_scores.mean())
        rf_rows.append(rf_scores.mean())

    # DataFrame.append was removed in pandas 2.0; build each frame once from
    # the collected rows instead of growing it row by row.
    nb_scores_df = pd.DataFrame(nb_rows).reset_index(drop=True)
    lr_scores_df = pd.DataFrame(lr_rows).reset_index(drop=True)
    rf_scores_df = pd.DataFrame(rf_rows).reset_index(drop=True)

    # splitext instead of fn[:-4]: correct for any extension length.
    stem = os.path.splitext(fn)[0]
    nb_scores_df.to_csv(os.path.join(outfolder, stem + '_nb_results_pred.csv'))
    lr_scores_df.to_csv(os.path.join(outfolder, stem + '_lr_results_pred.csv'))
    rf_scores_df.to_csv(os.path.join(outfolder, stem + '_rf_results_pred.csv'))
    

Processing file : vectors/DRUGBANK/Drug2Vec_cbow_200_5_5_2_500_uniform.txt
Dataset size: 577712
Size of the drugs that appear in both DrugBank DDI dataset and drug2vec dataset:  2124
DDI size:  253449
non-DDI size:  2001177
Train size:  506898
Feature size:  400
2124
Naive Bayes
(506898, 400)
(506898,)
Logistic Regression
(506898, 400)
(506898,)
Random Forest
(506898, 400)
(506898,)
Processing file : vectors/DRUGBANK/Drug2Vec_cbow_200_5_5_2_500_uniform.txt
Dataset size: 577712
Size of the drugs that appear in both DrugBank DDI dataset and drug2vec dataset:  2124
DDI size:  253449
non-DDI size:  2001177
Train size:  506898
Feature size:  400
2124
Naive Bayes
(506898, 400)
(506898,)
Logistic Regression
(506898, 400)
(506898,)
Random Forest
(506898, 400)
(506898,)
Processing file : vectors/DRUGBANK/Drug2Vec_cbow_200_5_5_2_500_uniform.txt
Dataset size: 577712
Size of the drugs that appear in both DrugBank DDI dataset and drug2vec dataset:  2124
DDI size:  253449
non-DDI size:  2001177
Trai

 ### INTEGRATED (DRUGBANK, KEGG, PHARMGKB) DATASET

In [None]:

# Evaluate the 'trans' embedding files of the integrated dataset once each and
# store the per-fold scores of every classifier.
foldername =  'vectors/DB_KEGG_PGK/'
ddi_file ='data/input/ddi_v5.txt'
outfolder='Results/DB_KEGG_PGK/'
scores_runs = pd.DataFrame()

# Seed for the negative down-sampling inside main(). The previous call passed
# './' in this position, which DataFrame.sample rejects as a random_state.
random_state = 0

# DataFrame.to_csv does not create missing directories.
os.makedirs(outfolder, exist_ok=True)

for fn in sorted(os.listdir(foldername)):
    if 'trans' not in fn:
        continue
    emdfile = os.path.join(foldername, fn)
    # Skip sub-directories that happen to match the name filter.
    if not os.path.isfile(emdfile):
        continue

    nb_results_pred, lr_results_pred, rf_results_pred = main(emdfile, ddi_file, random_state)

    # splitext instead of fn[:-4]: correct for any extension length.
    stem = os.path.splitext(fn)[0]
    nb_results_pred.to_csv(os.path.join(outfolder, stem + '_nb_results_pred.csv'))
    lr_results_pred.to_csv(os.path.join(outfolder, stem + '_lr_results_pred.csv'))
    rf_results_pred.to_csv(os.path.join(outfolder, stem + '_rf_results_pred.csv'))
