In [48]:
import pickle

import numpy as np
import pandas as pd
from numpy import loadtxt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

## Testing on Test Set with Saved Ensemble Model

In [49]:
# Restore the fitted ensemble from disk.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open("Decision Tree Ensemble.pkl", "rb") as pkl_handle:
    model = pickle.load(pkl_handle)

In [50]:
# Echo the unpickled estimator so its configuration is visible in the cell output.
model

BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_features=0.2, max_samples=0.5, n_estimators=150,
                  n_jobs=-2, random_state=8)

In [51]:
# Read the held-out feature matrix and its labels from comma-separated text files.
X_test = loadtxt("X_test.csv", delimiter=",")
y_test = loadtxt("y_test.csv", delimiter=",")

# Sanity check: rows and columns of the features, then the number of labels.
n_test_rows, n_test_cols = len(X_test), len(X_test[0])
print(n_test_rows, n_test_cols)
print(len(y_test))

5070 278
5070


In [59]:
# Label the output, then let the cell display the model's score on the test set.
print("Accuracy test score")
model.score(X_test, y_test)

Accuracy test score


0.7471400394477318

In [53]:
# Hard class labels for the test set; consumed later by the confusion matrix.
prediction = model.predict(X_test)

In [54]:
# Per-class probability estimates; column 1 is used later as the ROC score.
predict_prob = model.predict_proba(X_test)

In [58]:
# Score of the positive class — presumably column 1 corresponds to label 1;
# verify against model.classes_.
positive_scores = predict_prob[:, 1]

# ROC curve points, followed by the area under that curve.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
print("ROC_AUC test score")
print(roc_auc_score(y_test, positive_scores))

ROC_AUC test score
0.822719248081105


In [128]:
# `confusion_matrix` is imported in the notebook's top import cell rather than
# mid-notebook, so all dependencies are visible in one place.
# scikit-learn convention: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, prediction)

array([[1392, 1143],
       [ 139, 2396]], dtype=int64)

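From the confusion matrix, it is clear that the model struggles with false positives: 1,143 of the 2,535 true negatives are predicted as positive, compared with only 139 false negatives.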

The model could likely be improved further by adding more decision trees, but this was not attempted because of computational constraints.

## Predicting Target and Drugs 

In [82]:
# Read the table without a header row (header=None) and name the columns explicitly.
dti_columns = ["Drug", "Target", "Interactions"]
DTI = pd.read_csv("d.csv", delimiter=",", header=None)
DTI.columns = dti_columns

In [77]:
# Load the drug-side and target-side feature matrices for this drug's candidate pairs.
ESC_drug = loadtxt("ESC_drug.csv", delimiter=",")
ESC_target = loadtxt("ESC_target.csv", delimiter=",")

# Report rows and columns of each matrix (drug first, then target).
for features in (ESC_drug, ESC_target):
    print(len(features), len(features[0]))

3348 56
3348 222


In [91]:
# Positional row window of DTI for this drug: stop - start = 3,348 rows,
# the same row count printed for the ESC feature matrices.
ESC_ROW_START, ESC_ROW_STOP = 3224124, 3227472
ESC_DTI = DTI.iloc[ESC_ROW_START:ESC_ROW_STOP].to_numpy()

In [92]:
# Place drug features and target features side by side (56 + 222 = 278 columns,
# the same width as X_test).
ESC = np.concatenate([ESC_drug, ESC_target], axis=1)

In [105]:
# Append the model's class probabilities to the (Drug, Target, Interaction) rows.
# Mixing the string identifiers with floats makes the concatenated array object-typed.
ESC_Scores = np.concatenate((ESC_DTI, model.predict_proba(ESC)), axis=1)
ESC_final = (
    pd.DataFrame(
        ESC_Scores,
        columns=["Drug", "Target", "Interaction", "0 Prob", "1 Prob"],
    )
    # Restore a numeric dtype so the probabilities are ranked as floats, not objects.
    .astype({"0 Prob": float, "1 Prob": float})
    # Non-mutating sort keeps the cell idempotent on re-run (no inplace=True).
    .sort_values(by="1 Prob", ascending=False)
)
# Show the 30 pairs with the highest predicted probability for class 1.
ESC_final.head(30)

Unnamed: 0,Drug,Target,Interaction,0 Prob,1 Prob
2712,DB01175,Q7DB61,0,0.0466667,0.953333
2479,DB01175,Q16740,0,0.0533333,0.946667
1814,DB01175,P45494,0,0.06,0.94
3066,DB01175,Q9HBL8,0,0.06,0.94
58,DB01175,O08498,0,0.0666667,0.933333
2709,DB01175,Q7CL96,0,0.0666667,0.933333
2928,DB01175,Q96C36,0,0.0666667,0.933333
2530,DB01175,Q46856,0,0.0666667,0.933333
1487,DB01175,P29074,0,0.0733333,0.926667
775,DB01175,P0A6I9,0,0.0733333,0.926667


In [115]:
# Load the drug-side and target-side feature matrices for this drug's candidate pairs.
Lev_drug = loadtxt("lev_drug.csv", delimiter=",")
Lev_target = loadtxt("lev_target.csv", delimiter=",")

# Report rows and columns of the drug feature matrix.
lev_drug_shape = (len(Lev_drug), len(Lev_drug[0]))
print(*lev_drug_shape)

3348 56


In [118]:
# Positional row window of DTI for this drug: stop - start = 3,348 rows.
# NOTE(review): the recorded outputs for both the Lev and Arip tables list Drug
# DB01238 even though the two slices are disjoint — confirm these bounds select
# the intended drug.
LEV_ROW_START, LEV_ROW_STOP = 3408264, 3411612
Lev_DTI = DTI.iloc[LEV_ROW_START:LEV_ROW_STOP].to_numpy()

In [119]:
# Place drug features and target features side by side to form the model input.
Lev = np.concatenate([Lev_drug, Lev_target], axis=1)

In [126]:
# Append the model's class probabilities to the (Drug, Target, Interaction) rows.
# Mixing the string identifiers with floats makes the concatenated array object-typed.
Lev_Scores = np.concatenate((Lev_DTI, model.predict_proba(Lev)), axis=1)
Lev_final = (
    pd.DataFrame(
        Lev_Scores,
        columns=["Drug", "Target", "Interaction", "0 Prob", "1 Prob"],
    )
    # Restore a numeric dtype so the probabilities are ranked as floats, not objects.
    .astype({"0 Prob": float, "1 Prob": float})
    # Non-mutating sort keeps the cell idempotent on re-run (no inplace=True).
    .sort_values(by="1 Prob", ascending=False)
)
# Show the 30 pairs with the highest predicted probability for class 1.
Lev_final.head(30)

Unnamed: 0,Drug,Target,Interaction,0 Prob,1 Prob
2797,DB01238,Q8GIQ0,0,0.0466667,0.953333
2658,DB01238,Q6N063,0,0.0533333,0.946667
2712,DB01238,Q7DB61,0,0.0533333,0.946667
3110,DB01238,Q9LCC8,0,0.06,0.94
2577,DB01238,Q55793,0,0.06,0.94
500,DB01238,P05067,0,0.06,0.94
2884,DB01238,Q8ZIV7,0,0.06,0.94
1405,DB01238,P25787,0,0.0666667,0.933333
523,DB01238,P05451,0,0.0666667,0.933333
3026,DB01238,Q9F0I5,0,0.0666667,0.933333


In [121]:
# Load the drug-side and target-side feature matrices for this drug's candidate pairs.
Arip_drug = loadtxt("arip_drug.csv", delimiter=",")
Arip_target = loadtxt("arip_target.csv", delimiter=",")

# Report rows and columns of the drug feature matrix.
arip_drug_shape = (len(Arip_drug), len(Arip_drug[0]))
print(*arip_drug_shape)

3348 56


In [123]:
# Positional row window of DTI for this drug: stop - start = 3,348 rows.
# NOTE(review): the recorded outputs for both the Lev and Arip tables list Drug
# DB01238 even though the two slices are disjoint — confirm these bounds select
# the intended drug.
ARIP_ROW_START, ARIP_ROW_STOP = 3418308, 3421656
Arip_DTI = DTI.iloc[ARIP_ROW_START:ARIP_ROW_STOP].to_numpy()

In [124]:
# Place drug features and target features side by side to form the model input.
Arip = np.concatenate([Arip_drug, Arip_target], axis=1)

In [127]:
# Append the model's class probabilities to the (Drug, Target, Interaction) rows.
# Mixing the string identifiers with floats makes the concatenated array object-typed.
Arip_Scores = np.concatenate((Arip_DTI, model.predict_proba(Arip)), axis=1)
Arip_final = (
    pd.DataFrame(
        Arip_Scores,
        columns=["Drug", "Target", "Interaction", "0 Prob", "1 Prob"],
    )
    # Restore a numeric dtype so the probabilities are ranked as floats, not objects.
    .astype({"0 Prob": float, "1 Prob": float})
    # Non-mutating sort keeps the cell idempotent on re-run (no inplace=True).
    .sort_values(by="1 Prob", ascending=False)
)
# Show the 30 pairs with the highest predicted probability for class 1.
Arip_final.head(30)

Unnamed: 0,Drug,Target,Interaction,0 Prob,1 Prob
557,DB01238,P06732,0,0.04,0.96
2112,DB01238,P67910,0,0.0466667,0.953333
2712,DB01238,Q7DB61,0,0.0466667,0.953333
1814,DB01238,P45494,0,0.0466667,0.953333
3026,DB01238,Q9F0I5,0,0.0533333,0.946667
3288,DB01238,Q9X6R4,0,0.06,0.94
1488,DB01238,P29166,0,0.06,0.94
1420,DB01238,P26394,0,0.06,0.94
35,DB01238,O00469,0,0.06,0.94
46,DB01238,O06414,0,0.06,0.94


These predictions may not be fully accurate, because the model was trained under the initial assumption that all negative samples are strictly negative. In truth, some of those pairs may be interactions that have not yet been discovered, which is why some papers approach the problem with positive-unlabeled (PU) learning instead.