In [165]:
import numpy
from read_data import *
from inference import *
from interaction_labelling import *
from feature_generation import *
from model import *
from helper import *
from sklearn.model_selection import StratifiedKFold

<b> Import Data - list of drugs and their corresponsding Smiles and drug-drug interaction

In [9]:
drug_list, smiles_dict = read_from_file('../data/sample/full_database.xml')



In [10]:

interactions = generate_interactions(drug_list, smiles_dict)


{'bivalirudin': 'CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(O)=O)NC(=O)CNC(=O)[C@H](CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)CC1=CC=CC=C1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](CC(C)C)C(O)=O',
 'leuprolide': 'CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](CC(C)C)NC(=O)[C@H](CC1=CC=C(O)C=C1)NC(=O)[C@H](CO)NC(=O)[C@H](CC1=CNC2=C1C=CC=C2)NC(=O)[C@H](CC1=CNC=N1)NC(=O)[C@@H]1CCC(=O)N1',
 'goserelin': 'CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H](CC1=CC=C(O)C=C1)NC(=O)[C@H](CO)NC(=O)[C@H](CC1=CNC2=CC=CC=C12)NC(=O)[C@H](CC1=CN=CN1)NC(=O)[C@@H]1CCC(=O)N1)C(=O)N[C@@H](CCCN=C(N)N)C(=O)N1CCC[C@H]1C(=O)NNC(N)=O',
 'gramicidin d': 'CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(=O)N[C@@H](C)C(=O)N[C@H](C(C)C)C(=O)N[C@@H](C(C)C)C(=O)N[C@H](C(C)C)C(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)N[C@H](C

<b> Data Cleansing & Label Creation

In [11]:
relation_list = generate_relations(interactions)

In [12]:
len(relation_list)

2308260

In [13]:
relation_list_new, filter_count = remove_duplicates(relation_list)

In [14]:
len(relation_list_new)

1154130

In [15]:
relation_list_new2, filter_count2 = filter_unknowns(relation_list_new)

In [16]:
len(relation_list_new2)

937860

In [17]:
filter_count

1154130

In [18]:
filter_count2

216270

In [19]:
label_map, label_lookup = generate_labels(relation_list_new2)

In [20]:
print(label_map)


{'increase anticoagulant activities': 0, 'therapeutic efficacy increased': 1, 'decrease anticoagulant activities': 2, 'adverse effects increased': 3, 'therapeutic efficacy decreased': 4, 'increase antiplatelet activities': 5, 'decrease effectiveness': 6, 'decrease excretion rate': 7, 'may increase the excretion rate': 8, 'serum concentration decreased': 9, 'serum concentration increased': 10, 'decrease antihypertensive activities': 11, 'increase hypertensive activities': 12, 'increase immunosuppressive activities': 13, 'increase nephrotoxic activities': 14, 'increase neurotoxic activities': 15, 'increase absorption': 16, 'metabolism increased': 17, 'metabolism decreased': 18, 'decrease absorption': 19, 'serum concentration metabolites reduced': 20, 'increase hyperkalemic activities': 21, 'increase vasoconstricting activities': 22, 'increase bradycardic activities': 23, 'increase hepatotoxic activities': 24, 'increase photosensitizing activities': 25, 'increase central nervous system de

In [21]:
print(label_lookup)

{0: 'increase anticoagulant activities', 1: 'therapeutic efficacy increased', 2: 'decrease anticoagulant activities', 3: 'adverse effects increased', 4: 'therapeutic efficacy decreased', 5: 'increase antiplatelet activities', 6: 'decrease effectiveness', 7: 'decrease excretion rate', 8: 'may increase the excretion rate', 9: 'serum concentration decreased', 10: 'serum concentration increased', 11: 'decrease antihypertensive activities', 12: 'increase hypertensive activities', 13: 'increase immunosuppressive activities', 14: 'increase nephrotoxic activities', 15: 'increase neurotoxic activities', 16: 'increase absorption', 17: 'metabolism increased', 18: 'metabolism decreased', 19: 'decrease absorption', 20: 'serum concentration metabolites reduced', 21: 'increase hyperkalemic activities', 22: 'increase vasoconstricting activities', 23: 'increase bradycardic activities', 24: 'increase hepatotoxic activities', 25: 'increase photosensitizing activities', 26: 'increase central nervous syste

In [22]:
relation_list3, filter_count3 = filter_less_frequent_labels_v2(relation_list_new2,300)

In [23]:
len(relation_list3)

936800

In [24]:
print(filter_count3)

1060


In [25]:
for item in relation_list3[0:2]:
    print('********')
    print(item.subject)
    print(item.object)
    print(item.normalized_relation)
    print(item.description)
    print(item.relation)

********
apixaban
bivalirudin
increase anticoagulant activities
apixaban may increase the anticoagulant activities of bivalirudin.
 may increase the anticoagulant activities of
********
dabigatran etexilate
bivalirudin
increase anticoagulant activities
dabigatran etexilate may increase the anticoagulant activities of bivalirudin.
 may increase the anticoagulant activities of


<b> Feature Generation (SMILEs to ECFP)

In [26]:
smiles_feature_list, interaction_label_list, drug_pair_list = featurize_smiles_and_interactions(relation_list3,smiles_to_ECFP,smiles_dict, label_map)

RDKit ERROR: [16:05:16] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [16:05:16] Explicit valence for atom # 2 O, 3, is greater than permitted
RDKit ERROR: [16:05:16] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [16:05:16] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [16:05:16] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [16:05:17] Explicit valence for atom # 0 Mg, 4, is greater than permitted


In [27]:
print(len(smiles_feature_list[0]))

4096


In [28]:
print(interaction_label_list[0:5000])

[0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 0, 4, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 

In [29]:
len(interaction_label_list)

932722

In [30]:
len(smiles_feature_list)

932722

In [39]:
print(smiles_feature_list[0])

[0 0 0 ... 0 0 0]


<b> Modeling (mlp)

In [51]:
X = smiles_feature_list
y = interaction_label_list

In [None]:
X_arr = np.array(X)
y_arr = np.array(y)

In [97]:
X_arr_small = X_arr[0:5000]
y_arr_small = y_arr[0:5000]

In [130]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)

In [131]:
for train_index, test_index in skf.split(X_arr_small, y_arr_small):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_arr_small[train_index], X_arr_small[test_index]
    y_train, y_test = y_arr_small[train_index], y_arr_small[test_index]

TRAIN: [   0    1    3 ... 4997 4998 4999] TEST: [   2   12   14   25   28   30   31   39   45   46   47   50   53   59
   72   81   82   83   84   92   97   99  103  107  110  116  117  126
  130  144  145  148  149  155  157  164  169  175  181  186  187  192
  195  199  202  203  207  210  212  216  217  221  225  233  236  242
  248  253  261  267  277  278  289  297  303  305  309  310  313  314
  321  323  326  327  328  334  344  346  360  362  363  366  368  375
  380  382  390  393  394  402  403  413  415  418  422  428  436  440
  458  459  460  466  472  473  475  478  489  497  505  519  521  523
  525  531  544  546  559  572  579  580  586  593  596  599  603  606
  608  610  613  624  626  629  633  647  649  650  651  656  661  683
  687  694  698  701  710  712  713  715  719  721  725  728  741  746
  747  767  768  770  773  783  787  793  795  796  799  803  810  815
  816  820  842  843  849  852  855  863  864  877  878  879  886  889
  895  897  907  918  924  9



In [132]:
print(len(X_train),len(X_test))

4000 1000


In [133]:
print(len(y_train), len(y_test))

4000 1000


In [137]:
model_rf = rf_train(X_train,y_train)


In [138]:
accuracy, precision, recall, f1 = generate_model_report(model_rf,X_test,y_test)

Accuracy:  0.819
Precision:  0.8211703135806011
Recall:  0.819
F1 Score:  0.8091006450783397


  _warn_prf(average, modifier, msg_start, len(result))


<b> Model Evaluation

In [139]:
model_svm = svm_train(X_train,y_train)

In [140]:
accuracy, precision, recall, f1 = generate_model_report(model_svm,X_test,y_test)

Accuracy:  0.79
Precision:  0.7938886361464229
Recall:  0.79
F1 Score:  0.769488199373114


  _warn_prf(average, modifier, msg_start, len(result))


In [158]:
for item in smiles_dict:
    print(smiles_dict[item])
    break


CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CCC(O)=O)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(O)=O)NC(=O)CNC(=O)[C@H](CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=O)[C@@H]1CCCN1C(=O)[C@H](N)CC1=CC=CC=C1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CCC(O)=O)C(=O)N[C@@H](CC1=CC=C(O)C=C1)C(=O)N[C@@H](CC(C)C)C(O)=O


In [166]:
process_and_tokenize(smiles_dict[item])

NameError: name 'canonicalize_smiles' is not defined