In [1]:
from parse_and_prepare import ProteinProteinInteractionClassifier as ppi
import prediction as pred
from sklearn.metrics import roc_curve, auc
import pickle



In [2]:
# Input resource names passed as the first argument to the classifier
# constructor further down in this notebook.
# NOTE(review): these are yeast-named resources, while the pickles loaded in
# this notebook come from Results/mouse -- presumably unused when pre-loaded
# pickles are supplied; confirm against parse_and_prepare.
files = [
    'MEDLINE_FILES',
    'yeast_mentions',
    'yeast_entities.tsv',
    '4932.protein.actions.v10.txt',
]

In [3]:
def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed as soon as loading finishes (the previous inline
    ``pickle.load(open(...))`` calls never closed their files).

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file. Only point this at pickles produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): the ``dros_`` prefix does not match the data, which is read
# from Results/mouse -- presumably the names were carried over from another
# notebook. They are kept unchanged because the following cell refers to them.

# Sentence-level pair tables (strict / general / both-entities variants).
dros_strict = _load_pickle('Results/mouse/strict_pairs_w_sen_df.pkl')
dros_gen = _load_pickle('Results/mouse/gen_pairs_w_sen_df.pkl')
dros_both_ents = _load_pickle('Results/mouse/both_ents_w_sen_df.pkl')

# Matching "real" objects for each of the three variants above.
dros_strict_real = _load_pickle('Results/mouse/strict_real.pkl')
dros_gen_real = _load_pickle('Results/mouse/gen_real.pkl')
dros_be_real = _load_pickle('Results/mouse/be_real.pkl')

In [4]:
# Pre-loaded objects in the order handed to the classifier constructor:
# each variant is immediately followed by its "*_real" counterpart.
dros_pickle_list = [
    dros_strict,
    dros_strict_real,
    dros_gen,
    dros_gen_real,
    dros_both_ents,
    dros_be_real,
]

In [5]:
# Build the classifier from the already-unpickled objects instead of raw files.
# NOTE(review): full_sen_set=True presumably selects the complete sentence
# set -- confirm against ProteinProteinInteractionClassifier.
clf = ppi(
    files,
    pre_loaded_files=dros_pickle_list,
    full_sen_set=True,
)

In [6]:
# --- Build the word2vec models and the three "strict" feature sets ---------
# The string arguments look like names/tags for the produced artefacts --
# confirm in prediction.py.
gen_model = pred.make_w2v_model(clf.gen_real, 'mouse_gen_real')
both_ents_model = pred.make_w2v_model(clf.be_real, 'mouse_both_ents')
strict_list_pure = pred.make_models(clf.strict_real, 'mouse_strict_real')
strict_list_gen_mod = pred.make_models(clf.strict_real,
                                       'mouse_strict_real_gen_mod',
                                       prev_model=gen_model)
strict_list_be_mod = pred.make_models(clf.strict_real,
                                      'mouse_strict_real_be_mod',
                                      prev_model=both_ents_model)

strict_final_list = [strict_list_pure, strict_list_gen_mod, strict_list_be_mod]


def _evaluate_entry(entry):
    """Run the four classifier configurations on one ``make_models`` result.

    The configurations are evaluated in a fixed order:

    1. ``entry[0]``/``entry[1]`` without feature selection
    2. ``entry[0]``/``entry[1]`` with ``feature_selection=True``
    3. ``entry[2]``/``entry[3]`` without feature selection
    4. ``entry[2]``/``entry[3]`` with ``feature_selection=True``

    NOTE(review): indices 0/1 are presumably the bag-of-words features and
    2/3 the word2vec features, with ``entry[4]``/``entry[5]`` the train/test
    labels -- confirm against ``pred.make_models``. ``entry[5]`` is what is
    passed to ``roc_curve`` as the true labels.

    Returns:
        Three lists ``(errors, fprs, tprs)``, each holding one item per
        configuration in the order listed above.
    """
    # (first feature index, second feature index, extra keyword arguments).
    # The plain runs pass no keyword at all so the classifier's own default
    # for feature selection stays in effect.
    runs = (
        (0, 1, {}),
        (0, 1, {'feature_selection': True}),
        (2, 3, {}),
        (2, 3, {'feature_selection': True}),
    )
    entry_errors, entry_fprs, entry_tprs = [], [], []
    for first_idx, second_idx, extra_kwargs in runs:
        run_error, run_probs = pred.XGB_classifier(entry[first_idx],
                                                   entry[second_idx],
                                                   entry[4], entry[5],
                                                   **extra_kwargs)
        run_fpr, run_tpr, _thresholds = roc_curve(entry[5], run_probs)
        entry_errors.append(run_error)
        entry_fprs.append(run_fpr)
        entry_tprs.append(run_tpr)
    return entry_errors, entry_fprs, entry_tprs


print('\nPredicting\n')
errors = []
fpr = []
tpr = []

# One row per feature set; each row holds the four configurations' results.
for entry in strict_final_list:
    entry_errors, entry_fprs, entry_tprs = _evaluate_entry(entry)
    errors.append(entry_errors)
    fpr.append(entry_fprs)
    tpr.append(entry_tprs)

# Persist the results. The files are written inside ``with`` blocks so each
# handle is flushed and closed before the cell finishes.
for results, out_path in ((errors, 'Results/mouse_errors_pickle.pkl'),
                          (fpr, 'Results/mouse_fpr_pickle.pkl'),
                          (tpr, 'Results/mouse_tpr_pickle.pkl')):
    with open(out_path, 'wb') as handle:
        pickle.dump(results, handle)

Parsing datasets sentences


2017-04-18 08:45:56,309 : INFO : collecting all words and their counts
2017-04-18 08:45:56,310 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-18 08:45:56,359 : INFO : PROGRESS: at sentence #10000, processed 259262 words, keeping 15226 word types
2017-04-18 08:45:56,407 : INFO : PROGRESS: at sentence #20000, processed 517364 words, keeping 21166 word types
2017-04-18 08:45:56,464 : INFO : PROGRESS: at sentence #30000, processed 777394 words, keeping 25450 word types


Training Word2Vec Model


2017-04-18 08:45:56,530 : INFO : PROGRESS: at sentence #40000, processed 1036868 words, keeping 29018 word types
2017-04-18 08:45:56,589 : INFO : PROGRESS: at sentence #50000, processed 1294337 words, keeping 31995 word types
2017-04-18 08:45:56,640 : INFO : PROGRESS: at sentence #60000, processed 1551786 words, keeping 34583 word types
2017-04-18 08:45:56,693 : INFO : PROGRESS: at sentence #70000, processed 1808375 words, keeping 36895 word types
2017-04-18 08:45:56,744 : INFO : PROGRESS: at sentence #80000, processed 2067727 words, keeping 39151 word types
2017-04-18 08:45:56,794 : INFO : PROGRESS: at sentence #90000, processed 2325816 words, keeping 41194 word types
2017-04-18 08:45:56,843 : INFO : PROGRESS: at sentence #100000, processed 2586462 words, keeping 43104 word types
2017-04-18 08:45:56,895 : INFO : PROGRESS: at sentence #110000, processed 2844031 words, keeping 44882 word types
2017-04-18 08:45:56,944 : INFO : PROGRESS: at sentence #120000, processed 3102955 words, keepi

Parsing datasets sentences
Training Word2Vec Model


2017-04-18 08:48:15,788 : INFO : collecting all words and their counts
2017-04-18 08:48:18,524 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-18 08:48:18,604 : INFO : PROGRESS: at sentence #10000, processed 238749 words, keeping 15988 word types
2017-04-18 08:48:18,646 : INFO : PROGRESS: at sentence #20000, processed 477557 words, keeping 22103 word types
2017-04-18 08:48:18,687 : INFO : PROGRESS: at sentence #30000, processed 716420 words, keeping 26653 word types
2017-04-18 08:48:18,729 : INFO : PROGRESS: at sentence #40000, processed 957055 words, keeping 30248 word types
2017-04-18 08:48:18,771 : INFO : PROGRESS: at sentence #50000, processed 1197358 words, keeping 33528 word types
2017-04-18 08:48:18,813 : INFO : PROGRESS: at sentence #60000, processed 1435151 words, keeping 36425 word types
2017-04-18 08:48:18,854 : INFO : PROGRESS: at sentence #70000, processed 1673446 words, keeping 39001 word types
2017-04-18 08:48:18,897 : INFO : PROGRESS: 

Parsing datasets sentences


2017-04-18 08:53:48,162 : INFO : collecting all words and their counts
2017-04-18 08:53:48,162 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-18 08:53:48,210 : INFO : PROGRESS: at sentence #10000, processed 269730 words, keeping 16788 word types
2017-04-18 08:53:48,260 : INFO : PROGRESS: at sentence #20000, processed 539647 words, keeping 23198 word types
2017-04-18 08:53:48,311 : INFO : PROGRESS: at sentence #30000, processed 809503 words, keeping 27981 word types


Training Word2Vec Model


2017-04-18 08:53:48,367 : INFO : PROGRESS: at sentence #40000, processed 1079707 words, keeping 31987 word types
2017-04-18 08:53:48,433 : INFO : PROGRESS: at sentence #50000, processed 1349479 words, keeping 35464 word types
2017-04-18 08:53:48,498 : INFO : PROGRESS: at sentence #60000, processed 1620129 words, keeping 38670 word types
2017-04-18 08:53:48,562 : INFO : PROGRESS: at sentence #70000, processed 1891575 words, keeping 41524 word types
2017-04-18 08:53:48,628 : INFO : PROGRESS: at sentence #80000, processed 2160427 words, keeping 44146 word types
2017-04-18 08:53:48,668 : INFO : collected 46216 word types from a corpus of 2368755 raw words and 87723 sentences
2017-04-18 08:53:48,669 : INFO : Loading a fresh vocabulary
2017-04-18 08:53:48,716 : INFO : min_count=5 retains 14788 unique words (31% of original 46216, drops 31428)
2017-04-18 08:53:48,717 : INFO : min_count=5 leaves 2318737 word corpus (97% of original 2368755, drops 50018)
2017-04-18 08:53:48,822 : INFO : deletin

Cleaning and parsing the training set articles

Cleaning and parsing the testing set articles



  feature_vec = np.divide(feature_vec,nwords)


Cleaning and parsing the training set articles

Cleaning and parsing the testing set articles

Cleaning and parsing the training set articles

Cleaning and parsing the testing set articles


Predicting


 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions

 Fitting XGBoost Model!

 Making Predictions


In [7]:
# Print one "auc / error" summary line per classifier configuration, grouped
# by feature set, with a blank separator after each group.
# Each input_list item is [errors, fpr, tpr], paired with its label in
# name_list.
input_list = [[errors, fpr, tpr]]
name_list = ['mouse']

# One group per stored feature set (previously hard-coded as 3).
for idx in range(len(errors)):
    for item, name in zip(input_list, name_list):
        item_errors, item_fpr, item_tpr = item
        for fpr_item, tpr_item, error_item in zip(item_fpr[idx],
                                                  item_tpr[idx],
                                                  item_errors[idx]):
            roc_auc = auc(fpr_item, tpr_item)
            # Both numbers are shown with three decimals. The rounded error
            # string used to be computed but the raw float was printed.
            legend_label = '%s auc=%.3f error=%.3f' % (name, roc_auc, error_item)
            print(legend_label)
        print('\n')

mouse auc=0.634 error=0.2322173373915004
mouse auc=0.615 error=0.23289369856836883
mouse auc=0.663 error=0.23368278660804864
mouse auc=0.659 error=0.23796640739488217


mouse auc=0.634 error=0.2322173373915004
mouse auc=0.593 error=0.23345733288242587
mouse auc=0.679 error=0.23266824484274606
mouse auc=0.677 error=0.23435914778491718


mouse auc=0.634 error=0.2322173373915004
mouse auc=0.615 error=0.23244279111712318
mouse auc=0.685 error=0.23075188817495207
mouse auc=0.681 error=0.23086461503776345


