In [1]:
import pickle
import os
import re
import numpy as np
from gensim.models import word2vec
import logging
import pandas as pd

In [2]:
import sys
sys.path.insert(0, '../Core-scripts/')

from parse_and_prepare import ProteinProteinInteractionClassifier as ppi
import file_readers as fr
import prediction as pred



In [3]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [4]:
# Seeds for the repeated experiment runs: the experiment cells iterate over
# this list and hand each value to pred.make_models as `ran_state`.
random_seeds = [
    144, 235, 905, 2895, 3462,
    4225, 5056, 5192, 7751, 7813,
]

In [5]:
def make_w2v_model(dataset, name_for_model, model_features=None):
    """Train a Word2Vec model on ``dataset`` and save it under ``Results/``.

    Args:
        dataset: Iterable of sentences. Each item is passed to
            ``fr.sentence_to_wordlist`` to obtain the word list used for
            training.
        name_for_model (str): Tag used to build the output path
            ``'Results/' + name_for_model + '_model'``.
        model_features (list, optional): Word2Vec settings, in this order:
            1. Word vector dimensionality
            2. Minimum word count
            3. Number of threads to run in parallel
            4. Context window size
            5. Downsample setting for frequent words
            When omitted (or empty) the defaults 600, 6, 4, 7 and 0.0001
            are used.

    Returns:
        The trained ``word2vec.Word2Vec`` model (the same object that was
        saved to disk).
    """

    print ('Parsing datasets sentences')

    sentences = [fr.sentence_to_wordlist(sen) for sen in dataset]

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    if model_features:
        num_features = model_features[0]    # Word vector dimensionality
        min_word_count = model_features[1]  # Minimum word count
        num_workers = model_features[2]     # Number of threads to run in parallel
        context = model_features[3]         # Context window size
        downsampling = model_features[4]    # Downsample setting for frequent words
    else:
        num_features = 600     # Word vector dimensionality
        min_word_count = 6     # Minimum word count
        num_workers = 4        # Number of threads to run in parallel
        context = 7            # Context window size
        downsampling = 0.0001  # Downsample setting for frequent words

    print('Training Word2Vec Model')

    model = word2vec.Word2Vec(sentences, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)

    # Precompute the normalized vectors. With replace=False the original
    # vectors are kept as well, so the model can still be trained further;
    # the memory saving only applies when replace=True.
    model.init_sims(replace=False)

    model_name = 'Results/' + name_for_model + '_model'

    # Make sure the output directory exists so that saving does not fail on
    # a fresh checkout.
    output_dir = os.path.dirname(model_name)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    model.save(model_name)

    return model

In [6]:
# Yeast using old model (w/o skip-grams and hierarchical softmax)
# NOTE: unpickling executes code embedded in the file, so these paths must
# only ever point at pickles produced by this project.

# Context managers close each file handle as soon as its pickle is loaded.
with open('../../Results/Yeast/yeast_mentions_strict_real.pkl', 'rb') as pkl_file:
    yeast_strict = pickle.load(pkl_file)
with open('../../Results/Yeast/yeast_mentions_gen_real.pkl', 'rb') as pkl_file:
    yeast_gen = pickle.load(pkl_file)
with open('../../Results/Yeast/yeast_mentions_be_real.pkl', 'rb') as pkl_file:
    yeast_be = pickle.load(pkl_file)

In [7]:
# Train one Word2Vec model per yeast mention set (strict, gen, be).
# NOTE(review): these calls go to pred.make_w2v_model from the imported
# `prediction` module, not to the make_w2v_model defined in this notebook.
# The log output recorded below reports 800 dimensions, min_count=7,
# 8 workers and window=9, which differ from the notebook-local defaults
# (600, 6, 4, 7) -- confirm which definition is intended.
yeast_strict_model = pred.make_w2v_model(yeast_strict, 'yeast_strict_old')
yeast_gen_model = pred.make_w2v_model(yeast_gen, 'yeast_gen_old')
yeast_be_model = pred.make_w2v_model(yeast_be, 'yeast_be_old')

2017-06-03 18:41:11,700 : INFO : collecting all words and their counts
2017-06-03 18:41:11,701 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-03 18:41:11,747 : INFO : collected 11431 word types from a corpus of 188505 raw words and 7145 sentences
2017-06-03 18:41:11,748 : INFO : Loading a fresh vocabulary
2017-06-03 18:41:11,758 : INFO : min_count=7 retains 2632 unique words (23% of original 11431, drops 8799)
2017-06-03 18:41:11,759 : INFO : min_count=7 leaves 171509 word corpus (90% of original 188505, drops 16996)
2017-06-03 18:41:11,766 : INFO : deleting the raw counts dictionary of 11431 items
2017-06-03 18:41:11,767 : INFO : sample=0.001 downsamples 42 most-common words
2017-06-03 18:41:11,768 : INFO : downsampling leaves estimated 121427 word corpus (70.8% of prior 171509)
2017-06-03 18:41:11,769 : INFO : estimated required memory for 2632 words and 800 dimensions: 18160800 bytes
2017-06-03 18:41:11,775 : INFO : resetting layer weights
2017-06

Parsing datasets sentences
Training Word2Vec Model


2017-06-03 18:41:12,846 : INFO : PROGRESS: at 66.70% examples, 399582 words/s, in_qsize 15, out_qsize 0
2017-06-03 18:41:13,185 : INFO : worker thread finished; awaiting finish of 7 more threads
2017-06-03 18:41:13,202 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-06-03 18:41:13,220 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-06-03 18:41:13,224 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-06-03 18:41:13,237 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-06-03 18:41:13,242 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-06-03 18:41:13,249 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-06-03 18:41:13,255 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-06-03 18:41:13,256 : INFO : training on 942525 raw words (607138 effective words) took 1.4s, 426286 effective words/s
2017-06-03 18:41:13,257 : INFO : precomputing

Parsing datasets sentences


2017-06-03 18:41:13,911 : INFO : collecting all words and their counts
2017-06-03 18:41:13,912 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-03 18:41:13,950 : INFO : PROGRESS: at sentence #10000, processed 242777 words, keeping 11313 word types
2017-06-03 18:41:13,990 : INFO : PROGRESS: at sentence #20000, processed 484644 words, keeping 15048 word types
2017-06-03 18:41:14,032 : INFO : PROGRESS: at sentence #30000, processed 727768 words, keeping 17577 word types
2017-06-03 18:41:14,077 : INFO : PROGRESS: at sentence #40000, processed 970746 words, keeping 19826 word types
2017-06-03 18:41:14,107 : INFO : collected 21147 word types from a corpus of 1155938 raw words and 47662 sentences
2017-06-03 18:41:14,107 : INFO : Loading a fresh vocabulary


Training Word2Vec Model


2017-06-03 18:41:14,127 : INFO : min_count=7 retains 6774 unique words (32% of original 21147, drops 14373)
2017-06-03 18:41:14,128 : INFO : min_count=7 leaves 1127019 word corpus (97% of original 1155938, drops 28919)
2017-06-03 18:41:14,146 : INFO : deleting the raw counts dictionary of 21147 items
2017-06-03 18:41:14,148 : INFO : sample=0.001 downsamples 48 most-common words
2017-06-03 18:41:14,149 : INFO : downsampling leaves estimated 821525 word corpus (72.9% of prior 1127019)
2017-06-03 18:41:14,149 : INFO : estimated required memory for 6774 words and 800 dimensions: 46740600 bytes
2017-06-03 18:41:14,175 : INFO : resetting layer weights
2017-06-03 18:41:14,296 : INFO : training model with 8 workers on 6774 vocabulary and 800 features, using sg=0 hs=0 sample=0.001 negative=5 window=9
2017-06-03 18:41:14,297 : INFO : expecting 47662 sentences, matching count from corpus used for vocabulary survey
2017-06-03 18:41:15,319 : INFO : PROGRESS: at 8.82% examples, 356142 words/s, in_qs

Parsing datasets sentences


2017-06-03 18:41:28,006 : INFO : collecting all words and their counts
2017-06-03 18:41:28,007 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-03 18:41:28,047 : INFO : PROGRESS: at sentence #10000, processed 228608 words, keeping 12333 word types
2017-06-03 18:41:28,087 : INFO : PROGRESS: at sentence #20000, processed 454197 words, keeping 16748 word types
2017-06-03 18:41:28,127 : INFO : PROGRESS: at sentence #30000, processed 685019 words, keeping 20179 word types
2017-06-03 18:41:28,168 : INFO : PROGRESS: at sentence #40000, processed 914521 words, keeping 22936 word types


Training Word2Vec Model


2017-06-03 18:41:28,209 : INFO : PROGRESS: at sentence #50000, processed 1143886 words, keeping 25156 word types
2017-06-03 18:41:28,252 : INFO : PROGRESS: at sentence #60000, processed 1373274 words, keeping 27237 word types
2017-06-03 18:41:28,293 : INFO : PROGRESS: at sentence #70000, processed 1601647 words, keeping 29018 word types
2017-06-03 18:41:28,335 : INFO : PROGRESS: at sentence #80000, processed 1829275 words, keeping 30682 word types
2017-06-03 18:41:28,377 : INFO : PROGRESS: at sentence #90000, processed 2057464 words, keeping 32267 word types
2017-06-03 18:41:28,418 : INFO : PROGRESS: at sentence #100000, processed 2285751 words, keeping 33700 word types
2017-06-03 18:41:28,460 : INFO : PROGRESS: at sentence #110000, processed 2514230 words, keeping 35034 word types
2017-06-03 18:41:28,501 : INFO : PROGRESS: at sentence #120000, processed 2743268 words, keeping 36382 word types
2017-06-03 18:41:28,543 : INFO : PROGRESS: at sentence #130000, processed 2972501 words, keep

In [8]:
import time

# Experiment: XGBoost with default settings, repeated for every seed in
# random_seeds and for each of the three Word2Vec models (SR, GEN, BE).
# NOTE(review): later cells in this notebook reuse name_of_result = 'yeast_OLD',
# so running them overwrites the pickles written here.
name_of_result = 'yeast_OLD'
strict_data = yeast_strict
w2v_strict = yeast_strict_model
w2v_gen = yeast_gen_model
w2v_be = yeast_be_model
xgb_clf = XGBClassifier(seed=24)

start = time.time()
for seed in random_seeds:
    # Build one data split per Word2Vec model; all three use the same
    # strict mention data and the same seed.
    strict_list_SR = pred.make_models(strict_data,
                                      name_of_result+'_SR_'+str(seed),
                                      prev_model=w2v_strict,
                                      ran_state=seed)
    strict_list_GEN = pred.make_models(strict_data,
                                       name_of_result+'_GEN_'+str(seed),
                                       prev_model=w2v_gen,
                                       ran_state=seed)
    strict_list_BE = pred.make_models(strict_data,
                                      name_of_result+'_BE_'+str(seed),
                                      prev_model=w2v_be,
                                      ran_state=seed)

    strict_final_list = [strict_list_SR,
                         strict_list_GEN,
                         strict_list_BE]

    print ('\nPredicting\n')
    accuracy = []
    probs = []
    fpr = []
    tpr = []
    labels = []
    auc_score = []
    report = []

    for entry, model_name in zip(strict_final_list,
                                 ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]):
        # NOTE(review): entry[0]/entry[2] and entry[1]/entry[3] are presumably
        # the train and test feature/label pairs returned by
        # pred.make_models -- confirm against prediction.py.
        (accuracy_norm, auc_score_norm, pred_labels_norm,
         probs_norm, class_report_norm) = pred.XGB_modelfit(xgb_clf,
                                                            entry[0],
                                                            entry[2],
                                                            entry[1],
                                                            entry[3],
                                                            model_name)
        # entry[3] serves as the true labels for the ROC curve.
        fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

        # Each result is wrapped in a one-element list, as in the stored format.
        accuracy.append([accuracy_norm])
        probs.append([probs_norm])
        fpr.append([fpr_norm])
        tpr.append([tpr_norm])
        labels.append([pred_labels_norm])
        auc_score.append([auc_score_norm])
        report.append([class_report_norm])

    # Persist every metric for this seed; `with` closes each file handle
    # right after its pickle is written.
    for metric_name, metric_values in (('accuracy', accuracy),
                                       ('probs', probs),
                                       ('fpr', fpr),
                                       ('tpr', tpr),
                                       ('labels', labels),
                                       ('auc_score', auc_score),
                                       ('report', report)):
        pickle_path = 'Results/'+name_of_result+'_'+metric_name+'_pickle_'+str(seed)+'.pkl'
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(metric_values, pickle_file)

print('Took ', time.time()-start, ' seconds')

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.7177
SR 144 AUC Score (Train): 0.737143
SR 144 Report 
              precision    recall  f1-score   support

          0       0.74      0.83      0.78        75
          1       0.68      0.55      0.61        49

avg / total       0.71      0.72      0.71       124

GEN 144 
Model Report
GEN 144 Accuracy: 0.7984
GEN 144 AUC Score (Train): 0.842721
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.78      0.92      0.85        75
          1       0.83      0.61      0.71        49

avg / total       0.80      0.80      0.79       124

BE 144 
Model Report
BE 144 Accuracy: 0.7661
BE 144 AUC Score (Train): 0.857415
BE 144 Report 
              precision    recall  f1-score   support

          0       0.77      0.88      0.82        75
          1       0.76      0.59      0.67        49

avg / total       0.77      0.77      0.76       124



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.6667
SR 235 AUC Score (Train): 0.762759
SR 235 Report 
              precision    recall  f1-score   support

          0       0.64      0.86      0.74        29
          1       0.73      0.44      0.55        25

avg / total       0.68      0.67      0.65        54

GEN 235 
Model Report
GEN 235 Accuracy: 0.7037
GEN 235 AUC Score (Train): 0.866207
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.67      0.90      0.76        29
          1       0.80      0.48      0.60        25

avg / total       0.73      0.70      0.69        54

BE 235 
Model Report
BE 235 Accuracy: 0.7407
BE 235 AUC Score (Train): 0.877241
BE 235 Report 
              precision    recall  f1-score   support

          0       0.70      0.90      0.79        29
          1       0.82      0.56      0.67        25

avg / total       0.76      0.74      0.73        54



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 905 
Model Report
SR 905 Accuracy: 0.5972
SR 905 AUC Score (Train): 0.589189
SR 905 Report 
              precision    recall  f1-score   support

          0       0.56      0.86      0.67        35
          1       0.72      0.35      0.47        37

avg / total       0.64      0.60      0.57        72

GEN 905 
Model Report
GEN 905 Accuracy: 0.6528
GEN 905 AUC Score (Train): 0.706564
GEN 905 Report 
              precision    recall  f1-score   support

          0       0.59      0.91      0.72        35
          1       0.83      0.41      0.55        37

avg / total       0.72      0.65      0.63        72

BE 905 
Model Report
BE 905 Accuracy: 0.6111
BE 905 AUC Score (Train): 0.665637
BE 905 Report 
              precision    recall  f1-score   support

          0       0.56      0.91      0.70        35
          1       0.80      0.32      0.46        37

avg / total       0.68      0.61      0.58        72



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 2895 
Model Report
SR 2895 Accuracy: 0.6735
SR 2895 AUC Score (Train): 0.673684
SR 2895 Report 
              precision    recall  f1-score   support

          0       0.72      0.77      0.74        30
          1       0.59      0.53      0.56        19

avg / total       0.67      0.67      0.67        49

GEN 2895 
Model Report
GEN 2895 Accuracy: 0.6531
GEN 2895 AUC Score (Train): 0.740351
GEN 2895 Report 
              precision    recall  f1-score   support

          0       0.68      0.83      0.75        30
          1       0.58      0.37      0.45        19

avg / total       0.64      0.65      0.63        49

BE 2895 
Model Report
BE 2895 Accuracy: 0.6531
BE 2895 AUC Score (Train): 0.773684
BE 2895 Report 
              precision    recall  f1-score   support

          0       0.68      0.83      0.75        30
          1       0.58      0.37      0.45        19

avg / total       0.64      0.65      0.63        49



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 3462 
Model Report
SR 3462 Accuracy: 0.6351
SR 3462 AUC Score (Train): 0.670807
SR 3462 Report 
              precision    recall  f1-score   support

          0       0.67      0.83      0.74        46
          1       0.53      0.32      0.40        28

avg / total       0.61      0.64      0.61        74

GEN 3462 
Model Report
GEN 3462 Accuracy: 0.7162
GEN 3462 AUC Score (Train): 0.784161
GEN 3462 Report 
              precision    recall  f1-score   support

          0       0.71      0.91      0.80        46
          1       0.73      0.39      0.51        28

avg / total       0.72      0.72      0.69        74

BE 3462 
Model Report
BE 3462 Accuracy: 0.6622
BE 3462 AUC Score (Train): 0.769410
BE 3462 Report 
              precision    recall  f1-score   support

          0       0.68      0.87      0.76        46
          1       0.60      0.32      0.42        28

avg / total       0.65      0.66      0.63        74



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 4225 
Model Report
SR 4225 Accuracy: 0.61
SR 4225 AUC Score (Train): 0.723727
SR 4225 Report 
              precision    recall  f1-score   support

          0       0.52      0.90      0.66        42
          1       0.85      0.40      0.54        58

avg / total       0.71      0.61      0.59       100

GEN 4225 
Model Report
GEN 4225 Accuracy: 0.61
GEN 4225 AUC Score (Train): 0.725780
GEN 4225 Report 
              precision    recall  f1-score   support

          0       0.52      0.83      0.64        42
          1       0.79      0.45      0.57        58

avg / total       0.68      0.61      0.60       100

BE 4225 
Model Report
BE 4225 Accuracy: 0.6
BE 4225 AUC Score (Train): 0.735632
BE 4225 Report 
              precision    recall  f1-score   support

          0       0.51      0.83      0.64        42
          1       0.78      0.43      0.56        58

avg / total       0.67      0.60      0.59       100



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5056 
Model Report
SR 5056 Accuracy: 0.6232
SR 5056 AUC Score (Train): 0.633157
SR 5056 Report 
              precision    recall  f1-score   support

          0       0.67      0.74      0.70        42
          1       0.52      0.44      0.48        27

avg / total       0.61      0.62      0.62        69

GEN 5056 
Model Report
GEN 5056 Accuracy: 0.6667
GEN 5056 AUC Score (Train): 0.694885
GEN 5056 Report 
              precision    recall  f1-score   support

          0       0.71      0.76      0.74        42
          1       0.58      0.52      0.55        27

avg / total       0.66      0.67      0.66        69

BE 5056 
Model Report
BE 5056 Accuracy: 0.6957
BE 5056 AUC Score (Train): 0.742504
BE 5056 Report 
              precision    recall  f1-score   support

          0       0.73      0.79      0.76        42
          1       0.62      0.56      0.59        27

avg / total       0.69      0.70      0.69        69



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5192 
Model Report
SR 5192 Accuracy: 0.4848
SR 5192 AUC Score (Train): 0.518269
SR 5192 Report 
              precision    recall  f1-score   support

          0       0.40      0.62      0.48        26
          1       0.62      0.40      0.48        40

avg / total       0.53      0.48      0.48        66

GEN 5192 
Model Report
GEN 5192 Accuracy: 0.6212
GEN 5192 AUC Score (Train): 0.628846
GEN 5192 Report 
              precision    recall  f1-score   support

          0       0.52      0.58      0.55        26
          1       0.70      0.65      0.68        40

avg / total       0.63      0.62      0.62        66

BE 5192 
Model Report
BE 5192 Accuracy: 0.6061
BE 5192 AUC Score (Train): 0.601923
BE 5192 Report 
              precision    recall  f1-score   support

          0       0.50      0.62      0.55        26
          1       0.71      0.60      0.65        40

avg / total       0.62      0.61      0.61        66



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7751 
Model Report
SR 7751 Accuracy: 0.6905
SR 7751 AUC Score (Train): 0.748843
SR 7751 Report 
              precision    recall  f1-score   support

          0       0.70      0.81      0.75        48
          1       0.68      0.53      0.59        36

avg / total       0.69      0.69      0.68        84

GEN 7751 
Model Report
GEN 7751 Accuracy: 0.7619
GEN 7751 AUC Score (Train): 0.757523
GEN 7751 Report 
              precision    recall  f1-score   support

          0       0.74      0.90      0.81        48
          1       0.81      0.58      0.68        36

avg / total       0.77      0.76      0.75        84

BE 7751 
Model Report
BE 7751 Accuracy: 0.7262
BE 7751 AUC Score (Train): 0.815972
BE 7751 Report 
              precision    recall  f1-score   support

          0       0.72      0.85      0.78        48
          1       0.74      0.56      0.63        36

avg / total       0.73      0.73      0.72        84


Predicting

SR 7813 
Model Report
SR 

In [9]:
import time

# Experiment: XGBoost with learning_rate=0.1 and 1000 estimators, repeated
# for every seed in random_seeds and for each of the three Word2Vec models
# (SR, GEN, BE).
# NOTE(review): name_of_result is the same 'yeast_OLD' used by the cell that
# runs the default classifier, so this cell writes to the same pickle paths
# and overwrites those results.
name_of_result = 'yeast_OLD'
strict_data = yeast_strict
w2v_strict = yeast_strict_model
w2v_gen = yeast_gen_model
w2v_be = yeast_be_model
xgb_clf = XGBClassifier(learning_rate = 0.1,
                        n_estimators = 1000,
                        seed=24)

start = time.time()
for seed in random_seeds:
    # Build one data split per Word2Vec model; all three use the same
    # strict mention data and the same seed.
    strict_list_SR = pred.make_models(strict_data,
                                      name_of_result+'_SR_'+str(seed),
                                      prev_model=w2v_strict,
                                      ran_state=seed)
    strict_list_GEN = pred.make_models(strict_data,
                                       name_of_result+'_GEN_'+str(seed),
                                       prev_model=w2v_gen,
                                       ran_state=seed)
    strict_list_BE = pred.make_models(strict_data,
                                      name_of_result+'_BE_'+str(seed),
                                      prev_model=w2v_be,
                                      ran_state=seed)

    strict_final_list = [strict_list_SR,
                         strict_list_GEN,
                         strict_list_BE]

    print ('\nPredicting\n')
    accuracy = []
    probs = []
    fpr = []
    tpr = []
    labels = []
    auc_score = []
    report = []

    for entry, model_name in zip(strict_final_list,
                                 ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]):
        # NOTE(review): entry[0]/entry[2] and entry[1]/entry[3] are presumably
        # the train and test feature/label pairs returned by
        # pred.make_models -- confirm against prediction.py.
        (accuracy_norm, auc_score_norm, pred_labels_norm,
         probs_norm, class_report_norm) = pred.XGB_modelfit(xgb_clf,
                                                            entry[0],
                                                            entry[2],
                                                            entry[1],
                                                            entry[3],
                                                            model_name)
        # entry[3] serves as the true labels for the ROC curve.
        fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

        # Each result is wrapped in a one-element list, as in the stored format.
        accuracy.append([accuracy_norm])
        probs.append([probs_norm])
        fpr.append([fpr_norm])
        tpr.append([tpr_norm])
        labels.append([pred_labels_norm])
        auc_score.append([auc_score_norm])
        report.append([class_report_norm])

    # Persist every metric for this seed; `with` closes each file handle
    # right after its pickle is written.
    for metric_name, metric_values in (('accuracy', accuracy),
                                       ('probs', probs),
                                       ('fpr', fpr),
                                       ('tpr', tpr),
                                       ('labels', labels),
                                       ('auc_score', auc_score),
                                       ('report', report)):
        pickle_path = 'Results/'+name_of_result+'_'+metric_name+'_pickle_'+str(seed)+'.pkl'
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(metric_values, pickle_file)

print('Took ', time.time()-start, ' seconds')

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.7177
SR 144 AUC Score (Train): 0.740136
SR 144 Report 
              precision    recall  f1-score   support

          0       0.74      0.83      0.78        75
          1       0.68      0.55      0.61        49

avg / total       0.71      0.72      0.71       124

GEN 144 
Model Report
GEN 144 Accuracy: 0.8065
GEN 144 AUC Score (Train): 0.841905
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.78      0.95      0.86        75
          1       0.88      0.59      0.71        49

avg / total       0.82      0.81      0.80       124

BE 144 
Model Report
BE 144 Accuracy: 0.75
BE 144 AUC Score (Train): 0.853061
BE 144 Report 
              precision    recall  f1-score   support

          0       0.76      0.87      0.81        75
          1       0.74      0.57      0.64        49

avg / total       0.75      0.75      0.74       124



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.5556
SR 235 AUC Score (Train): 0.742069
SR 235 Report 
              precision    recall  f1-score   support

          0       0.56      0.79      0.66        29
          1       0.54      0.28      0.37        25

avg / total       0.55      0.56      0.52        54

GEN 235 
Model Report
GEN 235 Accuracy: 0.7222
GEN 235 AUC Score (Train): 0.888276
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.68      0.90      0.78        29
          1       0.81      0.52      0.63        25

avg / total       0.74      0.72      0.71        54

BE 235 
Model Report
BE 235 Accuracy: 0.7407
BE 235 AUC Score (Train): 0.888276
BE 235 Report 
              precision    recall  f1-score   support

          0       0.70      0.90      0.79        29
          1       0.82      0.56      0.67        25

avg / total       0.76      0.74      0.73        54



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 905 
Model Report
SR 905 Accuracy: 0.6111
SR 905 AUC Score (Train): 0.592278
SR 905 Report 
              precision    recall  f1-score   support

          0       0.57      0.86      0.68        35
          1       0.74      0.38      0.50        37

avg / total       0.65      0.61      0.59        72

GEN 905 
Model Report
GEN 905 Accuracy: 0.625
GEN 905 AUC Score (Train): 0.707336
GEN 905 Report 
              precision    recall  f1-score   support

          0       0.57      0.89      0.70        35
          1       0.78      0.38      0.51        37

avg / total       0.68      0.62      0.60        72

BE 905 
Model Report
BE 905 Accuracy: 0.6111
BE 905 AUC Score (Train): 0.652510
BE 905 Report 
              precision    recall  f1-score   support

          0       0.56      0.91      0.70        35
          1       0.80      0.32      0.46        37

avg / total       0.68      0.61      0.58        72



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 2895 
Model Report
SR 2895 Accuracy: 0.6531
SR 2895 AUC Score (Train): 0.670175
SR 2895 Report 
              precision    recall  f1-score   support

          0       0.69      0.80      0.74        30
          1       0.57      0.42      0.48        19

avg / total       0.64      0.65      0.64        49

GEN 2895 
Model Report
GEN 2895 Accuracy: 0.6531
GEN 2895 AUC Score (Train): 0.698246
GEN 2895 Report 
              precision    recall  f1-score   support

          0       0.68      0.83      0.75        30
          1       0.58      0.37      0.45        19

avg / total       0.64      0.65      0.63        49

BE 2895 
Model Report
BE 2895 Accuracy: 0.6735
BE 2895 AUC Score (Train): 0.785965
BE 2895 Report 
              precision    recall  f1-score   support

          0       0.69      0.83      0.76        30
          1       0.62      0.42      0.50        19

avg / total       0.66      0.67      0.66        49



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 3462 
Model Report
SR 3462 Accuracy: 0.6216
SR 3462 AUC Score (Train): 0.645963
SR 3462 Report 
              precision    recall  f1-score   support

          0       0.67      0.78      0.72        46
          1       0.50      0.36      0.42        28

avg / total       0.60      0.62      0.61        74

GEN 3462 
Model Report
GEN 3462 Accuracy: 0.7297
GEN 3462 AUC Score (Train): 0.763975
GEN 3462 Report 
              precision    recall  f1-score   support

          0       0.72      0.91      0.81        46
          1       0.75      0.43      0.55        28

avg / total       0.73      0.73      0.71        74

BE 3462 
Model Report
BE 3462 Accuracy: 0.6892
BE 3462 AUC Score (Train): 0.762422
BE 3462 Report 
              precision    recall  f1-score   support

          0       0.70      0.87      0.78        46
          1       0.65      0.39      0.49        28

avg / total       0.68      0.69      0.67        74



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 4225 
Model Report
SR 4225 Accuracy: 0.61
SR 4225 AUC Score (Train): 0.719212
SR 4225 Report 
              precision    recall  f1-score   support

          0       0.52      0.90      0.66        42
          1       0.85      0.40      0.54        58

avg / total       0.71      0.61      0.59       100

GEN 4225 
Model Report
GEN 4225 Accuracy: 0.6
GEN 4225 AUC Score (Train): 0.736453
GEN 4225 Report 
              precision    recall  f1-score   support

          0       0.51      0.83      0.64        42
          1       0.78      0.43      0.56        58

avg / total       0.67      0.60      0.59       100

BE 4225 
Model Report
BE 4225 Accuracy: 0.59
BE 4225 AUC Score (Train): 0.738916
BE 4225 Report 
              precision    recall  f1-score   support

          0       0.51      0.83      0.63        42
          1       0.77      0.41      0.54        58

avg / total       0.66      0.59      0.58       100



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5056 
Model Report
SR 5056 Accuracy: 0.5652
SR 5056 AUC Score (Train): 0.636684
SR 5056 Report 
              precision    recall  f1-score   support

          0       0.62      0.71      0.67        42
          1       0.43      0.33      0.38        27

avg / total       0.55      0.57      0.55        69

GEN 5056 
Model Report
GEN 5056 Accuracy: 0.6522
GEN 5056 AUC Score (Train): 0.689594
GEN 5056 Report 
              precision    recall  f1-score   support

          0       0.71      0.71      0.71        42
          1       0.56      0.56      0.56        27

avg / total       0.65      0.65      0.65        69

BE 5056 
Model Report
BE 5056 Accuracy: 0.6957
BE 5056 AUC Score (Train): 0.744268
BE 5056 Report 
              precision    recall  f1-score   support

          0       0.74      0.76      0.75        42
          1       0.62      0.59      0.60        27

avg / total       0.69      0.70      0.69        69



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5192 
Model Report
SR 5192 Accuracy: 0.5152
SR 5192 AUC Score (Train): 0.527885
SR 5192 Report 
              precision    recall  f1-score   support

          0       0.42      0.62      0.50        26
          1       0.64      0.45      0.53        40

avg / total       0.56      0.52      0.52        66

GEN 5192 
Model Report
GEN 5192 Accuracy: 0.6061
GEN 5192 AUC Score (Train): 0.625962
GEN 5192 Report 
              precision    recall  f1-score   support

          0       0.50      0.58      0.54        26
          1       0.69      0.62      0.66        40

avg / total       0.62      0.61      0.61        66

BE 5192 
Model Report
BE 5192 Accuracy: 0.5606
BE 5192 AUC Score (Train): 0.623077
BE 5192 Report 
              precision    recall  f1-score   support

          0       0.46      0.62      0.52        26
          1       0.68      0.53      0.59        40

avg / total       0.59      0.56      0.57        66



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7751 
Model Report
SR 7751 Accuracy: 0.6905
SR 7751 AUC Score (Train): 0.737847
SR 7751 Report 
              precision    recall  f1-score   support

          0       0.70      0.81      0.75        48
          1       0.68      0.53      0.59        36

avg / total       0.69      0.69      0.68        84

GEN 7751 
Model Report
GEN 7751 Accuracy: 0.7857
GEN 7751 AUC Score (Train): 0.754630
GEN 7751 Report 
              precision    recall  f1-score   support

          0       0.77      0.90      0.83        48
          1       0.82      0.64      0.72        36

avg / total       0.79      0.79      0.78        84

BE 7751 
Model Report
BE 7751 Accuracy: 0.75
BE 7751 AUC Score (Train): 0.816551
BE 7751 Report 
              precision    recall  f1-score   support

          0       0.72      0.92      0.81        48
          1       0.83      0.53      0.64        36

avg / total       0.77      0.75      0.74        84


Predicting

SR 7813 
Model Report
SR 78

In [10]:
import time

# Run tagged 'yeast_OLD': evaluates the three word2vec models that are already
# held in the kernel (yeast_strict_model / yeast_gen_model / yeast_be_model)
# with a low-learning-rate, 5000-tree XGBoost classifier.
name_of_result = 'yeast_OLD'
strict_data = yeast_strict
w2v_strict = yeast_strict_model
w2v_gen = yeast_gen_model
w2v_be = yeast_be_model
xgb_clf = XGBClassifier(learning_rate = 0.01,
                        n_estimators = 5000,
                        seed=24)

# Make sure the output directory exists *before* the long-running loop, so a
# missing folder cannot throw away a finished fit at pickling time.
os.makedirs('Results', exist_ok=True)

start = time.time()
for seed in random_seeds:
    # Build one feature set per embedding (SR / GEN / BE) using the same
    # random state, so the three embeddings are compared under one seed.
    strict_list_SR = pred.make_models(strict_data,
                                      name_of_result+'_SR_'+str(seed),
                                      prev_model=w2v_strict,
                                      ran_state=seed)
    strict_list_GEN = pred.make_models(strict_data,
                                       name_of_result+'_GEN_'+str(seed),
                                       prev_model=w2v_gen,
                                       ran_state=seed)
    strict_list_BE = pred.make_models(strict_data,
                                      name_of_result+'_BE_'+str(seed),
                                      prev_model=w2v_be,
                                      ran_state=seed)

    strict_final_list = [strict_list_SR,
                         strict_list_GEN,
                         strict_list_BE]

    print ('\nPredicting\n')

    # Per-seed accumulators; each gets one single-element list per embedding
    # (the extra nesting is kept so the pickled layout stays unchanged).
    accuracy = []
    probs = []
    fpr = []
    tpr = []
    labels = []
    auc_score = []
    report = []

    model_names = ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]
    for entry, model_name in zip(strict_final_list, model_names):
        # NOTE(review): entry is indexed as [0], [2], [1], [3]; presumably
        # make_models returns (train_X, test_X, train_y, test_y) and the
        # fitter expects (train_X, train_y, test_X, test_y) -- confirm
        # against pred.make_models / pred.XGB_modelfit.
        # The same xgb_clf object is passed to every fit call.
        (accuracy_norm,
         auc_score_norm,
         pred_labels_norm,
         probs_norm,
         class_report_norm) = pred.XGB_modelfit(xgb_clf,
                                                entry[0],
                                                entry[2],
                                                entry[1],
                                                entry[3],
                                                model_name)
        fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

        accuracy.append([accuracy_norm])
        probs.append([probs_norm])
        fpr.append([fpr_norm])
        tpr.append([tpr_norm])
        labels.append([pred_labels_norm])
        auc_score.append([auc_score_norm])
        report.append([class_report_norm])

    # Persist every metric for this seed. Using a context manager guarantees
    # each file is flushed and closed (the previous bare open() calls leaked
    # one handle per dump).
    results_by_metric = [('accuracy', accuracy),
                         ('probs', probs),
                         ('fpr', fpr),
                         ('tpr', tpr),
                         ('labels', labels),
                         ('auc_score', auc_score),
                         ('report', report)]
    for metric_name, metric_values in results_by_metric:
        pickle_path = 'Results/'+name_of_result+'_'+metric_name+'_pickle_'+str(seed)+'.pkl'
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(metric_values, pickle_file)

print('Took ', time.time()-start, ' seconds')

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.7419
SR 144 AUC Score (Train): 0.740680
SR 144 Report 
              precision    recall  f1-score   support

          0       0.74      0.88      0.80        75
          1       0.74      0.53      0.62        49

avg / total       0.74      0.74      0.73       124

GEN 144 
Model Report
GEN 144 Accuracy: 0.8145
GEN 144 AUC Score (Train): 0.858231
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.80      0.92      0.86        75
          1       0.84      0.65      0.74        49

avg / total       0.82      0.81      0.81       124

BE 144 
Model Report
BE 144 Accuracy: 0.8065
BE 144 AUC Score (Train): 0.878095
BE 144 Report 
              precision    recall  f1-score   support

          0       0.81      0.89      0.85        75
          1       0.80      0.67      0.73        49

avg / total       0.81      0.81      0.80       124



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.5741
SR 235 AUC Score (Train): 0.746207
SR 235 Report 
              precision    recall  f1-score   support

          0       0.58      0.76      0.66        29
          1       0.56      0.36      0.44        25

avg / total       0.57      0.57      0.56        54

GEN 235 
Model Report
GEN 235 Accuracy: 0.7222
GEN 235 AUC Score (Train): 0.884138
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.68      0.90      0.78        29
          1       0.81      0.52      0.63        25

avg / total       0.74      0.72      0.71        54

BE 235 
Model Report
BE 235 Accuracy: 0.7407
BE 235 AUC Score (Train): 0.895172
BE 235 Report 
              precision    recall  f1-score   support

          0       0.70      0.90      0.79        29
          1       0.82      0.56      0.67        25

avg / total       0.76      0.74      0.73        54



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 905 
Model Report
SR 905 Accuracy: 0.5556
SR 905 AUC Score (Train): 0.619305
SR 905 Report 
              precision    recall  f1-score   support

          0       0.53      0.80      0.64        35
          1       0.63      0.32      0.43        37

avg / total       0.58      0.56      0.53        72



KeyboardInterrupt: 

In [36]:
# Run tagged 'yeast_NEW': evaluates the three word2vec models loaded from
# ../../Results/Yeast/models/ with an explicitly configured XGBoost classifier.
# Relies on `time` having been imported by an earlier cell.
name_of_result = 'yeast_NEW'
strict_data = yeast_strict
w2v_strict = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_strict_w2v_model')
w2v_gen = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_gen_w2v_model')
w2v_be = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_be_w2v_model')
xgb_clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=6,
                        min_child_weight=1,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=0.8,
                        reg_alpha=0.01,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        seed=24)

# Make sure the output directory exists *before* the long-running loop, so a
# missing folder cannot throw away a finished fit at pickling time.
os.makedirs('Results', exist_ok=True)

start = time.time()
for seed in random_seeds:
    # Build one feature set per embedding (SR / GEN / BE) using the same
    # random state, so the three embeddings are compared under one seed.
    strict_list_SR = pred.make_models(strict_data,
                                      name_of_result+'_SR_'+str(seed),
                                      prev_model=w2v_strict,
                                      ran_state=seed)
    strict_list_GEN = pred.make_models(strict_data,
                                       name_of_result+'_GEN_'+str(seed),
                                       prev_model=w2v_gen,
                                       ran_state=seed)
    strict_list_BE = pred.make_models(strict_data,
                                      name_of_result+'_BE_'+str(seed),
                                      prev_model=w2v_be,
                                      ran_state=seed)

    strict_final_list = [strict_list_SR,
                         strict_list_GEN,
                         strict_list_BE]

    print ('\nPredicting\n')

    # Per-seed accumulators; each gets one single-element list per embedding
    # (the extra nesting is kept so the pickled layout stays unchanged).
    accuracy = []
    probs = []
    fpr = []
    tpr = []
    labels = []
    auc_score = []
    report = []

    model_names = ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]
    for entry, model_name in zip(strict_final_list, model_names):
        # NOTE(review): `modelfit` is resolved from the notebook namespace,
        # not from `pred`; confirm it is defined before this cell runs on a
        # fresh kernel.
        # NOTE(review): entry is indexed as [0], [2], [1], [3]; presumably
        # make_models returns (train_X, test_X, train_y, test_y) and the
        # fitter expects (train_X, train_y, test_X, test_y) -- confirm.
        # The same xgb_clf object is passed to every fit call.
        (accuracy_norm,
         auc_score_norm,
         pred_labels_norm,
         probs_norm,
         class_report_norm) = modelfit(xgb_clf,
                                       entry[0],
                                       entry[2],
                                       entry[1],
                                       entry[3],
                                       model_name)
        fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

        accuracy.append([accuracy_norm])
        probs.append([probs_norm])
        fpr.append([fpr_norm])
        tpr.append([tpr_norm])
        labels.append([pred_labels_norm])
        auc_score.append([auc_score_norm])
        report.append([class_report_norm])

    # Persist every metric for this seed. Using a context manager guarantees
    # each file is flushed and closed (the previous bare open() calls leaked
    # one handle per dump).
    results_by_metric = [('accuracy', accuracy),
                         ('probs', probs),
                         ('fpr', fpr),
                         ('tpr', tpr),
                         ('labels', labels),
                         ('auc_score', auc_score),
                         ('report', report)]
    for metric_name, metric_values in results_by_metric:
        pickle_path = 'Results/'+name_of_result+'_'+metric_name+'_pickle_'+str(seed)+'.pkl'
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(metric_values, pickle_file)

print('Took ', time.time()-start, ' seconds')

2017-06-02 13:33:46,449 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:33:46,585 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_strict_w2v_model.wv.* with mmap=None
2017-06-02 13:33:46,585 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:33:46,586 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:33:46,587 : INFO : loaded ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:33:46,592 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:33:46,937 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_gen_w2v_model.wv.* with mmap=None
2017-06-02 13:33:46,938 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:33:46,939 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:33:46,939 : INFO : loaded ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:33:46,952 : INFO : load


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.6094
SR 144 AUC Score (Train): 0.637638
SR 144 Report 
              precision    recall  f1-score   support

          0       0.64      0.73      0.68        37
          1       0.55      0.44      0.49        27

avg / total       0.60      0.61      0.60        64

GEN 144 
Model Report
GEN 144 Accuracy: 0.6719
GEN 144 AUC Score (Train): 0.684685
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.66      0.89      0.76        37
          1       0.71      0.37      0.49        27

avg / total       0.68      0.67      0.64        64

BE 144 
Model Report
BE 144 Accuracy: 0.6562
BE 144 AUC Score (Train): 0.727728
BE 144 Report 
              precision    recall  f1-score   support

          0       0.65      0.89      0.75        37
          1       0.69      0.33      0.45        27

avg / total       0.67      0.66      0.62        64



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.7101
SR 235 AUC Score (Train): 0.735993
SR 235 Report 
              precision    recall  f1-score   support

          0       0.71      0.79      0.75        38
          1       0.70      0.61      0.66        31

avg / total       0.71      0.71      0.71        69

GEN 235 
Model Report
GEN 235 Accuracy: 0.6957
GEN 235 AUC Score (Train): 0.723260
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.71      0.76      0.73        38
          1       0.68      0.61      0.64        31

avg / total       0.69      0.70      0.69        69

BE 235 
Model Report
BE 235 Accuracy: 0.6812
BE 235 AUC Score (Train): 0.721562
BE 235 Report 
              precision    recall  f1-score   support

          0       0.69      0.76      0.72        38
          1       0.67      0.58      0.62        31

avg / total       0.68      0.68      0.68        69


Predicting

SR 905 
Model Report
SR 905 Accuracy:

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 2895 
Model Report
SR 2895 Accuracy: 0.6739
SR 2895 AUC Score (Train): 0.760096
SR 2895 Report 
              precision    recall  f1-score   support

          0       0.69      0.77      0.73        52
          1       0.65      0.55      0.59        40

avg / total       0.67      0.67      0.67        92

GEN 2895 
Model Report
GEN 2895 Accuracy: 0.6957
GEN 2895 AUC Score (Train): 0.724519
GEN 2895 Report 
              precision    recall  f1-score   support

          0       0.69      0.83      0.75        52
          1       0.70      0.53      0.60        40

avg / total       0.70      0.70      0.69        92

BE 2895 
Model Report
BE 2895 Accuracy: 0.6522
BE 2895 AUC Score (Train): 0.716827
BE 2895 Report 
              precision    recall  f1-score   support

          0       0.64      0.87      0.74        52
          1       0.68      0.38      0.48        40

avg / total       0.66      0.65      0.63        92



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 3462 
Model Report
SR 3462 Accuracy: 0.6753
SR 3462 AUC Score (Train): 0.729420
SR 3462 Report 
              precision    recall  f1-score   support

          0       0.68      0.67      0.68        39
          1       0.67      0.68      0.68        38

avg / total       0.68      0.68      0.68        77

GEN 3462 
Model Report
GEN 3462 Accuracy: 0.7143
GEN 3462 AUC Score (Train): 0.731444
GEN 3462 Report 
              precision    recall  f1-score   support

          0       0.72      0.72      0.72        39
          1       0.71      0.71      0.71        38

avg / total       0.71      0.71      0.71        77

BE 3462 
Model Report
BE 3462 Accuracy: 0.7013
BE 3462 AUC Score (Train): 0.767881
BE 3462 Report 
              precision    recall  f1-score   support

          0       0.72      0.67      0.69        39
          1       0.68      0.74      0.71        38

avg / total       0.70      0.70      0.70        77


Predicting

SR 4225 
Model Report
SR 

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5056 
Model Report
SR 5056 Accuracy: 0.8235
SR 5056 AUC Score (Train): 0.867424
SR 5056 Report 
              precision    recall  f1-score   support

          0       0.90      0.82      0.86        22
          1       0.71      0.83      0.77        12

avg / total       0.83      0.82      0.83        34

GEN 5056 
Model Report
GEN 5056 Accuracy: 0.7353
GEN 5056 AUC Score (Train): 0.829545
GEN 5056 Report 
              precision    recall  f1-score   support

          0       0.84      0.73      0.78        22
          1       0.60      0.75      0.67        12

avg / total       0.76      0.74      0.74        34

BE 5056 
Model Report
BE 5056 Accuracy: 0.7941
BE 5056 AUC Score (Train): 0.818182
BE 5056 Report 
              precision    recall  f1-score   support

          0       0.89      0.77      0.83        22
          1       0.67      0.83      0.74        12

avg / total       0.81      0.79      0.80        34



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5192 
Model Report
SR 5192 Accuracy: 0.6582
SR 5192 AUC Score (Train): 0.759973
SR 5192 Report 
              precision    recall  f1-score   support

          0       0.55      0.84      0.67        32
          1       0.83      0.53      0.65        47

avg / total       0.72      0.66      0.66        79

GEN 5192 
Model Report
GEN 5192 Accuracy: 0.6456
GEN 5192 AUC Score (Train): 0.767287
GEN 5192 Report 
              precision    recall  f1-score   support

          0       0.54      0.78      0.64        32
          1       0.79      0.55      0.65        47

avg / total       0.69      0.65      0.65        79

BE 5192 
Model Report
BE 5192 Accuracy: 0.6709
BE 5192 AUC Score (Train): 0.754654
BE 5192 Report 
              precision    recall  f1-score   support

          0       0.56      0.88      0.68        32
          1       0.86      0.53      0.66        47

avg / total       0.74      0.67      0.67        79



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7751 
Model Report
SR 7751 Accuracy: 0.6
SR 7751 AUC Score (Train): 0.605600
SR 7751 Report 
              precision    recall  f1-score   support

          0       0.78      0.56      0.65        50
          1       0.44      0.68      0.53        25

avg / total       0.66      0.60      0.61        75

GEN 7751 
Model Report
GEN 7751 Accuracy: 0.56
GEN 7751 AUC Score (Train): 0.694400
GEN 7751 Report 
              precision    recall  f1-score   support

          0       0.76      0.50      0.60        50
          1       0.40      0.68      0.51        25

avg / total       0.64      0.56      0.57        75

BE 7751 
Model Report
BE 7751 Accuracy: 0.6267
BE 7751 AUC Score (Train): 0.685600
BE 7751 Report 
              precision    recall  f1-score   support

          0       0.81      0.58      0.67        50
          1       0.46      0.72      0.56        25

avg / total       0.69      0.63      0.64        75



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7813 
Model Report
SR 7813 Accuracy: 0.6234
SR 7813 AUC Score (Train): 0.688098
SR 7813 Report 
              precision    recall  f1-score   support

          0       0.56      0.65      0.60        34
          1       0.68      0.60      0.64        43

avg / total       0.63      0.62      0.62        77

GEN 7813 
Model Report
GEN 7813 Accuracy: 0.7273
GEN 7813 AUC Score (Train): 0.756498
GEN 7813 Report 
              precision    recall  f1-score   support

          0       0.67      0.76      0.71        34
          1       0.79      0.70      0.74        43

avg / total       0.74      0.73      0.73        77

BE 7813 
Model Report
BE 7813 Accuracy: 0.6883
BE 7813 AUC Score (Train): 0.720246
BE 7813 Report 
              precision    recall  f1-score   support

          0       0.67      0.59      0.62        34
          1       0.70      0.77      0.73        43

avg / total       0.69      0.69      0.69        77

Took  1404.1214997768402  seconds


In [37]:
# Run tagged 'yeast_NEW_w2v_OLD_xgb': evaluates the three word2vec models
# loaded from ../../Results/Yeast/models/ with an XGBoost classifier left at
# its library defaults (only the seed is fixed).
# Relies on `time` having been imported by an earlier cell.
name_of_result = 'yeast_NEW_w2v_OLD_xgb'
strict_data = yeast_strict
w2v_strict = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_strict_w2v_model')
w2v_gen = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_gen_w2v_model')
w2v_be = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_be_w2v_model')
xgb_clf = XGBClassifier(seed=24)

# Make sure the output directory exists *before* the long-running loop, so a
# missing folder cannot throw away a finished fit at pickling time.
os.makedirs('Results', exist_ok=True)

start = time.time()
for seed in random_seeds:
    # Build one feature set per embedding (SR / GEN / BE) using the same
    # random state, so the three embeddings are compared under one seed.
    strict_list_SR = pred.make_models(strict_data,
                                      name_of_result+'_SR_'+str(seed),
                                      prev_model=w2v_strict,
                                      ran_state=seed)
    strict_list_GEN = pred.make_models(strict_data,
                                       name_of_result+'_GEN_'+str(seed),
                                       prev_model=w2v_gen,
                                       ran_state=seed)
    strict_list_BE = pred.make_models(strict_data,
                                      name_of_result+'_BE_'+str(seed),
                                      prev_model=w2v_be,
                                      ran_state=seed)

    strict_final_list = [strict_list_SR,
                         strict_list_GEN,
                         strict_list_BE]

    print ('\nPredicting\n')

    # Per-seed accumulators; each gets one single-element list per embedding
    # (the extra nesting is kept so the pickled layout stays unchanged).
    accuracy = []
    probs = []
    fpr = []
    tpr = []
    labels = []
    auc_score = []
    report = []

    model_names = ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]
    for entry, model_name in zip(strict_final_list, model_names):
        # NOTE(review): `modelfit` is resolved from the notebook namespace,
        # not from `pred`; confirm it is defined before this cell runs on a
        # fresh kernel.
        # NOTE(review): entry is indexed as [0], [2], [1], [3]; presumably
        # make_models returns (train_X, test_X, train_y, test_y) and the
        # fitter expects (train_X, train_y, test_X, test_y) -- confirm.
        # The same xgb_clf object is passed to every fit call.
        (accuracy_norm,
         auc_score_norm,
         pred_labels_norm,
         probs_norm,
         class_report_norm) = modelfit(xgb_clf,
                                       entry[0],
                                       entry[2],
                                       entry[1],
                                       entry[3],
                                       model_name)
        fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

        accuracy.append([accuracy_norm])
        probs.append([probs_norm])
        fpr.append([fpr_norm])
        tpr.append([tpr_norm])
        labels.append([pred_labels_norm])
        auc_score.append([auc_score_norm])
        report.append([class_report_norm])

    # Persist every metric for this seed. Using a context manager guarantees
    # each file is flushed and closed (the previous bare open() calls leaked
    # one handle per dump).
    results_by_metric = [('accuracy', accuracy),
                         ('probs', probs),
                         ('fpr', fpr),
                         ('tpr', tpr),
                         ('labels', labels),
                         ('auc_score', auc_score),
                         ('report', report)]
    for metric_name, metric_values in results_by_metric:
        pickle_path = 'Results/'+name_of_result+'_'+metric_name+'_pickle_'+str(seed)+'.pkl'
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(metric_values, pickle_file)

print('Took ', time.time()-start, ' seconds')

2017-06-02 13:57:11,912 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:57:12,086 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_strict_w2v_model.wv.* with mmap=None
2017-06-02 13:57:12,087 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:57:12,088 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:57:12,089 : INFO : loaded ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:57:12,106 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:57:12,559 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_gen_w2v_model.wv.* with mmap=None
2017-06-02 13:57:12,561 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:57:12,561 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:57:12,562 : INFO : loaded ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:57:12,582 : INFO : load


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.625
SR 144 AUC Score (Train): 0.691692
SR 144 Report 
              precision    recall  f1-score   support

          0       0.65      0.76      0.70        37
          1       0.57      0.44      0.50        27

avg / total       0.62      0.62      0.62        64

GEN 144 
Model Report
GEN 144 Accuracy: 0.7188
GEN 144 AUC Score (Train): 0.731732
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.69      0.92      0.79        37
          1       0.80      0.44      0.57        27

avg / total       0.74      0.72      0.70        64

BE 144 
Model Report
BE 144 Accuracy: 0.6406
BE 144 AUC Score (Train): 0.724725
BE 144 Report 
              precision    recall  f1-score   support

          0       0.65      0.84      0.73        37
          1       0.62      0.37      0.47        27

avg / total       0.64      0.64      0.62        64



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.6812
SR 235 AUC Score (Train): 0.717317
SR 235 Report 
              precision    recall  f1-score   support

          0       0.70      0.74      0.72        38
          1       0.66      0.61      0.63        31

avg / total       0.68      0.68      0.68        69

GEN 235 
Model Report
GEN 235 Accuracy: 0.7246
GEN 235 AUC Score (Train): 0.719015
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.72      0.82      0.77        38
          1       0.73      0.61      0.67        31

avg / total       0.73      0.72      0.72        69

BE 235 
Model Report
BE 235 Accuracy: 0.6522
BE 235 AUC Score (Train): 0.729202
BE 235 Report 
              precision    recall  f1-score   support

          0       0.68      0.71      0.69        38
          1       0.62      0.58      0.60        31

avg / total       0.65      0.65      0.65        69


Predicting

SR 905 
Model Report
SR 905 Accuracy:

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 2895 
Model Report
SR 2895 Accuracy: 0.6087
SR 2895 AUC Score (Train): 0.754327
SR 2895 Report 
              precision    recall  f1-score   support

          0       0.62      0.77      0.69        52
          1       0.57      0.40      0.47        40

avg / total       0.60      0.61      0.59        92

GEN 2895 
Model Report
GEN 2895 Accuracy: 0.6522
GEN 2895 AUC Score (Train): 0.710577
GEN 2895 Report 
              precision    recall  f1-score   support

          0       0.67      0.77      0.71        52
          1       0.62      0.50      0.56        40

avg / total       0.65      0.65      0.65        92

BE 2895 
Model Report
BE 2895 Accuracy: 0.6848
BE 2895 AUC Score (Train): 0.758173
BE 2895 Report 
              precision    recall  f1-score   support

          0       0.67      0.87      0.76        52
          1       0.72      0.45      0.55        40

avg / total       0.69      0.68      0.67        92



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 3462 
Model Report
SR 3462 Accuracy: 0.7013
SR 3462 AUC Score (Train): 0.717949
SR 3462 Report 
              precision    recall  f1-score   support

          0       0.71      0.69      0.70        39
          1       0.69      0.71      0.70        38

avg / total       0.70      0.70      0.70        77

GEN 3462 
Model Report
GEN 3462 Accuracy: 0.7532
GEN 3462 AUC Score (Train): 0.763158
GEN 3462 Report 
              precision    recall  f1-score   support

          0       0.74      0.79      0.77        39
          1       0.77      0.71      0.74        38

avg / total       0.75      0.75      0.75        77

BE 3462 
Model Report
BE 3462 Accuracy: 0.7013
BE 3462 AUC Score (Train): 0.804993
BE 3462 Report 
              precision    recall  f1-score   support

          0       0.74      0.64      0.68        39
          1       0.67      0.76      0.72        38

avg / total       0.71      0.70      0.70        77


Predicting

SR 4225 
Model Report
SR 

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5056 
Model Report
SR 5056 Accuracy: 0.8235
SR 5056 AUC Score (Train): 0.863636
SR 5056 Report 
              precision    recall  f1-score   support

          0       0.90      0.82      0.86        22
          1       0.71      0.83      0.77        12

avg / total       0.83      0.82      0.83        34

GEN 5056 
Model Report
GEN 5056 Accuracy: 0.7647
GEN 5056 AUC Score (Train): 0.814394
GEN 5056 Report 
              precision    recall  f1-score   support

          0       0.85      0.77      0.81        22
          1       0.64      0.75      0.69        12

avg / total       0.78      0.76      0.77        34

BE 5056 
Model Report
BE 5056 Accuracy: 0.7647
BE 5056 AUC Score (Train): 0.863636
BE 5056 Report 
              precision    recall  f1-score   support

          0       0.85      0.77      0.81        22
          1       0.64      0.75      0.69        12

avg / total       0.78      0.76      0.77        34



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5192 
Model Report
SR 5192 Accuracy: 0.6962
SR 5192 AUC Score (Train): 0.789229
SR 5192 Report 
              precision    recall  f1-score   support

          0       0.58      0.91      0.71        32
          1       0.90      0.55      0.68        47

avg / total       0.77      0.70      0.69        79

GEN 5192 
Model Report
GEN 5192 Accuracy: 0.6835
GEN 5192 AUC Score (Train): 0.767952
GEN 5192 Report 
              precision    recall  f1-score   support

          0       0.57      0.91      0.70        32
          1       0.89      0.53      0.67        47

avg / total       0.76      0.68      0.68        79

BE 5192 
Model Report
BE 5192 Accuracy: 0.7468
BE 5192 AUC Score (Train): 0.815160
BE 5192 Report 
              precision    recall  f1-score   support

          0       0.64      0.88      0.74        32
          1       0.89      0.66      0.76        47

avg / total       0.78      0.75      0.75        79



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7751 
Model Report
SR 7751 Accuracy: 0.64
SR 7751 AUC Score (Train): 0.672800
SR 7751 Report 
              precision    recall  f1-score   support

          0       0.79      0.62      0.70        50
          1       0.47      0.68      0.56        25

avg / total       0.69      0.64      0.65        75

GEN 7751 
Model Report
GEN 7751 Accuracy: 0.6
GEN 7751 AUC Score (Train): 0.663200
GEN 7751 Report 
              precision    recall  f1-score   support

          0       0.76      0.58      0.66        50
          1       0.43      0.64      0.52        25

avg / total       0.65      0.60      0.61        75

BE 7751 
Model Report
BE 7751 Accuracy: 0.6267
BE 7751 AUC Score (Train): 0.670400
BE 7751 Report 
              precision    recall  f1-score   support

          0       0.78      0.62      0.69        50
          1       0.46      0.64      0.53        25

avg / total       0.67      0.63      0.64        75



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7813 
Model Report
SR 7813 Accuracy: 0.6623
SR 7813 AUC Score (Train): 0.704514
SR 7813 Report 
              precision    recall  f1-score   support

          0       0.61      0.65      0.63        34
          1       0.71      0.67      0.69        43

avg / total       0.66      0.66      0.66        77

GEN 7813 
Model Report
GEN 7813 Accuracy: 0.6623
GEN 7813 AUC Score (Train): 0.760602
GEN 7813 Report 
              precision    recall  f1-score   support

          0       0.61      0.65      0.63        34
          1       0.71      0.67      0.69        43

avg / total       0.66      0.66      0.66        77

BE 7813 
Model Report
BE 7813 Accuracy: 0.6494
BE 7813 AUC Score (Train): 0.751026
BE 7813 Report 
              precision    recall  f1-score   support

          0       0.60      0.62      0.61        34
          1       0.69      0.67      0.68        43

avg / total       0.65      0.65      0.65        77

Took  1503.6061532497406  seconds


In [12]:
# Experiment driver: for every random seed, build the three feature sets
# (one per Word2Vec model), fit/evaluate the XGBoost classifier on each of
# them and persist the collected metrics as one pickle file per metric.

# Prefix shared by every artefact written by this cell.
name_of_result = 'yeast_OLD_w2v_NEW_XGB'
strict_data = yeast_strict
w2v_strict = yeast_strict_model
w2v_gen = yeast_gen_model
w2v_be = yeast_be_model

# Directory that receives the pickled metrics; created up front so a missing
# folder is not discovered only after the first (long) training pass.
results_dir = 'Results'
os.makedirs(results_dir, exist_ok=True)

# NOTE(review): the same classifier instance is handed to `modelfit` for every
# model/seed; this assumes `modelfit` re-fits it from scratch -- confirm.
xgb_clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=6,
                        min_child_weight=1,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=0.8,
                        reg_alpha=0.01,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        seed=24)

start = time.time()
for seed in random_seeds:
    # One call per Word2Vec model; `ran_state` ties the result to the seed.
    strict_list_SR = pred.make_models(strict_data,
                                      name_of_result + '_SR_' + str(seed),
                                      prev_model=w2v_strict,
                                      ran_state=seed)
    strict_list_GEN = pred.make_models(strict_data,
                                       name_of_result + '_GEN_' + str(seed),
                                       prev_model=w2v_gen,
                                       ran_state=seed)
    strict_list_BE = pred.make_models(strict_data,
                                      name_of_result + '_BE_' + str(seed),
                                      prev_model=w2v_be,
                                      ran_state=seed)

    strict_final_list = [strict_list_SR,
                         strict_list_GEN,
                         strict_list_BE]
    model_names = ['SR ' + str(seed), 'GEN ' + str(seed), 'BE ' + str(seed)]

    print ('\nPredicting\n')
    accuracy = []
    probs = []
    fpr = []
    tpr = []
    labels = []
    auc_score = []
    report = []

    for entry, model_name in zip(strict_final_list, model_names):
        # NOTE(review): entries 0/2 and 1/3 are presumably the train and test
        # (features, labels) pairs returned by `make_models` -- confirm
        # against `prediction.make_models` and `modelfit`.
        (accuracy_norm, auc_score_norm, pred_labels_norm,
         probs_norm, class_report_norm) = modelfit(xgb_clf,
                                                   entry[0],
                                                   entry[2],
                                                   entry[1],
                                                   entry[3],
                                                   model_name)
        # entry[3] is used as the ground truth for the ROC curve.
        fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

        # Each value is wrapped in a single-element list to keep the pickled
        # layout identical to earlier runs of this experiment.
        accuracy.append([accuracy_norm])
        probs.append([probs_norm])
        fpr.append([fpr_norm])
        tpr.append([tpr_norm])
        labels.append([pred_labels_norm])
        auc_score.append([auc_score_norm])
        report.append([class_report_norm])

    # (file-name tag, collected values) -- one pickle per metric and seed.
    outputs = [('accuracy', accuracy),
               ('probs', probs),
               ('fpr', fpr),
               ('tpr', tpr),
               ('labels', labels),
               ('auc_score', auc_score),
               ('report', report)]
    for tag, values in outputs:
        out_path = os.path.join(
            results_dir,
            name_of_result + '_' + tag + '_pickle_' + str(seed) + '.pkl')
        # Context manager guarantees the file is flushed and closed even if
        # the run is interrupted, instead of leaking one handle per dump.
        with open(out_path, 'wb') as out_file:
            pickle.dump(values, out_file)

print('Took ', time.time()-start, ' seconds')


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.4744
SR 144 AUC Score (Train): 0.537542
SR 144 Report 
              precision    recall  f1-score   support

          0       0.45      0.71      0.55        35
          1       0.55      0.28      0.37        43

avg / total       0.50      0.47      0.45        78

GEN 144 
Model Report
GEN 144 Accuracy: 0.6154
GEN 144 AUC Score (Train): 0.657143
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.54      0.89      0.67        35
          1       0.81      0.40      0.53        43

avg / total       0.69      0.62      0.60        78

BE 144 
Model Report
BE 144 Accuracy: 0.6667
BE 144 AUC Score (Train): 0.714286
BE 144 Report 
              precision    recall  f1-score   support

          0       0.59      0.86      0.70        35
          1       0.81      0.51      0.63        43

avg / total       0.71      0.67      0.66        78


Predicting



KeyboardInterrupt: 