In [1]:
import pickle
import os
import re
import numpy as np
from gensim.models import word2vec
import logging
import pandas as pd

In [2]:
import sys
sys.path.insert(0, '../Core-scripts/')

from parse_and_prepare import ProteinProteinInteractionClassifier as ppi
import file_readers as fr
import prediction as pred



In [3]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [4]:
random_seeds = [144, 235, 905, 2895, 3462, 4225, 5056, 5192, 7751, 7813]

In [17]:
def make_w2v_model(dataset, name_for_model, model_features=None):
    """Produce a Word2Vec Model

    Model_features (list): Features of the word to vec models
        1. Word vector dimensionality
        2. Minimum word count
        3. Number of threads to run in parallel
        4. Context window size
        5. Downsample setting for frequent words

    """

    print ('Parsing datasets sentences')

    sentences = [fr.sentence_to_wordlist(sen) for sen in dataset]

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

    # Set values for various parameters
    if model_features:
        num_features = model_features[0] #300  # Word vector dimensionality
        min_word_count = model_features[1] #5  # Minimum word count
        num_workers = model_features[2] #4  # Number of threads to run in parallel
        context = model_features[3] #6  # Context window size
        downsampling = model_features[4] #0.001  # Downsample setting for frequent words
    else:
        num_features = 600  # Word vector dimensionality
        min_word_count = 6  # Minimum word count
        num_workers = 4  # Number of threads to run in parallel
        context = 7  # Context window size
        downsampling = 0.0001  # Downsample setting for frequent words

    print('Training Word2Vec Model')

    model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count=min_word_count, \
            window=context, sample=downsampling)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=False)

    model_name = 'Results/' + name_for_model + '_model'

    model.save(model_name)

    w2v_model = model

    return w2v_model

In [6]:
# Yeast using old model (w/o skipgramms and hierarchical softmax)

yeast_strict = pickle.load(open('../../Results/Yeast/yeast_mentions_strict_real.pkl', 'rb'))
yeast_gen = pickle.load(open('../../Results/Yeast/yeast_mentions_gen_real.pkl', 'rb'))
yeast_be = pickle.load(open('../../Results/Yeast/yeast_mentions_be_real.pkl', 'rb'))

In [14]:
yeast_strict_model = make_w2v_model(yeast_strict, 'yeast_strict_kkas')
yeast_gen_model = make_w2v_model(yeast_gen, 'yeast_gen_kkas')
yeast_be_model = make_w2v_model(yeast_be, 'yeast_be_kkas')

2017-06-02 17:09:36,218 : INFO : collecting all words and their counts
2017-06-02 17:09:36,219 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:09:36,252 : INFO : collected 11431 word types from a corpus of 188505 raw words and 7145 sentences
2017-06-02 17:09:36,253 : INFO : Loading a fresh vocabulary
2017-06-02 17:09:36,271 : INFO : min_count=5 retains 3346 unique words (29% of original 11431, drops 8085)
2017-06-02 17:09:36,272 : INFO : min_count=5 leaves 175425 word corpus (93% of original 188505, drops 13080)
2017-06-02 17:09:36,281 : INFO : deleting the raw counts dictionary of 11431 items
2017-06-02 17:09:36,282 : INFO : sample=0.001 downsamples 41 most-common words
2017-06-02 17:09:36,282 : INFO : downsampling leaves estimated 125739 word corpus (71.7% of prior 175425)
2017-06-02 17:09:36,283 : INFO : estimated required memory for 3346 words and 100 dimensions: 4349800 bytes
2017-06-02 17:09:36,295 : INFO : resetting layer weights
2017-06-

Parsing datasets sentences
Training Word2Vec Model


2017-06-02 17:09:37,344 : INFO : PROGRESS: at 91.27% examples, 566814 words/s, in_qsize 5, out_qsize 0
2017-06-02 17:09:37,414 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-06-02 17:09:37,423 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-06-02 17:09:37,434 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-06-02 17:09:37,434 : INFO : training on 942525 raw words (628544 effective words) took 1.1s, 570927 effective words/s
2017-06-02 17:09:37,435 : INFO : precomputing L2-norms of word weight vectors
2017-06-02 17:09:37,437 : INFO : saving Word2Vec object under Results/yeast_strict_kkas_model, separately None
2017-06-02 17:09:37,437 : INFO : not storing attribute syn0norm
2017-06-02 17:09:37,438 : INFO : not storing attribute cum_table
2017-06-02 17:09:37,463 : INFO : saved Results/yeast_strict_kkas_model


Parsing datasets sentences


2017-06-02 17:09:37,990 : INFO : collecting all words and their counts
2017-06-02 17:09:37,991 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:09:38,040 : INFO : PROGRESS: at sentence #10000, processed 241924 words, keeping 11195 word types
2017-06-02 17:09:38,091 : INFO : PROGRESS: at sentence #20000, processed 485055 words, keeping 14860 word types
2017-06-02 17:09:38,148 : INFO : PROGRESS: at sentence #30000, processed 727846 words, keeping 17547 word types


Training Word2Vec Model


2017-06-02 17:09:38,205 : INFO : PROGRESS: at sentence #40000, processed 969967 words, keeping 19750 word types
2017-06-02 17:09:38,245 : INFO : collected 21147 word types from a corpus of 1155938 raw words and 47662 sentences
2017-06-02 17:09:38,246 : INFO : Loading a fresh vocabulary
2017-06-02 17:09:38,271 : INFO : min_count=5 retains 8077 unique words (38% of original 21147, drops 13070)
2017-06-02 17:09:38,272 : INFO : min_count=5 leaves 1134115 word corpus (98% of original 1155938, drops 21823)
2017-06-02 17:09:38,292 : INFO : deleting the raw counts dictionary of 21147 items
2017-06-02 17:09:38,294 : INFO : sample=0.001 downsamples 46 most-common words
2017-06-02 17:09:38,294 : INFO : downsampling leaves estimated 829411 word corpus (73.1% of prior 1134115)
2017-06-02 17:09:38,295 : INFO : estimated required memory for 8077 words and 100 dimensions: 10500100 bytes
2017-06-02 17:09:38,324 : INFO : resetting layer weights
2017-06-02 17:09:38,404 : INFO : training model with 3 work

Parsing datasets sentences


2017-06-02 17:09:48,643 : INFO : collecting all words and their counts
2017-06-02 17:09:48,644 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:09:48,687 : INFO : PROGRESS: at sentence #10000, processed 228368 words, keeping 12408 word types
2017-06-02 17:09:48,747 : INFO : PROGRESS: at sentence #20000, processed 455332 words, keeping 16984 word types
2017-06-02 17:09:48,814 : INFO : PROGRESS: at sentence #30000, processed 686390 words, keeping 20127 word types


Training Word2Vec Model


2017-06-02 17:09:48,870 : INFO : PROGRESS: at sentence #40000, processed 915781 words, keeping 22805 word types
2017-06-02 17:09:48,921 : INFO : PROGRESS: at sentence #50000, processed 1145029 words, keeping 25157 word types
2017-06-02 17:09:48,973 : INFO : PROGRESS: at sentence #60000, processed 1375391 words, keeping 27216 word types
2017-06-02 17:09:49,022 : INFO : PROGRESS: at sentence #70000, processed 1605672 words, keeping 29012 word types
2017-06-02 17:09:49,060 : INFO : PROGRESS: at sentence #80000, processed 1834240 words, keeping 30678 word types
2017-06-02 17:09:49,101 : INFO : PROGRESS: at sentence #90000, processed 2061794 words, keeping 32258 word types
2017-06-02 17:09:49,142 : INFO : PROGRESS: at sentence #100000, processed 2291681 words, keeping 33687 word types
2017-06-02 17:09:49,181 : INFO : PROGRESS: at sentence #110000, processed 2521584 words, keeping 35002 word types
2017-06-02 17:09:49,223 : INFO : PROGRESS: at sentence #120000, processed 2749912 words, keepin

In [16]:
yeast_strict_model = make_w2v_model(yeast_strict, 'yeast_strict_kkas')
yeast_gen_model = make_w2v_model(yeast_gen, 'yeast_gen_kkas')
yeast_be_model = make_w2v_model(yeast_be, 'yeast_be_kkas')

2017-06-02 17:11:16,975 : INFO : collecting all words and their counts
2017-06-02 17:11:16,975 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:11:17,011 : INFO : collected 11431 word types from a corpus of 188505 raw words and 7145 sentences
2017-06-02 17:11:17,011 : INFO : Loading a fresh vocabulary
2017-06-02 17:11:17,028 : INFO : min_count=5 retains 3346 unique words (29% of original 11431, drops 8085)
2017-06-02 17:11:17,028 : INFO : min_count=5 leaves 175425 word corpus (93% of original 188505, drops 13080)
2017-06-02 17:11:17,037 : INFO : deleting the raw counts dictionary of 11431 items
2017-06-02 17:11:17,039 : INFO : sample=0.001 downsamples 41 most-common words
2017-06-02 17:11:17,042 : INFO : downsampling leaves estimated 125739 word corpus (71.7% of prior 175425)
2017-06-02 17:11:17,043 : INFO : estimated required memory for 3346 words and 100 dimensions: 6357400 bytes
2017-06-02 17:11:17,047 : INFO : constructing a huffman tree from

Parsing datasets sentences
Training Word2Vec Model


2017-06-02 17:11:17,896 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-06-02 17:11:17,900 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-06-02 17:11:17,900 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-06-02 17:11:17,901 : INFO : training on 942525 raw words (628224 effective words) took 0.7s, 854562 effective words/s
2017-06-02 17:11:17,901 : INFO : precomputing L2-norms of word weight vectors
2017-06-02 17:11:17,903 : INFO : saving Word2Vec object under Results/yeast_strict_kkas_model, separately None
2017-06-02 17:11:17,904 : INFO : not storing attribute syn0norm
2017-06-02 17:11:17,904 : INFO : not storing attribute cum_table
2017-06-02 17:11:17,967 : INFO : saved Results/yeast_strict_kkas_model


Parsing datasets sentences


2017-06-02 17:11:18,530 : INFO : collecting all words and their counts
2017-06-02 17:11:18,531 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:11:18,576 : INFO : PROGRESS: at sentence #10000, processed 241924 words, keeping 11195 word types
2017-06-02 17:11:18,622 : INFO : PROGRESS: at sentence #20000, processed 485055 words, keeping 14860 word types
2017-06-02 17:11:18,667 : INFO : PROGRESS: at sentence #30000, processed 727846 words, keeping 17547 word types
2017-06-02 17:11:18,714 : INFO : PROGRESS: at sentence #40000, processed 969967 words, keeping 19750 word types


Training Word2Vec Model


2017-06-02 17:11:18,748 : INFO : collected 21147 word types from a corpus of 1155938 raw words and 47662 sentences
2017-06-02 17:11:18,749 : INFO : Loading a fresh vocabulary
2017-06-02 17:11:18,771 : INFO : min_count=5 retains 8077 unique words (38% of original 21147, drops 13070)
2017-06-02 17:11:18,772 : INFO : min_count=5 leaves 1134115 word corpus (98% of original 1155938, drops 21823)
2017-06-02 17:11:18,793 : INFO : deleting the raw counts dictionary of 21147 items
2017-06-02 17:11:18,794 : INFO : sample=0.001 downsamples 46 most-common words
2017-06-02 17:11:18,795 : INFO : downsampling leaves estimated 829411 word corpus (73.1% of prior 1134115)
2017-06-02 17:11:18,796 : INFO : estimated required memory for 8077 words and 100 dimensions: 15346300 bytes
2017-06-02 17:11:18,808 : INFO : constructing a huffman tree from 8077 words
2017-06-02 17:11:19,179 : INFO : built huffman tree with maximum node depth 18
2017-06-02 17:11:19,202 : INFO : resetting layer weights
2017-06-02 17:1

Parsing datasets sentences


2017-06-02 17:11:29,139 : INFO : collecting all words and their counts
2017-06-02 17:11:29,139 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:11:29,179 : INFO : PROGRESS: at sentence #10000, processed 228368 words, keeping 12408 word types
2017-06-02 17:11:29,219 : INFO : PROGRESS: at sentence #20000, processed 455332 words, keeping 16984 word types
2017-06-02 17:11:29,260 : INFO : PROGRESS: at sentence #30000, processed 686390 words, keeping 20127 word types
2017-06-02 17:11:29,303 : INFO : PROGRESS: at sentence #40000, processed 915781 words, keeping 22805 word types


Training Word2Vec Model


2017-06-02 17:11:29,349 : INFO : PROGRESS: at sentence #50000, processed 1145029 words, keeping 25157 word types
2017-06-02 17:11:29,398 : INFO : PROGRESS: at sentence #60000, processed 1375391 words, keeping 27216 word types
2017-06-02 17:11:29,446 : INFO : PROGRESS: at sentence #70000, processed 1605672 words, keeping 29012 word types
2017-06-02 17:11:29,490 : INFO : PROGRESS: at sentence #80000, processed 1834240 words, keeping 30678 word types
2017-06-02 17:11:29,536 : INFO : PROGRESS: at sentence #90000, processed 2061794 words, keeping 32258 word types
2017-06-02 17:11:29,585 : INFO : PROGRESS: at sentence #100000, processed 2291681 words, keeping 33687 word types
2017-06-02 17:11:29,633 : INFO : PROGRESS: at sentence #110000, processed 2521584 words, keeping 35002 word types
2017-06-02 17:11:29,673 : INFO : PROGRESS: at sentence #120000, processed 2749912 words, keeping 36257 word types
2017-06-02 17:11:29,714 : INFO : PROGRESS: at sentence #130000, processed 2978987 words, keep

In [18]:
yeast_strict_model = make_w2v_model(yeast_strict, 'yeast_strict_kkas')
yeast_gen_model = make_w2v_model(yeast_gen, 'yeast_gen_kkas')
yeast_be_model = make_w2v_model(yeast_be, 'yeast_be_kkas')

2017-06-02 17:12:37,960 : INFO : collecting all words and their counts
2017-06-02 17:12:37,961 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:12:37,997 : INFO : collected 11431 word types from a corpus of 188505 raw words and 7145 sentences
2017-06-02 17:12:37,998 : INFO : Loading a fresh vocabulary
2017-06-02 17:12:38,012 : INFO : min_count=5 retains 3346 unique words (29% of original 11431, drops 8085)
2017-06-02 17:12:38,013 : INFO : min_count=5 leaves 175425 word corpus (93% of original 188505, drops 13080)
2017-06-02 17:12:38,023 : INFO : deleting the raw counts dictionary of 11431 items
2017-06-02 17:12:38,024 : INFO : sample=0.001 downsamples 41 most-common words
2017-06-02 17:12:38,024 : INFO : downsampling leaves estimated 125739 word corpus (71.7% of prior 175425)
2017-06-02 17:12:38,025 : INFO : estimated required memory for 3346 words and 100 dimensions: 6357400 bytes
2017-06-02 17:12:38,030 : INFO : constructing a huffman tree from

Parsing datasets sentences
Training Word2Vec Model


2017-06-02 17:12:39,160 : INFO : PROGRESS: at 37.13% examples, 230653 words/s, in_qsize 5, out_qsize 0
2017-06-02 17:12:40,199 : INFO : PROGRESS: at 77.35% examples, 237288 words/s, in_qsize 5, out_qsize 0
2017-06-02 17:12:40,667 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-06-02 17:12:40,689 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-06-02 17:12:40,690 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-06-02 17:12:40,691 : INFO : training on 942525 raw words (628787 effective words) took 2.5s, 247452 effective words/s
2017-06-02 17:12:40,692 : INFO : precomputing L2-norms of word weight vectors
2017-06-02 17:12:40,695 : INFO : saving Word2Vec object under Results/yeast_strict_kkas_model, separately None
2017-06-02 17:12:40,696 : INFO : not storing attribute syn0norm
2017-06-02 17:12:40,696 : INFO : not storing attribute cum_table
2017-06-02 17:12:40,761 : INFO : saved Results/yeast_strict_kkas_model


Parsing datasets sentences


2017-06-02 17:12:41,268 : INFO : collecting all words and their counts
2017-06-02 17:12:41,269 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:12:41,311 : INFO : PROGRESS: at sentence #10000, processed 241924 words, keeping 11195 word types
2017-06-02 17:12:41,356 : INFO : PROGRESS: at sentence #20000, processed 485055 words, keeping 14860 word types
2017-06-02 17:12:41,401 : INFO : PROGRESS: at sentence #30000, processed 727846 words, keeping 17547 word types
2017-06-02 17:12:41,446 : INFO : PROGRESS: at sentence #40000, processed 969967 words, keeping 19750 word types


Training Word2Vec Model


2017-06-02 17:12:41,481 : INFO : collected 21147 word types from a corpus of 1155938 raw words and 47662 sentences
2017-06-02 17:12:41,482 : INFO : Loading a fresh vocabulary
2017-06-02 17:12:41,510 : INFO : min_count=5 retains 8077 unique words (38% of original 21147, drops 13070)
2017-06-02 17:12:41,511 : INFO : min_count=5 leaves 1134115 word corpus (98% of original 1155938, drops 21823)
2017-06-02 17:12:41,531 : INFO : deleting the raw counts dictionary of 21147 items
2017-06-02 17:12:41,532 : INFO : sample=0.001 downsamples 46 most-common words
2017-06-02 17:12:41,533 : INFO : downsampling leaves estimated 829411 word corpus (73.1% of prior 1134115)
2017-06-02 17:12:41,533 : INFO : estimated required memory for 8077 words and 100 dimensions: 15346300 bytes
2017-06-02 17:12:41,548 : INFO : constructing a huffman tree from 8077 words
2017-06-02 17:12:41,732 : INFO : built huffman tree with maximum node depth 18
2017-06-02 17:12:41,750 : INFO : resetting layer weights
2017-06-02 17:1

Parsing datasets sentences


2017-06-02 17:13:02,175 : INFO : collecting all words and their counts
2017-06-02 17:13:02,175 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-06-02 17:13:02,217 : INFO : PROGRESS: at sentence #10000, processed 228368 words, keeping 12408 word types
2017-06-02 17:13:02,265 : INFO : PROGRESS: at sentence #20000, processed 455332 words, keeping 16984 word types
2017-06-02 17:13:02,313 : INFO : PROGRESS: at sentence #30000, processed 686390 words, keeping 20127 word types
2017-06-02 17:13:02,360 : INFO : PROGRESS: at sentence #40000, processed 915781 words, keeping 22805 word types


Training Word2Vec Model


2017-06-02 17:13:02,405 : INFO : PROGRESS: at sentence #50000, processed 1145029 words, keeping 25157 word types
2017-06-02 17:13:02,454 : INFO : PROGRESS: at sentence #60000, processed 1375391 words, keeping 27216 word types
2017-06-02 17:13:02,500 : INFO : PROGRESS: at sentence #70000, processed 1605672 words, keeping 29012 word types
2017-06-02 17:13:02,546 : INFO : PROGRESS: at sentence #80000, processed 1834240 words, keeping 30678 word types
2017-06-02 17:13:02,587 : INFO : PROGRESS: at sentence #90000, processed 2061794 words, keeping 32258 word types
2017-06-02 17:13:02,629 : INFO : PROGRESS: at sentence #100000, processed 2291681 words, keeping 33687 word types
2017-06-02 17:13:02,672 : INFO : PROGRESS: at sentence #110000, processed 2521584 words, keeping 35002 word types
2017-06-02 17:13:02,713 : INFO : PROGRESS: at sentence #120000, processed 2749912 words, keeping 36257 word types
2017-06-02 17:13:02,754 : INFO : PROGRESS: at sentence #130000, processed 2978987 words, keep

In [22]:
def XGB_classifier(train_vector, test_vector,
                   labels_train, labels_test,
                   feature_selection=False):
    """Perform XGB Classification"""
    if feature_selection:
        clf = ExtraTreesClassifier(n_estimators=100)
        clf = clf.fit(train_vector, labels_train)
        model = SelectFromModel(clf, prefit=True)
        train_vector = model.transform(train_vector)
        test_vector = model.transform(test_vector)

    xgb_clf = XGBClassifier(seed=24)
    print ("\n Fitting XGBoost Model!")
    xgb_clf = xgb_clf.fit(train_vector, labels_train)
    print ("\n Making Predictions")
    result = xgb_clf.predict(test_vector)
    probs = xgb_clf.predict_proba(test_vector)[:, 1]
    predictions = [round(val) for val in result]
    error = get_accuracy(predictions, labels_test)
    return result, error, probs

In [23]:
def get_accuracy(l_new, l_te):
    """Calculates the accuracy of predicted labels, based on the given labels

    INPUT: New(Predicted) Labels, Test Labels

    OUTPUT: Error  """

    acc = 0

    for i in range(len(l_te)):
        if l_new[i] == l_te[i]:
            acc += 1

    acc = float(acc / len(l_te))

    return 1-acc

In [8]:
def modelfit(alg, train_vecs, train_labels, test_vecs, test_labels, w2v_model_type, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param=alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_vecs, 
                              label=train_labels)
        cvresult = xgb.cv(xgb_param, 
                          xgtrain, 
                          num_boost_round=alg.get_params()['n_estimators'], 
                          nfold=cv_folds, 
                          metrics='auc', 
                          early_stopping_rounds=50)
        alg.set_params(n_estimators=cvresult.shape[0])
        
    #fit the algorithm on the data
    alg.fit(train_vecs, train_labels, eval_metric='auc')
    
    #Predict training set:
    test_predictions = alg.predict(test_vecs)
    test_predprob = alg.predict_proba(test_vecs)[:,1]
    
    #Metrics
    accuracy = metrics.accuracy_score(test_labels, test_predictions)
    roc_auc = metrics.roc_auc_score(test_labels, test_predprob)
    class_report = metrics.classification_report(test_labels, test_predictions)
    
    #Print Model report:
    print(w2v_model_type, '\nModel Report')
    print(w2v_model_type, 'Accuracy: %.4g' % accuracy)
    print(w2v_model_type, 'AUC Score (Train): %f' % roc_auc)
    print(w2v_model_type, 'Report \n', class_report)
    
    
#     feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importance')
#     plt.ylabel('Feature Importance Score')

    return accuracy, roc_auc, test_predictions, test_predprob, class_report

In [11]:
import time
name_of_result = 'yeast_OLD'
strict_data = yeast_strict
w2v_strict = yeast_strict_model
w2v_gen = yeast_gen_model
w2v_be = yeast_be_model
xgb_clf = XGBClassifier(seed=24)
start = time.time()
for seed in random_seeds:
        strict_list_SR = pred.make_models(strict_data,
                                          name_of_result+'_SR_'+str(seed),
                                          prev_model=w2v_strict,
                                          ran_state=seed)

        strict_list_GEN = pred.make_models(strict_data,
                                           name_of_result+'_GEN_'+str(seed),
                                           prev_model=w2v_gen,
                                           ran_state=seed)
        strict_list_BE = pred.make_models(strict_data,
                                          name_of_result+'_BE_'+str(seed),
                                          prev_model=w2v_be,
                                          ran_state=seed)

        strict_final_list = [strict_list_SR,
                             strict_list_GEN,
                             strict_list_BE]

        print ('\nPredicting\n')
        accuracy = []
        probs = []
        fpr = []
        tpr = []
        labels = []
        auc_score = []
        report = []

        for entry, model_name in zip(strict_final_list, ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]):
            accuracy_norm, auc_score_norm, pred_labels_norm, probs_norm, class_report_norm  = modelfit(xgb_clf, 
                                                                                                       entry[0], 
                                                                                                       entry[2], 
                                                                                                       entry[1], 
                                                                                                       entry[3], 
                                                                                                       model_name)
            fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

            accuracy.append([accuracy_norm])
            probs.append([probs_norm])
            fpr.append([fpr_norm])
            tpr.append([tpr_norm])
            labels.append([pred_labels_norm])
            auc_score.append([auc_score_norm])
            report.append([class_report_norm])

        pickle.dump(accuracy, open('Results/'+name_of_result+'_accuracy_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(probs, open('Results/'+name_of_result+'_probs_pickle_'+str(seed)+'.pkl',
                                'wb'))
        pickle.dump(fpr, open('Results/'+name_of_result+'_fpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(tpr, open('Results/'+name_of_result+'_tpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(labels, open('Results/'+name_of_result+'_labels_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(auc_score, open('Results/'+name_of_result+'_auc_score_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(report, open('Results/'+name_of_result+'_report_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        
print('Took ', time.time()-start, ' seconds')


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.5256
SR 144 AUC Score (Train): 0.544850
SR 144 Report 
              precision    recall  f1-score   support

          0       0.48      0.83      0.61        35
          1       0.67      0.28      0.39        43

avg / total       0.58      0.53      0.49        78

GEN 144 
Model Report
GEN 144 Accuracy: 0.5897
GEN 144 AUC Score (Train): 0.654485
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.53      0.86      0.65        35
          1       0.76      0.37      0.50        43

avg / total       0.66      0.59      0.57        78

BE 144 
Model Report
BE 144 Accuracy: 0.5769
BE 144 AUC Score (Train): 0.714286
BE 144 Report 
              precision    recall  f1-score   support

          0       0.52      0.86      0.65        35
          1       0.75      0.35      0.48        43

avg / total       0.65      0.58      0.55        78


Predicting

SR 235 
Model Report
SR 235 Accuracy:

In [36]:
name_of_result = 'yeast_NEW'
strict_data = yeast_strict
w2v_strict = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_strict_w2v_model')
w2v_gen = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_gen_w2v_model')
w2v_be = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_be_w2v_model')
xgb_clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=6,
                        min_child_weight=1,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=0.8,
                        reg_alpha=0.01,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        seed=24)
start = time.time()
for seed in random_seeds:
        strict_list_SR = pred.make_models(strict_data,
                                          name_of_result+'_SR_'+str(seed),
                                          prev_model=w2v_strict,
                                          ran_state=seed)

        strict_list_GEN = pred.make_models(strict_data,
                                           name_of_result+'_GEN_'+str(seed),
                                           prev_model=w2v_gen,
                                           ran_state=seed)
        strict_list_BE = pred.make_models(strict_data,
                                          name_of_result+'_BE_'+str(seed),
                                          prev_model=w2v_be,
                                          ran_state=seed)

        strict_final_list = [strict_list_SR,
                             strict_list_GEN,
                             strict_list_BE]

        print ('\nPredicting\n')
        accuracy = []
        probs = []
        fpr = []
        tpr = []
        labels = []
        auc_score = []
        report = []

        for entry, model_name in zip(strict_final_list, ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]):
            accuracy_norm, auc_score_norm, pred_labels_norm, probs_norm, class_report_norm  = modelfit(xgb_clf, 
                                                                                                       entry[0], 
                                                                                                       entry[2], 
                                                                                                       entry[1], 
                                                                                                       entry[3], 
                                                                                                       model_name)
            fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

            accuracy.append([accuracy_norm])
            probs.append([probs_norm])
            fpr.append([fpr_norm])
            tpr.append([tpr_norm])
            labels.append([pred_labels_norm])
            auc_score.append([auc_score_norm])
            report.append([class_report_norm])

        pickle.dump(accuracy, open('Results/'+name_of_result+'_accuracy_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(probs, open('Results/'+name_of_result+'_probs_pickle_'+str(seed)+'.pkl',
                                'wb'))
        pickle.dump(fpr, open('Results/'+name_of_result+'_fpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(tpr, open('Results/'+name_of_result+'_tpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(labels, open('Results/'+name_of_result+'_labels_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(auc_score, open('Results/'+name_of_result+'_auc_score_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(report, open('Results/'+name_of_result+'_report_pickle_'+str(seed)+'.pkl',
                                 'wb'))
print('Took ', time.time()-start, ' seconds')

2017-06-02 13:33:46,449 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:33:46,585 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_strict_w2v_model.wv.* with mmap=None
2017-06-02 13:33:46,585 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:33:46,586 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:33:46,587 : INFO : loaded ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:33:46,592 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:33:46,937 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_gen_w2v_model.wv.* with mmap=None
2017-06-02 13:33:46,938 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:33:46,939 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:33:46,939 : INFO : loaded ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:33:46,952 : INFO : load


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.6094
SR 144 AUC Score (Train): 0.637638
SR 144 Report 
              precision    recall  f1-score   support

          0       0.64      0.73      0.68        37
          1       0.55      0.44      0.49        27

avg / total       0.60      0.61      0.60        64

GEN 144 
Model Report
GEN 144 Accuracy: 0.6719
GEN 144 AUC Score (Train): 0.684685
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.66      0.89      0.76        37
          1       0.71      0.37      0.49        27

avg / total       0.68      0.67      0.64        64

BE 144 
Model Report
BE 144 Accuracy: 0.6562
BE 144 AUC Score (Train): 0.727728
BE 144 Report 
              precision    recall  f1-score   support

          0       0.65      0.89      0.75        37
          1       0.69      0.33      0.45        27

avg / total       0.67      0.66      0.62        64



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.7101
SR 235 AUC Score (Train): 0.735993
SR 235 Report 
              precision    recall  f1-score   support

          0       0.71      0.79      0.75        38
          1       0.70      0.61      0.66        31

avg / total       0.71      0.71      0.71        69

GEN 235 
Model Report
GEN 235 Accuracy: 0.6957
GEN 235 AUC Score (Train): 0.723260
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.71      0.76      0.73        38
          1       0.68      0.61      0.64        31

avg / total       0.69      0.70      0.69        69

BE 235 
Model Report
BE 235 Accuracy: 0.6812
BE 235 AUC Score (Train): 0.721562
BE 235 Report 
              precision    recall  f1-score   support

          0       0.69      0.76      0.72        38
          1       0.67      0.58      0.62        31

avg / total       0.68      0.68      0.68        69


Predicting

SR 905 
Model Report
SR 905 Accuracy:

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 2895 
Model Report
SR 2895 Accuracy: 0.6739
SR 2895 AUC Score (Train): 0.760096
SR 2895 Report 
              precision    recall  f1-score   support

          0       0.69      0.77      0.73        52
          1       0.65      0.55      0.59        40

avg / total       0.67      0.67      0.67        92

GEN 2895 
Model Report
GEN 2895 Accuracy: 0.6957
GEN 2895 AUC Score (Train): 0.724519
GEN 2895 Report 
              precision    recall  f1-score   support

          0       0.69      0.83      0.75        52
          1       0.70      0.53      0.60        40

avg / total       0.70      0.70      0.69        92

BE 2895 
Model Report
BE 2895 Accuracy: 0.6522
BE 2895 AUC Score (Train): 0.716827
BE 2895 Report 
              precision    recall  f1-score   support

          0       0.64      0.87      0.74        52
          1       0.68      0.38      0.48        40

avg / total       0.66      0.65      0.63        92



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 3462 
Model Report
SR 3462 Accuracy: 0.6753
SR 3462 AUC Score (Train): 0.729420
SR 3462 Report 
              precision    recall  f1-score   support

          0       0.68      0.67      0.68        39
          1       0.67      0.68      0.68        38

avg / total       0.68      0.68      0.68        77

GEN 3462 
Model Report
GEN 3462 Accuracy: 0.7143
GEN 3462 AUC Score (Train): 0.731444
GEN 3462 Report 
              precision    recall  f1-score   support

          0       0.72      0.72      0.72        39
          1       0.71      0.71      0.71        38

avg / total       0.71      0.71      0.71        77

BE 3462 
Model Report
BE 3462 Accuracy: 0.7013
BE 3462 AUC Score (Train): 0.767881
BE 3462 Report 
              precision    recall  f1-score   support

          0       0.72      0.67      0.69        39
          1       0.68      0.74      0.71        38

avg / total       0.70      0.70      0.70        77


Predicting

SR 4225 
Model Report
SR 

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5056 
Model Report
SR 5056 Accuracy: 0.8235
SR 5056 AUC Score (Train): 0.867424
SR 5056 Report 
              precision    recall  f1-score   support

          0       0.90      0.82      0.86        22
          1       0.71      0.83      0.77        12

avg / total       0.83      0.82      0.83        34

GEN 5056 
Model Report
GEN 5056 Accuracy: 0.7353
GEN 5056 AUC Score (Train): 0.829545
GEN 5056 Report 
              precision    recall  f1-score   support

          0       0.84      0.73      0.78        22
          1       0.60      0.75      0.67        12

avg / total       0.76      0.74      0.74        34

BE 5056 
Model Report
BE 5056 Accuracy: 0.7941
BE 5056 AUC Score (Train): 0.818182
BE 5056 Report 
              precision    recall  f1-score   support

          0       0.89      0.77      0.83        22
          1       0.67      0.83      0.74        12

avg / total       0.81      0.79      0.80        34



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5192 
Model Report
SR 5192 Accuracy: 0.6582
SR 5192 AUC Score (Train): 0.759973
SR 5192 Report 
              precision    recall  f1-score   support

          0       0.55      0.84      0.67        32
          1       0.83      0.53      0.65        47

avg / total       0.72      0.66      0.66        79

GEN 5192 
Model Report
GEN 5192 Accuracy: 0.6456
GEN 5192 AUC Score (Train): 0.767287
GEN 5192 Report 
              precision    recall  f1-score   support

          0       0.54      0.78      0.64        32
          1       0.79      0.55      0.65        47

avg / total       0.69      0.65      0.65        79

BE 5192 
Model Report
BE 5192 Accuracy: 0.6709
BE 5192 AUC Score (Train): 0.754654
BE 5192 Report 
              precision    recall  f1-score   support

          0       0.56      0.88      0.68        32
          1       0.86      0.53      0.66        47

avg / total       0.74      0.67      0.67        79



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7751 
Model Report
SR 7751 Accuracy: 0.6
SR 7751 AUC Score (Train): 0.605600
SR 7751 Report 
              precision    recall  f1-score   support

          0       0.78      0.56      0.65        50
          1       0.44      0.68      0.53        25

avg / total       0.66      0.60      0.61        75

GEN 7751 
Model Report
GEN 7751 Accuracy: 0.56
GEN 7751 AUC Score (Train): 0.694400
GEN 7751 Report 
              precision    recall  f1-score   support

          0       0.76      0.50      0.60        50
          1       0.40      0.68      0.51        25

avg / total       0.64      0.56      0.57        75

BE 7751 
Model Report
BE 7751 Accuracy: 0.6267
BE 7751 AUC Score (Train): 0.685600
BE 7751 Report 
              precision    recall  f1-score   support

          0       0.81      0.58      0.67        50
          1       0.46      0.72      0.56        25

avg / total       0.69      0.63      0.64        75



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7813 
Model Report
SR 7813 Accuracy: 0.6234
SR 7813 AUC Score (Train): 0.688098
SR 7813 Report 
              precision    recall  f1-score   support

          0       0.56      0.65      0.60        34
          1       0.68      0.60      0.64        43

avg / total       0.63      0.62      0.62        77

GEN 7813 
Model Report
GEN 7813 Accuracy: 0.7273
GEN 7813 AUC Score (Train): 0.756498
GEN 7813 Report 
              precision    recall  f1-score   support

          0       0.67      0.76      0.71        34
          1       0.79      0.70      0.74        43

avg / total       0.74      0.73      0.73        77

BE 7813 
Model Report
BE 7813 Accuracy: 0.6883
BE 7813 AUC Score (Train): 0.720246
BE 7813 Report 
              precision    recall  f1-score   support

          0       0.67      0.59      0.62        34
          1       0.70      0.77      0.73        43

avg / total       0.69      0.69      0.69        77

Took  1404.1214997768402  seconds


In [37]:
name_of_result = 'yeast_NEW_w2v_OLD_xgb'
strict_data = yeast_strict
w2v_strict = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_strict_w2v_model')
w2v_gen = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_gen_w2v_model')
w2v_be = word2vec.Word2Vec.load('../../Results/Yeast/models/yeast_be_w2v_model')
xgb_clf = XGBClassifier(seed=24)
start = time.time()
for seed in random_seeds:
        strict_list_SR = pred.make_models(strict_data,
                                          name_of_result+'_SR_'+str(seed),
                                          prev_model=w2v_strict,
                                          ran_state=seed)

        strict_list_GEN = pred.make_models(strict_data,
                                           name_of_result+'_GEN_'+str(seed),
                                           prev_model=w2v_gen,
                                           ran_state=seed)
        strict_list_BE = pred.make_models(strict_data,
                                          name_of_result+'_BE_'+str(seed),
                                          prev_model=w2v_be,
                                          ran_state=seed)

        strict_final_list = [strict_list_SR,
                             strict_list_GEN,
                             strict_list_BE]

        print ('\nPredicting\n')
        accuracy = []
        probs = []
        fpr = []
        tpr = []
        labels = []
        auc_score = []
        report = []

        for entry, model_name in zip(strict_final_list, ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]):
            accuracy_norm, auc_score_norm, pred_labels_norm, probs_norm, class_report_norm  = modelfit(xgb_clf, 
                                                                                                       entry[0], 
                                                                                                       entry[2], 
                                                                                                       entry[1], 
                                                                                                       entry[3], 
                                                                                                       model_name)
            fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

            accuracy.append([accuracy_norm])
            probs.append([probs_norm])
            fpr.append([fpr_norm])
            tpr.append([tpr_norm])
            labels.append([pred_labels_norm])
            auc_score.append([auc_score_norm])
            report.append([class_report_norm])

        pickle.dump(accuracy, open('Results/'+name_of_result+'_accuracy_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(probs, open('Results/'+name_of_result+'_probs_pickle_'+str(seed)+'.pkl',
                                'wb'))
        pickle.dump(fpr, open('Results/'+name_of_result+'_fpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(tpr, open('Results/'+name_of_result+'_tpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(labels, open('Results/'+name_of_result+'_labels_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(auc_score, open('Results/'+name_of_result+'_auc_score_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(report, open('Results/'+name_of_result+'_report_pickle_'+str(seed)+'.pkl',
                                 'wb'))
print('Took ', time.time()-start, ' seconds')

2017-06-02 13:57:11,912 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:57:12,086 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_strict_w2v_model.wv.* with mmap=None
2017-06-02 13:57:12,087 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:57:12,088 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:57:12,089 : INFO : loaded ../../Results/Yeast/models/yeast_strict_w2v_model
2017-06-02 13:57:12,106 : INFO : loading Word2Vec object from ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:57:12,559 : INFO : loading wv recursively from ../../Results/Yeast/models/yeast_gen_w2v_model.wv.* with mmap=None
2017-06-02 13:57:12,561 : INFO : setting ignored attribute syn0norm to None
2017-06-02 13:57:12,561 : INFO : setting ignored attribute cum_table to None
2017-06-02 13:57:12,562 : INFO : loaded ../../Results/Yeast/models/yeast_gen_w2v_model
2017-06-02 13:57:12,582 : INFO : load


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.625
SR 144 AUC Score (Train): 0.691692
SR 144 Report 
              precision    recall  f1-score   support

          0       0.65      0.76      0.70        37
          1       0.57      0.44      0.50        27

avg / total       0.62      0.62      0.62        64

GEN 144 
Model Report
GEN 144 Accuracy: 0.7188
GEN 144 AUC Score (Train): 0.731732
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.69      0.92      0.79        37
          1       0.80      0.44      0.57        27

avg / total       0.74      0.72      0.70        64

BE 144 
Model Report
BE 144 Accuracy: 0.6406
BE 144 AUC Score (Train): 0.724725
BE 144 Report 
              precision    recall  f1-score   support

          0       0.65      0.84      0.73        37
          1       0.62      0.37      0.47        27

avg / total       0.64      0.64      0.62        64



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 235 
Model Report
SR 235 Accuracy: 0.6812
SR 235 AUC Score (Train): 0.717317
SR 235 Report 
              precision    recall  f1-score   support

          0       0.70      0.74      0.72        38
          1       0.66      0.61      0.63        31

avg / total       0.68      0.68      0.68        69

GEN 235 
Model Report
GEN 235 Accuracy: 0.7246
GEN 235 AUC Score (Train): 0.719015
GEN 235 Report 
              precision    recall  f1-score   support

          0       0.72      0.82      0.77        38
          1       0.73      0.61      0.67        31

avg / total       0.73      0.72      0.72        69

BE 235 
Model Report
BE 235 Accuracy: 0.6522
BE 235 AUC Score (Train): 0.729202
BE 235 Report 
              precision    recall  f1-score   support

          0       0.68      0.71      0.69        38
          1       0.62      0.58      0.60        31

avg / total       0.65      0.65      0.65        69


Predicting

SR 905 
Model Report
SR 905 Accuracy:

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 2895 
Model Report
SR 2895 Accuracy: 0.6087
SR 2895 AUC Score (Train): 0.754327
SR 2895 Report 
              precision    recall  f1-score   support

          0       0.62      0.77      0.69        52
          1       0.57      0.40      0.47        40

avg / total       0.60      0.61      0.59        92

GEN 2895 
Model Report
GEN 2895 Accuracy: 0.6522
GEN 2895 AUC Score (Train): 0.710577
GEN 2895 Report 
              precision    recall  f1-score   support

          0       0.67      0.77      0.71        52
          1       0.62      0.50      0.56        40

avg / total       0.65      0.65      0.65        92

BE 2895 
Model Report
BE 2895 Accuracy: 0.6848
BE 2895 AUC Score (Train): 0.758173
BE 2895 Report 
              precision    recall  f1-score   support

          0       0.67      0.87      0.76        52
          1       0.72      0.45      0.55        40

avg / total       0.69      0.68      0.67        92



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 3462 
Model Report
SR 3462 Accuracy: 0.7013
SR 3462 AUC Score (Train): 0.717949
SR 3462 Report 
              precision    recall  f1-score   support

          0       0.71      0.69      0.70        39
          1       0.69      0.71      0.70        38

avg / total       0.70      0.70      0.70        77

GEN 3462 
Model Report
GEN 3462 Accuracy: 0.7532
GEN 3462 AUC Score (Train): 0.763158
GEN 3462 Report 
              precision    recall  f1-score   support

          0       0.74      0.79      0.77        39
          1       0.77      0.71      0.74        38

avg / total       0.75      0.75      0.75        77

BE 3462 
Model Report
BE 3462 Accuracy: 0.7013
BE 3462 AUC Score (Train): 0.804993
BE 3462 Report 
              precision    recall  f1-score   support

          0       0.74      0.64      0.68        39
          1       0.67      0.76      0.72        38

avg / total       0.71      0.70      0.70        77


Predicting

SR 4225 
Model Report
SR 

  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5056 
Model Report
SR 5056 Accuracy: 0.8235
SR 5056 AUC Score (Train): 0.863636
SR 5056 Report 
              precision    recall  f1-score   support

          0       0.90      0.82      0.86        22
          1       0.71      0.83      0.77        12

avg / total       0.83      0.82      0.83        34

GEN 5056 
Model Report
GEN 5056 Accuracy: 0.7647
GEN 5056 AUC Score (Train): 0.814394
GEN 5056 Report 
              precision    recall  f1-score   support

          0       0.85      0.77      0.81        22
          1       0.64      0.75      0.69        12

avg / total       0.78      0.76      0.77        34

BE 5056 
Model Report
BE 5056 Accuracy: 0.7647
BE 5056 AUC Score (Train): 0.863636
BE 5056 Report 
              precision    recall  f1-score   support

          0       0.85      0.77      0.81        22
          1       0.64      0.75      0.69        12

avg / total       0.78      0.76      0.77        34



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 5192 
Model Report
SR 5192 Accuracy: 0.6962
SR 5192 AUC Score (Train): 0.789229
SR 5192 Report 
              precision    recall  f1-score   support

          0       0.58      0.91      0.71        32
          1       0.90      0.55      0.68        47

avg / total       0.77      0.70      0.69        79

GEN 5192 
Model Report
GEN 5192 Accuracy: 0.6835
GEN 5192 AUC Score (Train): 0.767952
GEN 5192 Report 
              precision    recall  f1-score   support

          0       0.57      0.91      0.70        32
          1       0.89      0.53      0.67        47

avg / total       0.76      0.68      0.68        79

BE 5192 
Model Report
BE 5192 Accuracy: 0.7468
BE 5192 AUC Score (Train): 0.815160
BE 5192 Report 
              precision    recall  f1-score   support

          0       0.64      0.88      0.74        32
          1       0.89      0.66      0.76        47

avg / total       0.78      0.75      0.75        79



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7751 
Model Report
SR 7751 Accuracy: 0.64
SR 7751 AUC Score (Train): 0.672800
SR 7751 Report 
              precision    recall  f1-score   support

          0       0.79      0.62      0.70        50
          1       0.47      0.68      0.56        25

avg / total       0.69      0.64      0.65        75

GEN 7751 
Model Report
GEN 7751 Accuracy: 0.6
GEN 7751 AUC Score (Train): 0.663200
GEN 7751 Report 
              precision    recall  f1-score   support

          0       0.76      0.58      0.66        50
          1       0.43      0.64      0.52        25

avg / total       0.65      0.60      0.61        75

BE 7751 
Model Report
BE 7751 Accuracy: 0.6267
BE 7751 AUC Score (Train): 0.670400
BE 7751 Report 
              precision    recall  f1-score   support

          0       0.78      0.62      0.69        50
          1       0.46      0.64      0.53        25

avg / total       0.67      0.63      0.64        75



  feature_vec = np.divide(feature_vec,nwords)



Predicting

SR 7813 
Model Report
SR 7813 Accuracy: 0.6623
SR 7813 AUC Score (Train): 0.704514
SR 7813 Report 
              precision    recall  f1-score   support

          0       0.61      0.65      0.63        34
          1       0.71      0.67      0.69        43

avg / total       0.66      0.66      0.66        77

GEN 7813 
Model Report
GEN 7813 Accuracy: 0.6623
GEN 7813 AUC Score (Train): 0.760602
GEN 7813 Report 
              precision    recall  f1-score   support

          0       0.61      0.65      0.63        34
          1       0.71      0.67      0.69        43

avg / total       0.66      0.66      0.66        77

BE 7813 
Model Report
BE 7813 Accuracy: 0.6494
BE 7813 AUC Score (Train): 0.751026
BE 7813 Report 
              precision    recall  f1-score   support

          0       0.60      0.62      0.61        34
          1       0.69      0.67      0.68        43

avg / total       0.65      0.65      0.65        77

Took  1503.6061532497406  seconds


In [12]:
name_of_result = 'yeast_OLD_w2v_NEW_XGB'
strict_data = yeast_strict
w2v_strict = yeast_strict_model
w2v_gen = yeast_gen_model
w2v_be = yeast_be_model
xgb_clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=6,
                        min_child_weight=1,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=0.8,
                        reg_alpha=0.01,
                        objective='binary:logistic',
                        scale_pos_weight=1,
                        seed=24)
start = time.time()
for seed in random_seeds:
        strict_list_SR = pred.make_models(strict_data,
                                          name_of_result+'_SR_'+str(seed),
                                          prev_model=w2v_strict,
                                          ran_state=seed)

        strict_list_GEN = pred.make_models(strict_data,
                                           name_of_result+'_GEN_'+str(seed),
                                           prev_model=w2v_gen,
                                           ran_state=seed)
        strict_list_BE = pred.make_models(strict_data,
                                          name_of_result+'_BE_'+str(seed),
                                          prev_model=w2v_be,
                                          ran_state=seed)

        strict_final_list = [strict_list_SR,
                             strict_list_GEN,
                             strict_list_BE]

        print ('\nPredicting\n')
        accuracy = []
        probs = []
        fpr = []
        tpr = []
        labels = []
        auc_score = []
        report = []

        for entry, model_name in zip(strict_final_list, ['SR '+str(seed), 'GEN '+str(seed), 'BE '+str(seed)]):
            accuracy_norm, auc_score_norm, pred_labels_norm, probs_norm, class_report_norm  = modelfit(xgb_clf, 
                                                                                                       entry[0], 
                                                                                                       entry[2], 
                                                                                                       entry[1], 
                                                                                                       entry[3], 
                                                                                                       model_name)
            fpr_norm, tpr_norm, _ = roc_curve(entry[3], probs_norm)

            accuracy.append([accuracy_norm])
            probs.append([probs_norm])
            fpr.append([fpr_norm])
            tpr.append([tpr_norm])
            labels.append([pred_labels_norm])
            auc_score.append([auc_score_norm])
            report.append([class_report_norm])

        pickle.dump(accuracy, open('Results/'+name_of_result+'_accuracy_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(probs, open('Results/'+name_of_result+'_probs_pickle_'+str(seed)+'.pkl',
                                'wb'))
        pickle.dump(fpr, open('Results/'+name_of_result+'_fpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(tpr, open('Results/'+name_of_result+'_tpr_pickle_'+str(seed)+'.pkl',
                              'wb'))
        pickle.dump(labels, open('Results/'+name_of_result+'_labels_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(auc_score, open('Results/'+name_of_result+'_auc_score_pickle_'+str(seed)+'.pkl',
                                 'wb'))
        pickle.dump(report, open('Results/'+name_of_result+'_report_pickle_'+str(seed)+'.pkl',
                                 'wb'))
print('Took ', time.time()-start, ' seconds')


Predicting

SR 144 
Model Report
SR 144 Accuracy: 0.4744
SR 144 AUC Score (Train): 0.537542
SR 144 Report 
              precision    recall  f1-score   support

          0       0.45      0.71      0.55        35
          1       0.55      0.28      0.37        43

avg / total       0.50      0.47      0.45        78

GEN 144 
Model Report
GEN 144 Accuracy: 0.6154
GEN 144 AUC Score (Train): 0.657143
GEN 144 Report 
              precision    recall  f1-score   support

          0       0.54      0.89      0.67        35
          1       0.81      0.40      0.53        43

avg / total       0.69      0.62      0.60        78

BE 144 
Model Report
BE 144 Accuracy: 0.6667
BE 144 AUC Score (Train): 0.714286
BE 144 Report 
              precision    recall  f1-score   support

          0       0.59      0.86      0.70        35
          1       0.81      0.51      0.63        43

avg / total       0.71      0.67      0.66        78


Predicting



KeyboardInterrupt: 