## This Notebook is for training different ML models on Doc2Vec 

In [4]:
import pickle
import warnings
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.utils import shuffle
from molearn.classifiers.BR import BR
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from molearn.classifiers.Ensemble import Ensemble
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell
from sklearn.metrics import classification_report,f1_score
from molearn.classifiers.classifier_chains import CC,RCC,MCC

In [5]:
warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = "all"

### Define some useful methods

In [6]:
def Hamming_loss(Ytest,Ypred):
    ''' Hamming loss aka Hamming distance '''
    return 1.-Hamming_score(Ytest,Ypred)

def Hamming_score(Ytest,Ypred):
    ''' Hamming score aka Hamming match '''
    N_test,L = Ytest.shape
    return np.sum((Ytest == Ypred) * 1.) / N_test / L

def Hamming_matches(Ytest,Ypred):
    N_test,L = Ytest.shape
    return np.sum((Ytest == Ypred) * 1.,axis=0) / N_test 

def Hamming_losses(Ytest,Ypred):
    return 1.-Hamming_matches(Ytest,Ypred)

def Exact_match(Ytest,Ypred):
    N_test,L = Ytest.shape
    return np.sum(np.sum((Ytest == Ypred) * 1,axis=1)==L) * 1. / N_test

In [7]:
def grid_search(train_x, train_y, test_x, test_y, parameters, pipeline):
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=-1, verbose=10)
    grid_search_tune.fit(train_x, train_y)

    print
    print("Best parameters set:")
    print (grid_search_tune.best_estimator_.steps)
    print

    # measuring performance on test set
    print ("Applying best classifier on test data:")
    best_clf = grid_search_tune.best_estimator_
    
    predictions = best_clf.predict(test_x)
    print('grid_search_tune.best_estimator : ',grid_search_tune.best_estimator_.steps[0])
    return best_clf

In [8]:
## save model
def save_model(clf,filename,folder):
    pickle.dump(clf, open("/home/sina/trained-model/"+folder+"/"+filename, 'wb'))

## classification methods

In [9]:
def logisticRegression(x_train, x_test, y_train, y_test):
        
    print ("LogisticRegression")
    pipeline = Pipeline([
        ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
    ])
    parameters = {
        "clf__estimator__C": [0.1,1,10],
        "clf__estimator__class_weight": [None],
    }
    clf = grid_search(x_train, y_train, x_test, y_test, parameters, pipeline)
    return clf

In [10]:
def adaboost(x_train, x_test, y_train, y_test):
        
    print ("LogisticRegression")
    pipeline = Pipeline([
        ('clf', OneVsRestClassifier(AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    algorithm="SAMME"), n_jobs=-1)),
    ])
    parameters = {
        "clf__estimator__learning_rate": [1,1.5],
        "clf__estimator__n_estimators": [6],
    }
    clf = grid_search(x_train, y_train, x_test, y_test, parameters, pipeline)
    return clf

In [11]:
def naivebayes(x_train, x_test, y_train, y_test):
        
    print ("LogisticRegression")
    pipeline = Pipeline([
        ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None), n_jobs=-1)),
    ])
    parameters = {
        'clf__estimator__alpha': (1e-2, 1e-3)
    }
    clf = grid_search(x_train, y_train, x_test, y_test, parameters, pipeline)
    return clf

In [12]:
def svm(x_train, x_test, y_train, y_test):


    print ("LinearSVC")
    pipeline = Pipeline([
        ('clf', OneVsRestClassifier(SVC(), n_jobs=-1)),
    ])
    
    parameters ={
         'clf__estimator__kernel': ['rbf'],
         'clf__estimator__gamma': [1e-3],
         'clf__estimator__C': [10]
        }
        
    clf = grid_search(x_train, y_train, x_test, y_test, parameters, pipeline)    
    return clf

## 1.Doc2Vec

### 1.1 Reading The data

In [13]:
x_train = pd.read_csv('/home/sina/input/Doc2Vec/x_data_train_doc2vec.csv')
y_train = pd.read_csv('/home/sina/input/Doc2Vec/y_data_train_doc2vec.csv')
x_test = pd.read_csv('/home/sina/input/Doc2Vec/x_data_test_doc2vec.csv')
y_test = pd.read_csv('/home/sina/input/Doc2Vec/y_data_test_doc2vec.csv')
x_train = shuffle(x_train.as_matrix())
y_train = shuffle(y_train.as_matrix())
x_test = shuffle(x_test.as_matrix())
y_test = shuffle(y_test.as_matrix())
x_train = x_train[0:100000]
y_train = y_train[0:100000]
x_test = x_test[0:40000]
y_test = y_test[0:40000]
print('x_train shape: ', x_train.shape)
print('y_train shape: ', y_train.shape)
print('x_test shape: ', x_test.shape)
print('y_test shape: ', y_test.shape)

x_train shape:  (100000, 300)
y_train shape:  (100000, 28)
x_test shape:  (40000, 300)
y_test shape:  (40000, 28)


#### 1.3 shape of train and test

In [15]:
print('x_train shape: ', x_train.shape)
print('y_train shape: ', y_train.shape)
print('x_test shape: ', x_test.shape)
print('y_test shape: ', y_test.shape)

x_train shape:  (100000, 300)
y_train shape:  (100000, 28)
x_test shape:  (40000, 300)
y_test shape:  (40000, 28)


### 1.4 apply different algorithm

#### 1.4.1 Logistic regression

##### 1.4.1.1 OneVsRestClassifier

In [42]:
bst_clf_logisticregression_OneVsRestClassifier_Doc2Vec=logisticRegression(x_train, x_test, y_train, y_test)
save_model(bst_clf_logisticregression_OneVsRestClassifier_Doc2Vec,'logisticregression_OneVsRestClassifier_Doc2Vec.sav','Doc2Vec')
prediction_logisticregression_OneVsRestClassifier_Doc2Vec = bst_clf_logisticregression_OneVsRestClassifier_Doc2Vec.predict(x_test)

LogisticRegression
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] clf__estimator__C=0.1, clf__estimator__class_weight=None ........
[CV] clf__estimator__C=0.1, clf__estimator__class_weight=None ........
[CV] clf__estimator__C=1, clf__estimator__class_weight=None ..........
[CV] clf__estimator__C=1, clf__estimator__class_weight=None ..........
[CV] clf__estimator__C=10, clf__estimator__class_weight=None .........
[CV] clf__estimator__C=10, clf__estimator__class_weight=None .........
[CV]  clf__estimator__C=0.1, clf__estimator__class_weight=None, score=0.00324, total= 5.4min
[CV]  clf__estimator__C=10, clf__estimator__class_weight=None, score=0.00328, total= 5.9min


[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:  5.9min remaining: 11.9min


[CV]  clf__estimator__C=10, clf__estimator__class_weight=None, score=0.00236, total= 6.0min


[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  6.0min remaining:  6.0min


[CV]  clf__estimator__C=0.1, clf__estimator__class_weight=None, score=0.0024, total= 6.2min


[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:  6.2min remaining:  3.1min


[CV]  clf__estimator__C=1, clf__estimator__class_weight=None, score=0.00328, total= 6.2min
[CV]  clf__estimator__C=1, clf__estimator__class_weight=None, score=0.00236, total= 6.4min


[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  6.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  6.4min finished


Best parameters set:
[('clf', OneVsRestClassifier(estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1))]
Applying best classifier on test data:
grid_search_tune.best_estimator :  ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=-1))


##### 1.4.1.2 ClassifierChain

In [43]:
chains = [ClassifierChain(LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', solver='sag', tol=0.0001,
          verbose=0, warm_start=False), order='random', random_state=i)
          for i in range(10)]

for idx,chain in enumerate(chains):
    if idx%4==0 or idx==10:
        print(idx,"chain")
    chain.fit(x_train, y_train)

0 chain


ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=0)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=1)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=2)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=3)

4 chain


ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=4)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=5)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=6)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=7)

8 chain


ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=8)

ClassifierChain(base_estimator=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False),
        cv=None, order='random', random_state=9)

In [44]:
y_pred_chains = np.array([chain.predict(x_test) for chain in
                          chains])
predict_logisticregression_ensemble_ClassifierChain_Doc2Vec = y_pred_chains.mean(axis=0)

temp = []
org = []
for i in range(len(predict_logisticregression_ensemble_ClassifierChain_Doc2Vec)):
    for j in range(len(predict_logisticregression_ensemble_ClassifierChain_Doc2Vec[0])):
        temp.append(int(predict_logisticregression_ensemble_ClassifierChain_Doc2Vec[i][j]+0.8))
    org.append(temp)
    temp = []
    
predict_logisticregression_ensemble_ClassifierChain_Doc2Vec = np.asarray(org) 

In [45]:
cc = CC(h=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', solver='sag', tol=0.0001,
          verbose=0, warm_start=False))
cc.fit(x_train,y_train)
save_model(cc,'LogisticRegression_Doc2Vec_CC.sav','Doc2Vec')

<molearn.classifiers.classifier_chains.CC at 0x7f0bb2445ba8>

In [46]:
predict_logisticregression_CC_ClassifierChain_Doc2Vec=cc.predict(x_test)

In [47]:
br = BR(h=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', solver='sag', tol=0.0001,
          verbose=0, warm_start=False))
br.fit(x_train,y_train)
save_model(br,'LogisticRegression_Doc2Vec_BR.sav','Doc2Vec')

<molearn.classifiers.BR.BR at 0x7f0bb21a5400>

In [48]:
predict_logisticregression_BR_ClassifierChain_Doc2Vec=br.predict(x_test)

In [49]:
mcc = MCC(h=LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', solver='sag', tol=0.0001,
          verbose=0, warm_start=False))
mcc.fit(x_train,y_train)
save_model(mcc,'LogisticRegression_Doc2Vec_MCC.sav','Doc2Vec')

<molearn.classifiers.classifier_chains.MCC at 0x7f0ba689ab00>

In [50]:
predict_logisticregression_MCC_ClassifierChain_Doc2Vec=mcc.predict(x_test)

##### 1.4.1.3 visualization and scores

In [51]:
predict_logisticregression_ensemble_ClassifierChain_Doc2Vec.shape

(20000, 28)

In [52]:
f1_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec = f1_score(y_test,prediction_logisticregression_OneVsRestClassifier_Doc2Vec,average='weighted')
f1_score_prediction_logisticregression_pure_ClassifierChain_Doc2Vec = f1_score(y_test,predict_logisticregression_ensemble_ClassifierChain_Doc2Vec,average='weighted')
f1_score_logisticregression_CC_ClassifierChain_Doc2Vec = f1_score(y_test,predict_logisticregression_CC_ClassifierChain_Doc2Vec,average='weighted')
f1_score_logisticregression_BR_ClassifierChain_Doc2Vec = f1_score(y_test,predict_logisticregression_BR_ClassifierChain_Doc2Vec,average='weighted')
f1_score_logisticregression_MCC_ClassifierChain_Doc2Vec = f1_score(y_test,predict_logisticregression_MCC_ClassifierChain_Doc2Vec,average='weighted')

print('f1_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec: ',f1_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
print('f1_score_prediction_logisticregression_pure_chain_Doc2Vec: ',f1_score_prediction_logisticregression_pure_ClassifierChain_Doc2Vec)
print('f1_score_logisticregression_CC_ClassifierChain_Doc2Vec: ',f1_score_logisticregression_CC_ClassifierChain_Doc2Vec)
print('f1_score_logisticregression_BR_ClassifierChain_Doc2Vec: ',f1_score_logisticregression_BR_ClassifierChain_Doc2Vec)
print('f1_score_logisticregression_MCC_ClassifierChain_Doc2Vec: ',f1_score_logisticregression_MCC_ClassifierChain_Doc2Vec)

f1_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec:  4.57596568791e-05
f1_score_prediction_logisticregression_pure_chain_Doc2Vec:  0.220237363199
f1_score_logisticregression_CC_ClassifierChain_Doc2Vec:  0.161522908015
f1_score_logisticregression_BR_ClassifierChain_Doc2Vec:  0.059404578538
f1_score_logisticregression_MCC_ClassifierChain_Doc2Vec:  0.142731164064


In [53]:
Hamming_loss_prediction_logisticregression_OneVsRestClassifier_Doc2Vec = Hamming_loss(y_test,prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
Hamming_loss_prediction_logisticregression_pure_ClassifierChain_Doc2Vec = Hamming_loss(y_test,predict_logisticregression_ensemble_ClassifierChain_Doc2Vec)
Hamming_loss_logisticregression_CC_ClassifierChain_Doc2Vec = Hamming_loss(y_test,predict_logisticregression_CC_ClassifierChain_Doc2Vec)
Hamming_loss_logisticregression_BR_ClassifierChain_Doc2Vec = Hamming_loss(y_test,predict_logisticregression_BR_ClassifierChain_Doc2Vec)
Hamming_loss_logisticregression_MCC_ClassifierChain_Doc2Vec = Hamming_loss(y_test,predict_logisticregression_MCC_ClassifierChain_Doc2Vec)

print('Hamming_loss_prediction_logisticregression_OneVsRestClassifier_Doc2Vec: ',Hamming_loss_prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
print('Hamming_loss_prediction_logisticregression_pure_chain_Doc2Vec: ',Hamming_loss_prediction_logisticregression_pure_ClassifierChain_Doc2Vec)
print('Hamming_loss_logisticregression_CC_ClassifierChain_Doc2Vec: ',Hamming_loss_logisticregression_CC_ClassifierChain_Doc2Vec)
print('Hamming_loss_logisticregression_BR_ClassifierChain_Doc2Vec: ',Hamming_loss_logisticregression_BR_ClassifierChain_Doc2Vec)
print('Hamming_loss_logisticregression_MCC_ClassifierChain_Doc2Vec: ',Hamming_loss_logisticregression_MCC_ClassifierChain_Doc2Vec)

Hamming_loss_prediction_logisticregression_OneVsRestClassifier_Doc2Vec:  0.0780696428571
Hamming_loss_prediction_logisticregression_pure_chain_Doc2Vec:  0.121964285714
Hamming_loss_logisticregression_CC_ClassifierChain_Doc2Vec:  0.14205
Hamming_loss_logisticregression_BR_ClassifierChain_Doc2Vec:  0.0839267857143
Hamming_loss_logisticregression_MCC_ClassifierChain_Doc2Vec:  0.199630357143


In [54]:
Hamming_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec = Hamming_score(y_test,prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
Hamming_score_prediction_logisticregression_pure_ClassifierChain_Doc2Vec = Hamming_score(y_test,predict_logisticregression_ensemble_ClassifierChain_Doc2Vec)
Hamming_score_logisticregression_CC_ClassifierChain_Doc2Vec = Hamming_score(y_test,predict_logisticregression_CC_ClassifierChain_Doc2Vec)
Hamming_score_logisticregression_BR_ClassifierChain_Doc2Vec = Hamming_score(y_test,predict_logisticregression_BR_ClassifierChain_Doc2Vec)
Hamming_score_logisticregression_MCC_ClassifierChain_Doc2Vec = Hamming_score(y_test,predict_logisticregression_MCC_ClassifierChain_Doc2Vec)

print('Hamming_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec: ',Hamming_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
print('Hamming_score_prediction_logisticregression_pure_chain_Doc2Vec: ',Hamming_score_prediction_logisticregression_pure_ClassifierChain_Doc2Vec)
print('Hamming_score_logisticregression_CC_ClassifierChain_Doc2Vec: ',Hamming_score_logisticregression_CC_ClassifierChain_Doc2Vec)
print('Hamming_score_logisticregression_BR_ClassifierChain_Doc2Vec: ',Hamming_score_logisticregression_BR_ClassifierChain_Doc2Vec)
print('Hamming_score_logisticregression_MCC_ClassifierChain_Doc2Vec: ',Hamming_score_logisticregression_MCC_ClassifierChain_Doc2Vec)

Hamming_score_prediction_logisticregression_OneVsRestClassifier_Doc2Vec:  0.921930357143
Hamming_score_prediction_logisticregression_pure_chain_Doc2Vec:  0.878035714286
Hamming_score_logisticregression_CC_ClassifierChain_Doc2Vec:  0.85795
Hamming_score_logisticregression_BR_ClassifierChain_Doc2Vec:  0.916073214286
Hamming_score_logisticregression_MCC_ClassifierChain_Doc2Vec:  0.800369642857


In [55]:
Exact_match_prediction_logisticregression_OneVsRestClassifier_Doc2Vec = Exact_match(y_test,prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
Exact_match_prediction_logisticregression_pure_ClassifierChain_Doc2Vec = Exact_match(y_test,predict_logisticregression_ensemble_ClassifierChain_Doc2Vec)
Exact_match_logisticregression_CC_ClassifierChain_Doc2Vec = Exact_match(y_test,predict_logisticregression_CC_ClassifierChain_Doc2Vec)
Exact_match_logisticregression_BR_ClassifierChain_Doc2Vec = Exact_match(y_test,predict_logisticregression_BR_ClassifierChain_Doc2Vec)
Exact_match_logisticregression_MCC_ClassifierChain_Doc2Vec = Exact_match(y_test,predict_logisticregression_MCC_ClassifierChain_Doc2Vec)

print('Exact_match_prediction_logisticregression_OneVsRestClassifier_Doc2Vec: ',Exact_match_prediction_logisticregression_OneVsRestClassifier_Doc2Vec)
print('Exact_match_prediction_logisticregression_pure_chain_Doc2Vec: ',Exact_match_prediction_logisticregression_pure_ClassifierChain_Doc2Vec)
print('Exact_match_logisticregression_CC_ClassifierChain_Doc2Vec: ',Exact_match_logisticregression_CC_ClassifierChain_Doc2Vec)
print('Exact_match_logisticregression_BR_ClassifierChain_Doc2Vec: ',Exact_match_logisticregression_BR_ClassifierChain_Doc2Vec)
print('Exact_match_logisticregression_MCC_ClassifierChain_Doc2Vec: ',Exact_match_logisticregression_MCC_ClassifierChain_Doc2Vec)

Exact_match_prediction_logisticregression_OneVsRestClassifier_Doc2Vec:  0.0
Exact_match_prediction_logisticregression_pure_chain_Doc2Vec:  0.0022
Exact_match_logisticregression_CC_ClassifierChain_Doc2Vec:  0.01635
Exact_match_logisticregression_BR_ClassifierChain_Doc2Vec:  0.0074
Exact_match_logisticregression_MCC_ClassifierChain_Doc2Vec:  0.00295


#### 1.4.2 DecisionTreeClassifier

In [56]:
DecisionTreeClassifier_Doc2Vec = DecisionTreeClassifier(min_samples_split=2, random_state=1)
DecisionTreeClassifier_Doc2Vec.fit(x_train,y_train)
save_model(DecisionTreeClassifier_Doc2Vec,'DecisionTreeClassifier_Doc2Vec.sav','Doc2Vec')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [57]:
predict_DecisionTreeClassifier_Doc2Vec = DecisionTreeClassifier_Doc2Vec.predict(x_test)

In [58]:
f1_score_DecisionTreeClassifier_Doc2Vec = f1_score(y_test,predict_DecisionTreeClassifier_Doc2Vec,average='weighted')

print('f1_score_prediction_DecisionTreeClassifier_Doc2Vec: ',f1_score_DecisionTreeClassifier_Doc2Vec)

f1_score_prediction_DecisionTreeClassifier_Doc2Vec:  0.207377540685


In [59]:
Hamming_loss_DecisionTreeClassifier_Doc2Vec = Hamming_loss(y_test,predict_DecisionTreeClassifier_Doc2Vec)

print('Hamming_loss_DecisionTreeClassifier_Doc2Vec: ',Hamming_loss_DecisionTreeClassifier_Doc2Vec)


Hamming_loss_DecisionTreeClassifier_Doc2Vec:  0.120939285714


In [60]:
Hamming_score_DecisionTreeClassifier_Doc2Vec = Hamming_score(y_test,predict_DecisionTreeClassifier_Doc2Vec)

print('Hamming_score_DecisionTreeClassifier_Doc2Vec: ',Hamming_score_DecisionTreeClassifier_Doc2Vec)


Hamming_score_DecisionTreeClassifier_Doc2Vec:  0.879060714286


In [61]:
Exact_match_DecisionTreeClassifier_Doc2Vec = Exact_match(y_test,predict_DecisionTreeClassifier_Doc2Vec)

print('Exact_match_DecisionTreeClassifier_Doc2Vec: ',Exact_match_DecisionTreeClassifier_Doc2Vec)


Exact_match_DecisionTreeClassifier_Doc2Vec:  0.0266


## BPMLL

In [16]:
from keras import backend as K

# bp mll loss function
# y_true, y_pred must be 2D tensors of shape (batch dimension, number of labels)
# y_true must satisfy y_true[i][j] == 1 iff sample i has label j
def bp_mll_loss(y_true, y_pred):
 
    # get true and false labels
    y_i = K.equal(y_true, K.ones_like(y_true))
    y_i_bar = K.not_equal(y_true, K.ones_like(y_true))
    
    # cast to float as keras backend has no logical and
    y_i = K.cast(y_i, dtype='float32')
    y_i_bar = K.cast(y_i_bar, dtype='float32')

    # get indices to check
    truth_matrix = pairwise_and(y_i, y_i_bar)

    # calculate all exp'd differences
    sub_matrix = pairwise_sub(y_pred, y_pred)
    exp_matrix = K.exp(-sub_matrix)

    # check which differences to consider and sum them
    sparse_matrix = exp_matrix * truth_matrix
    sums = K.sum(sparse_matrix, axis=[1,2])

    # get normalizing terms and apply them
    y_i_sizes = K.sum(y_i, axis=1)
    y_i_bar_sizes = K.sum(y_i_bar, axis=1)
    normalizers = y_i_sizes * y_i_bar_sizes
    results = sums / normalizers

    # sum over samples
    return K.sum(results)


# compute pairwise differences between elements of the tensors a and b
def pairwise_sub(a, b):
    column = K.expand_dims(a, 2)
    row = K.expand_dims(b, 1)
    return column - row

# compute pairwise logical and between elements of the tensors a and b
def pairwise_and(a, b):
    column = K.expand_dims(a, 2)
    row = K.expand_dims(b, 1)
    return K.minimum(column, row)


In [17]:
n = x_train.shape[0]
dim_no = x_train.shape[1]
class_no = y_train.shape[1]

In [18]:
from keras.models import Sequential
from keras.layers import Dense, Dropout



# create simple mlp
model = Sequential()
model.add(Dense(300, input_dim=dim_no, activation='relu', kernel_initializer='glorot_uniform'))
model.add(Dropout(0.6))
model.add(Dense(128, activation='relu', kernel_initializer='glorot_uniform'))
model.add(Dropout(0.6))
model.add(Dense(64, activation='relu', kernel_initializer='glorot_uniform'))
model.add(Dense(class_no, activation='sigmoid', kernel_initializer='glorot_uniform'))
model.compile(loss=bp_mll_loss, optimizer='adagrad', metrics=[])

# train a few epochs
model.fit(x_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f977e9ab358>

In [19]:

bpmll_Word2Vec = model.predict(x_test)


#report =classification_report(Y_test, bpmll_Word2Vec)
print (len(bpmll_Word2Vec))

print (bpmll_Word2Vec.shape)

b = bpmll_Word2Vec[2][0]

a = bpmll_Word2Vec

res = []
tmp = []
for j in range(bpmll_Word2Vec.shape[0]):
	for i in range(bpmll_Word2Vec.shape[1]):
		tmp.append(int(bpmll_Word2Vec[j][i]+0.01))
	res.append(tmp)
	tmp = []	

print(bpmll_Word2Vec[2])

print(a)
print(y_test[2])
res = pd.DataFrame(res)
res = res.as_matrix()
print(res,'res')
report =classification_report(y_test, res)
print (report)

40000
(40000, 28)
[3.1791953e-06 2.7951914e-06 2.6871163e-01 9.9998665e-01 5.5849114e-07
 1.6256499e-07 1.0344197e-05 3.1594253e-01 5.4229391e-01 9.9999940e-01
 2.2325020e-01 1.3683565e-06 2.4109389e-01 3.6745474e-03 7.5281230e-07
 9.9998951e-01 4.2138898e-01 5.7031051e-03 1.6633892e-01 2.7855486e-02
 3.8611558e-01 7.5701493e-05 3.3685689e-07 1.0000000e+00 1.4891527e-02
 7.0054483e-07 6.7208472e-05 8.8137954e-02]
[[1.3273411e-05 7.6335245e-06 6.4799851e-01 ... 1.6033935e-06
  2.2616122e-04 1.7970707e-01]
 [4.1749777e-06 2.0389450e-06 2.7775377e-01 ... 1.0875337e-06
  6.9740898e-05 7.9124309e-02]
 [3.1791953e-06 2.7951914e-06 2.6871163e-01 ... 7.0054483e-07
  6.7208472e-05 8.8137954e-02]
 ...
 [5.0294384e-06 3.2310497e-06 6.4135981e-01 ... 5.4245908e-07
  1.0316181e-04 1.4684361e-01]
 [1.0198415e-05 7.7914710e-06 3.6501291e-01 ... 3.0050001e-06
  2.0624149e-04 1.4864203e-01]
 [7.2894146e-04 6.4270879e-04 2.5764889e-01 ... 2.3743801e-04
  5.9386650e-03 2.4242853e-01]]
[0 0 0 0 1 0 0 0 0 

In [20]:
Hamming_loss(y_test,res)

0.13350089285714284

In [21]:
Hamming_score(y_test,res)

0.8664991071428572

In [22]:
Exact_match(y_test,res)

0.000925

In [26]:
f1_score(y_test,res,average='weighted')

0.27176459750501497