# Validation and Training

## Import Packages and Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,hamming_loss
from sklearn.model_selection import ShuffleSplit

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

import pickle

import warnings
warnings.filterwarnings('ignore')

In [3]:
data_train = pd.read_csv("../Data/Data_train_full_tfidf.csv")
data_train.head()

Unnamed: 0,ab,aba,abal,abang,ac,acara,acid,actually,adil,adipati,...,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,1,0,0


In [4]:
data_test = pd.read_csv("../Data/Data_test_full_tfidf.csv")
data_test.head()

Unnamed: 0,ab,aba,abal,abang,ac,acara,acid,actually,adil,adipati,...,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Features Target Split

In [5]:
X_train = data_train.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_train = data_train[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

In [6]:
X_test = data_test.drop(['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong'],axis=1)
y_test = data_test[['HS','Abusive','HS_Individual','HS_Group','HS_Religion','HS_Race','HS_Physical','HS_Gender','HS_Other','HS_Weak','HS_Moderate','HS_Strong']]

## Cross-Validation for Hyperparameter Tuning

**1. Binary Relevance**

In [7]:
classifier_BR = BinaryRelevance(RandomForestClassifier())

**2. Classifier Chains**

In [8]:
classifier_CC = ClassifierChain(RandomForestClassifier())

**3. Label Powerset**

In [9]:
classifier_LP = LabelPowerset(RandomForestClassifier())

### Random Search Cross Validation

In [10]:
random_grid = {'classifier__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)],
               'classifier__max_features': ['auto', 'log2', None],
               'classifier__max_depth': [20, 40, 60, 80, 100, None],
               'classifier__min_samples_split': [2, 5, 10],
               'classifier__min_samples_leaf': [1, 2, 4],
               'classifier__bootstrap': [True, False]
                     }

**1. Binary Relevance**

In [10]:
random_search_BR = RandomizedSearchCV(estimator=classifier_BR,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring=None,
                                   cv=3, 
                                   verbose=15, 
                                   random_state=8,
                                   n_jobs = 2)

In [None]:
random_search_BR.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed: 32.2min
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed: 32.6min
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed: 65.4min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed: 137.6min
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed: 173.3min
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed: 244.8min
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed: 250.9min
[Parallel(n_jobs=2)]: Done   8 tasks      | elapsed: 323.0min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 328.8min
[Parallel(n_jobs=2)]: Done  10 tasks      | elapsed: 328.9min
[Parallel(n_jobs=2)]: Done  11 tasks      | elapsed: 334.4min
[Parallel(n_jobs=2)]: Done  12 tasks      | elapsed: 334.6min
[Parallel(n_jobs=2)]: Done  13 tasks      | elapsed: 337.0min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 337.2min
[Parallel(n_jobs=2)]: Done  15 tasks      | elapsed: 339.8mi

In [12]:
random_search_BR.best_params_

{'classifier__n_estimators': 1000,
 'classifier__min_samples_split': 2,
 'classifier__min_samples_leaf': 1,
 'classifier__max_features': 'log2',
 'classifier__max_depth': 60,
 'classifier__bootstrap': False}

In [13]:
print("Best Params : ",random_search_BR.best_params_)
print()
means = random_search_BR.cv_results_['mean_test_score']
stds = random_search_BR.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, random_search_BR.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__max_depth': 60, 'classifier__bootstrap': False}

0.521 (+/-0.003) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__max_depth': None, 'classifier__bootstrap': True}
0.539 (+/-0.005) for {'classifier__n_estimators': 600, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__max_depth': 60, 'classifier__bootstrap': False}
0.535 (+/-0.004) for {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__max_depth': 40, 'classifier__bootstrap': True}
0.506 (+/-0.011) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_sam

**2. Classifier Chains**

In [26]:
random_search_CC = RandomizedSearchCV(estimator=classifier_CC,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring=None,
                                   cv=3, 
                                   verbose=15,
                                   n_jobs = -1,
                                   random_state=8)

In [27]:
random_search_CC.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed: 10

[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed: 368.4min
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 368.5min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 381.8min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 393.2min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 395.5min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 405.8min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 406.2min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 406.4min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed: 406.4min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed: 417.2min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 417.4min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 417.8min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed: 418.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 445.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=ClassifierChain(classifier=RandomForestClassifier(bootstrap=True,
                                                                               class_weight=None,
                                                                               criterion='gini',
                                                                               max_depth=None,
                                                                               max_features='auto',
                                                                               max_leaf_nodes=None,
                                                                               min_impurity_decrease=0.0,
                                                                               min_impurity_split=None,
                                                                               min_samples_leaf=1,
                                                

In [28]:
print("Best Params : ",random_search_CC.best_params_)
print()
means = random_search_CC.cv_results_['mean_test_score']
stds = random_search_CC.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, random_search_CC.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'auto', 'classifier__max_depth': 80, 'classifier__bootstrap': True}

0.603 (+/-0.016) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__max_depth': None, 'classifier__bootstrap': True}
0.613 (+/-0.012) for {'classifier__n_estimators': 600, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__max_depth': 60, 'classifier__bootstrap': False}
0.610 (+/-0.015) for {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__max_depth': 40, 'classifier__bootstrap': True}
0.597 (+/-0.009) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samp

**3. Label Powerset**

In [11]:
random_search_LP = RandomizedSearchCV(estimator=classifier_LP,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring=None,
                                   cv=3, 
                                   verbose=15,
                                   n_jobs = -1,
                                   random_state=8
                                     )

In [12]:
random_search_LP.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  6

[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed: 720.0min
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 721.2min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 742.2min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 763.1min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 783.4min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 793.4min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 802.2min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 808.1min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed: 811.2min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed: 817.0min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 827.0min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 827.1min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed: 827.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 858.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=LabelPowerset(classifier=RandomForestClassifier(bootstrap=True,
                                                                             class_weight=None,
                                                                             criterion='gini',
                                                                             max_depth=None,
                                                                             max_features='auto',
                                                                             max_leaf_nodes=None,
                                                                             min_impurity_decrease=0.0,
                                                                             min_impurity_split=None,
                                                                             min_samples_leaf=1,
                                                                  

In [13]:
print("Best Params : ",random_search_LP.best_params_)
print()
means = random_search_LP.cv_results_['mean_test_score']
stds = random_search_LP.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, random_search_LP.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__n_estimators': 600, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'auto', 'classifier__max_depth': None, 'classifier__bootstrap': True}

0.608 (+/-0.019) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__max_depth': None, 'classifier__bootstrap': True}
0.586 (+/-0.015) for {'classifier__n_estimators': 600, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__max_depth': 60, 'classifier__bootstrap': False}
0.552 (+/-0.017) for {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__max_depth': 40, 'classifier__bootstrap': True}
0.447 (+/-0.020) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_sa

In [46]:
# classifier_BR.get_params().keys()

### Grid Search Cross Validation

In [14]:
param_grid_BR = {'classifier__n_estimators': [int(x) for x in np.linspace(start = 50, stop = 1000, num = 20)],
               'classifier__max_features': ['log2'],
               'classifier__min_samples_split': [2],
               'classifier__min_samples_leaf': [1],
               'classifier__bootstrap': [True]
                     }

param_grid_CC = {'classifier__n_estimators': [int(x) for x in np.linspace(start = 50, stop = 1000, num = 20)],
               'classifier__max_features': ['auto'],
               'classifier__min_samples_split': [5],
               'classifier__min_samples_leaf': [1],
               'classifier__bootstrap': [True]
                     }

param_grid_LP = {'classifier__n_estimators': [int(x) for x in np.linspace(start = 50, stop = 1000, num = 20)],
               'classifier__max_features': ['auto'],
               'classifier__min_samples_split': [10],
               'classifier__min_samples_leaf': [1],
               'classifier__bootstrap': [True]
                     }

**1. Binary Relevance**

In [15]:
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

grid_search_BR = GridSearchCV(estimator=classifier_BR, 
                           param_grid=param_grid_BR,
                           cv=cv_sets,
                           n_jobs = -1,
                           verbose=15)

grid_search_BR.fit(X_train,y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   40.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  7

GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=BinaryRelevance(classifier=RandomForestClassifier(bootstrap=True,
                                                                         class_weight=None,
                                                                         criterion='gini',
                                                                         max_depth=None,
                                                                         max_features='auto',
                                                                         max_leaf_nodes=None,
                                                                         min_impurity_decrease=0.0,
                                                                         min_impurity_split=None,
                                                                         min_samples_leaf...
             iid='warn'

In [16]:
print("Best Params : ",grid_search_BR.best_params_)
print()
means = grid_search_BR.cv_results_['mean_test_score']
stds = grid_search_BR.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, grid_search_BR.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__bootstrap': True, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 1000}

0.508 (+/-0.005) for {'classifier__bootstrap': True, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
0.516 (+/-0.003) for {'classifier__bootstrap': True, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
0.518 (+/-0.003) for {'classifier__bootstrap': True, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 150}
0.519 (+/-0.004) for {'classifier__bootstrap': True, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
0.521 (+/-0.010) for {'classifier__boots

**2. Classifier Chains**

In [31]:
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

grid_search_CC = GridSearchCV(estimator=classifier_CC, 
                           param_grid=param_grid_CC,
                           cv=cv_sets,
                           n_jobs = -1,
                           verbose=15)

grid_search_CC.fit(X_train,y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  3

GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=ClassifierChain(classifier=RandomForestClassifier(bootstrap=True,
                                                                         class_weight=None,
                                                                         criterion='gini',
                                                                         max_depth=None,
                                                                         max_features='auto',
                                                                         max_leaf_nodes=None,
                                                                         min_impurity_decrease=0.0,
                                                                         min_impurity_split=None,
                                                                         min_samples_leaf...
             iid='warn'

In [32]:
print("Best Params : ",grid_search_CC.best_params_)
print()
means = grid_search_CC.cv_results_['mean_test_score']
stds = grid_search_CC.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, grid_search_CC.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 850}

0.594 (+/-0.018) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}
0.596 (+/-0.013) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
0.598 (+/-0.015) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 150}
0.598 (+/-0.015) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
0.598 (+/-0.019) for {'classifier__bootst

**3. Label Powerset**

In [15]:
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

grid_search_LP = GridSearchCV(estimator=classifier_LP, 
                           param_grid=param_grid_LP,
                           cv=cv_sets,
                           n_jobs = -1,
                           verbose=15)

grid_search_LP.fit(X_train,y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  3

GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=8, test_size=0.33, train_size=None),
             error_score='raise-deprecating',
             estimator=LabelPowerset(classifier=RandomForestClassifier(bootstrap=True,
                                                                       class_weight=None,
                                                                       criterion='gini',
                                                                       max_depth=None,
                                                                       max_features='auto',
                                                                       max_leaf_nodes=None,
                                                                       min_impurity_decrease=0.0,
                                                                       min_impurity_split=None,
                                                                       min_samples_leaf=1...
             iid='warn', n_jobs=-1,
   

In [16]:
print("Best Params : ",grid_search_LP.best_params_)
print()
means = grid_search_LP.cv_results_['mean_test_score']
stds = grid_search_LP.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, grid_search_LP.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

Best Params :  {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 950}

0.617 (+/-0.008) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 50}
0.619 (+/-0.011) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
0.619 (+/-0.015) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 150}
0.620 (+/-0.006) for {'classifier__bootstrap': True, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
0.618 (+/-0.008) for {'classifier__b

## Classification

In [17]:
classifier_BR = grid_search_BR.best_estimator_

In [33]:
classifier_CC = grid_search_CC.best_estimator_

In [17]:
classifier_LP = grid_search_LP.best_estimator_

**1. Binary Relevance**

In [18]:
classifier_BR.fit(X_train,y_train)

BinaryRelevance(classifier=RandomForestClassifier(bootstrap=True,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='log2',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=1000,
                                                  n_jobs=None, oob_score=False,
                                                  random_state=None

In [23]:
# save the model to disk
filename = '../Model/BR_model.sav'
pickle.dump(classifier_BR, open(filename, 'wb'))

**2. Classifier Chains**

In [34]:
classifier_CC.fit(X_train,y_train)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=True,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=850, n_jobs=None,
                                                  oob_score=False,
                                                  random_state=None,

In [37]:
# save the model to disk
filename = '../Model/CC_model.sav'
pickle.dump(classifier_CC, open(filename, 'wb'))

**3. Label Powerset**

In [18]:
classifier_LP.fit(X_train,y_train)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=True,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=950, n_jobs=None,
                                                oob_score=False,
                                                random_state=None, verbose=0,
               

In [19]:
# save the model to disk
filename = '../Model/LP_model_MI.sav'
pickle.dump(classifier_LP, open(filename, 'wb'))