# Validation and Training

## Import Packages and Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,hamming_loss
from sklearn.model_selection import ShuffleSplit

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the three TF-IDF training sets (one per preprocessing variant).
train_csv_paths = (
    "../Data/Data_train_full_tfidf.csv",
    "../Data/Data_train_nostopword_tfidf.csv",
    "../Data/Data_train_nostemstop_tfidf.csv",
)
data_train_full, data_train_nostopword, data_train_nostemstop = [
    pd.read_csv(csv_path) for csv_path in train_csv_paths
]

In [3]:
# Load the three "trans" TF-IDF training sets (one per preprocessing variant).
trans_csv_paths = (
    "../Data/Data_train_trans_full_tfidf.csv",
    "../Data/Data_train_trans_nostopword_tfidf.csv",
    "../Data/Data_train_trans_nostemstop_tfidf.csv",
)
data_train_trans_full, data_train_trans_nostopword, data_train_trans_nostemstop = [
    pd.read_csv(csv_path) for csv_path in trans_csv_paths
]

## Features Target Split

In [4]:
# The twelve label columns; every remaining column is used as a feature.
label_columns = [
    'HS', 'Abusive',
    'HS_Individual', 'HS_Group',
    'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other',
    'HS_Weak', 'HS_Moderate', 'HS_Strong',
]

# X_* holds the features, y_* the multi-label targets, per preprocessing variant.
X_train_full = data_train_full.drop(columns=label_columns)
y_train_full = data_train_full[label_columns]

X_train_nostopword = data_train_nostopword.drop(columns=label_columns)
y_train_nostopword = data_train_nostopword[label_columns]

X_train_nostemstop = data_train_nostemstop.drop(columns=label_columns)
y_train_nostemstop = data_train_nostemstop[label_columns]

In [5]:
# The twelve label columns; every remaining column is used as a feature.
trans_label_columns = [
    'HS', 'Abusive',
    'HS_Individual', 'HS_Group',
    'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other',
    'HS_Weak', 'HS_Moderate', 'HS_Strong',
]

# Same feature/target split, applied to the "trans" data sets.
X_train_trans_full = data_train_trans_full.drop(columns=trans_label_columns)
y_train_trans_full = data_train_trans_full[trans_label_columns]

X_train_trans_nostopword = data_train_trans_nostopword.drop(columns=trans_label_columns)
y_train_trans_nostopword = data_train_trans_nostopword[trans_label_columns]

X_train_trans_nostemstop = data_train_trans_nostemstop.drop(columns=trans_label_columns)
y_train_trans_nostemstop = data_train_trans_nostemstop[trans_label_columns]

## Cross-Validation for Hyperparameter Tuning

**1. Classifier Chains**

In [13]:
# Untuned Classifier Chain wrapping a random forest (n_jobs=-1: use all cores).
classifier_CC = ClassifierChain(classifier=RandomForestClassifier(n_jobs=-1))

**2. Label Powerset**

In [11]:
# Untuned Label Powerset wrapping a random forest (n_jobs=-1: use all cores).
classifier_LP = LabelPowerset(classifier=RandomForestClassifier(n_jobs=-1))

**3. Binary Relevance**

In [6]:
# Untuned Binary Relevance wrapping a random forest (n_jobs=-1: use all cores).
classifier_BR = BinaryRelevance(classifier=RandomForestClassifier(n_jobs=-1))

### Random Search Cross Validation

In [8]:
# Search space for the randomized search. The `classifier__` prefix addresses
# the `classifier` parameter of the multi-label wrapper, i.e. the random forest.
n_estimators_options = np.linspace(start=100, stop=1000, num=10).astype(int).tolist()

random_grid = {
    'classifier__n_estimators': n_estimators_options,  # 100, 200, ..., 1000
    'classifier__max_features': ['auto', 'log2'],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False],
}

**1. Classifier Chains**

In [13]:
# Single 90/10 hold-out split; the fixed seed keeps the split the same on every run.
cv_sets = ShuffleSplit(n_splits = 1, test_size = 0.10, random_state = 8)

# Randomized search over `random_grid` for the Classifier Chain.
# random_state pins which 10 candidates are sampled, so the search evaluates the
# same parameter combinations on every run (the splitter above is already seeded).
# NOTE(review): the wrapped random forest is still unseeded, so individual scores
# can vary slightly between runs.
random_search_CC = RandomizedSearchCV(estimator=classifier_CC,
                                   param_distributions=random_grid,
                                   n_iter=10,
                                   scoring=None,  # use the estimator's own score()
                                   cv=cv_sets,
                                   verbose=15,
                                   n_jobs = -1,
                                   random_state=8)

In [14]:
# Run the randomized search on the "full" training set (long-running cell).
random_search_CC.fit(X=X_train_full, y=y_train_full)

Fitting 1 folds for each of 10 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 27.2min remaining: 40.8min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 54.0min remaining: 54.0min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 63.8min remaining: 42.5min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 74.9min remaining: 32.1min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed: 77.4min remaining: 19.4min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 103.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 103.0min finished


RandomizedSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
                   error_score='raise-deprecating',
                   estimator=ClassifierChain(classifier=RandomForestClassifier(bootstrap=True,
                                                                               class_weight=None,
                                                                               criterion='gini',
                                                                               max_depth=None,
                                                                               max_features='auto',
                                                                               max_leaf_nodes=None,
                                                                               min_impurity_decrease=0.0,
                                                                               min_impurity_split=None,
                                                         

In [15]:
# Report the winning parameters, then one line per sampled candidate:
# mean test score, twice the std across splits, and the parameter set.
print("Best Params : ", random_search_CC.best_params_)
print()

cv_results = random_search_CC.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

for params, mean, std in zip(cv_results['params'], means, stds):
    spread = std * 2
    print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

Best Params :  {'classifier__n_estimators': 700, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'auto', 'classifier__bootstrap': False}

0.625 (+/-0.000) for {'classifier__n_estimators': 100, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'auto', 'classifier__bootstrap': False}
0.629 (+/-0.000) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'auto', 'classifier__bootstrap': True}
0.650 (+/-0.000) for {'classifier__n_estimators': 800, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'auto', 'classifier__bootstrap': True}
0.568 (+/-0.000) for {'classifier__n_estimators': 300, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'log2', 'classifier__bootstrap': True}
0.626 (+/-0.000) for {'classifier__n_

**2. Label Powerset**

In [17]:
# Single 90/10 hold-out split; the fixed seed keeps the split the same on every run.
cv_sets = ShuffleSplit(n_splits = 1, test_size = 0.10, random_state = 8)

# Randomized search over `random_grid` for the Label Powerset model.
# random_state pins which 10 candidates are sampled, so the search evaluates the
# same parameter combinations on every run (the splitter above is already seeded).
# NOTE(review): the wrapped random forest is still unseeded, so individual scores
# can vary slightly between runs.
random_search_LP = RandomizedSearchCV(estimator=classifier_LP,
                                   param_distributions=random_grid,
                                   n_iter=10,
                                   scoring=None,  # use the estimator's own score()
                                   cv=cv_sets,
                                   verbose=15,
                                   n_jobs = -1,
                                   random_state=8)

In [18]:
# Run the randomized search on the "full" training set (long-running cell).
random_search_LP.fit(X=X_train_full, y=y_train_full)

Fitting 1 folds for each of 10 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 21.9min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 35.9min remaining: 53.9min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 56.7min remaining: 56.7min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 57.0min remaining: 38.0min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 57.5min remaining: 24.6min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed: 70.2min remaining: 17.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 84.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 84.2min finished


RandomizedSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
                   error_score='raise-deprecating',
                   estimator=LabelPowerset(classifier=RandomForestClassifier(bootstrap=True,
                                                                             class_weight=None,
                                                                             criterion='gini',
                                                                             max_depth=None,
                                                                             max_features='auto',
                                                                             max_leaf_nodes=None,
                                                                             min_impurity_decrease=0.0,
                                                                             min_impurity_split=None,
                                                                         

In [19]:
# Report the winning parameters, then one line per sampled candidate:
# mean test score, twice the std across splits, and the parameter set.
print("Best Params : ", random_search_LP.best_params_)
print()

cv_results = random_search_LP.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

for params, mean, std in zip(cv_results['params'], means, stds):
    spread = std * 2
    print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

Best Params :  {'classifier__n_estimators': 500, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'auto', 'classifier__bootstrap': False}

0.629 (+/-0.000) for {'classifier__n_estimators': 400, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'auto', 'classifier__bootstrap': False}
0.671 (+/-0.000) for {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__bootstrap': False}
0.665 (+/-0.000) for {'classifier__n_estimators': 600, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'log2', 'classifier__bootstrap': True}
0.499 (+/-0.000) for {'classifier__n_estimators': 200, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'log2', 'classifier__bootstrap': True}
0.632 (+/-0.000) for {'classifier

### Grid Search Cross Validation

In [12]:
# Grids for the follow-up grid search. Each grid fixes every parameter except the
# number of trees, which is swept over 600, 700, 800, 900, 1000.

# Classifier Chain
param_grid_CC = {
    'classifier__n_estimators': np.linspace(start=600, stop=1000, num=5).astype(int).tolist(),
    'classifier__max_features': ['auto'],
    'classifier__min_samples_split': [5],
    'classifier__min_samples_leaf': [1],
    'classifier__bootstrap': [False],
}

# Label Powerset
param_grid_LP = {
    'classifier__n_estimators': np.linspace(start=600, stop=1000, num=5).astype(int).tolist(),
    'classifier__max_features': ['auto'],
    'classifier__min_samples_split': [10],
    'classifier__min_samples_leaf': [1],
    'classifier__bootstrap': [False],
}

# Binary Relevance
param_grid_BR = {
    'classifier__n_estimators': np.linspace(start=600, stop=1000, num=5).astype(int).tolist(),
    'classifier__max_features': ['log2'],
    'classifier__min_samples_split': [5],
    'classifier__min_samples_leaf': [1],
    'classifier__bootstrap': [True],
}

**1. Classifier Chains**

In [13]:
# One seeded 90/10 hold-out split, shared by every grid candidate.
cv_sets = ShuffleSplit(n_splits=1, test_size=.10, random_state=8)

# Exhaustive search over param_grid_CC for the Classifier Chain.
grid_search_CC = GridSearchCV(
    classifier_CC,
    param_grid_CC,
    cv=cv_sets,
    n_jobs=-1,
    verbose=15,
)

# Long-running; the fitted search object is the cell's displayed value.
grid_search_CC.fit(X_train_full, y_train_full)

Fitting 1 folds for each of 5 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 90.1min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 104.7min remaining: 157.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 115.1min remaining: 76.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 147.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 147.9min finished


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
             error_score='raise-deprecating',
             estimator=ClassifierChain(classifier=RandomForestClassifier(bootstrap=True,
                                                                         class_weight=None,
                                                                         criterion='gini',
                                                                         max_depth=None,
                                                                         max_features='auto',
                                                                         max_leaf_nodes=None,
                                                                         min_impurity_decrease=0.0,
                                                                         min_impurity_split=None,
                                                                         min_samples_leaf=...
                       

In [14]:
# Report the winning parameters, then one line per grid candidate:
# mean test score, twice the std across splits, and the parameter set.
print("Best Params : ", grid_search_CC.best_params_)
print()

cv_results = grid_search_CC.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

for params, mean, std in zip(cv_results['params'], means, stds):
    spread = std * 2
    print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

Best Params :  {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 600}

0.673 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 600}
0.666 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 700}
0.669 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 800}
0.668 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 900}
0.666 (+/-0.000) for {'classifier__

**2. Label Powerset**

In [8]:
# One seeded 90/10 hold-out split, shared by every grid candidate.
cv_sets = ShuffleSplit(n_splits=1, test_size=.10, random_state=8)

# Exhaustive search over param_grid_LP for the Label Powerset model.
grid_search_LP = GridSearchCV(
    classifier_LP,
    param_grid_LP,
    cv=cv_sets,
    n_jobs=-1,
    verbose=15,
)

# Long-running; the fitted search object is the cell's displayed value.
grid_search_LP.fit(X_train_full, y_train_full)

Fitting 1 folds for each of 5 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 49.5min remaining: 74.2min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 55.3min remaining: 36.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 67.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 67.7min finished


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=8, test_size=0.1, train_size=None),
             error_score='raise-deprecating',
             estimator=LabelPowerset(classifier=RandomForestClassifier(bootstrap=True,
                                                                       class_weight=None,
                                                                       criterion='gini',
                                                                       max_depth=None,
                                                                       max_features='auto',
                                                                       max_leaf_nodes=None,
                                                                       min_impurity_decrease=0.0,
                                                                       min_impurity_split=None,
                                                                       min_samples_leaf=1,...
                                       

In [9]:
# Report the winning parameters, then one line per grid candidate:
# mean test score, twice the std across splits, and the parameter set.
print("Best Params : ", grid_search_LP.best_params_)
print()

cv_results = grid_search_LP.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

for params, mean, std in zip(cv_results['params'], means, stds):
    spread = std * 2
    print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

Best Params :  {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 800}

0.677 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 600}
0.675 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 700}
0.678 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 800}
0.676 (+/-0.000) for {'classifier__bootstrap': False, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 900}
0.675 (+/-0.000) for {'classif

## Classification

In [15]:
# Rebind classifier_CC to the estimator selected by the grid search; this
# replaces the untuned Classifier Chain created earlier in the notebook.
classifier_CC = grid_search_CC.best_estimator_

In [14]:
# Display the selected Classifier Chain to inspect its hyperparameters.
classifier_CC

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [10]:
# Rebind classifier_LP to the estimator selected by the grid search; this
# replaces the untuned Label Powerset model created earlier in the notebook.
classifier_LP = grid_search_LP.best_estimator_

In [15]:
# Display the selected Label Powerset model to inspect its hyperparameters.
classifier_LP

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [27]:
# for best scenario only
# NOTE(review): `grid_search_BR` is not created anywhere in the visible part of
# this notebook (only the CC and LP grid searches are shown), so on a fresh
# kernel this cell raises NameError unless a BR grid search is run first — confirm.
classifier_BR = grid_search_BR.best_estimator_

In [28]:
# Display the selected Binary Relevance model to inspect its hyperparameters.
classifier_BR

BinaryRelevance(classifier=RandomForestClassifier(bootstrap=True,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='log2',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=800, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, v

**1. Classifier Chains**

In [8]:
# Train the Classifier Chain on the "full" TF-IDF features. The same estimator
# object is refit for every data variant, so it is pickled right after each fit.
classifier_CC.fit(X=X_train_full, y=y_train_full)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [9]:
# save the model to disk
filename = '../Model/CC_model_full.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_CC, model_file)

In [10]:
# Refit the same Classifier Chain object on the "nostopword" TF-IDF features.
classifier_CC.fit(X=X_train_nostopword, y=y_train_nostopword)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [11]:
# save the model to disk
filename = '../Model/CC_model_nostopword.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_CC, model_file)

In [12]:
# Refit the same Classifier Chain object on the "nostemstop" TF-IDF features.
classifier_CC.fit(X=X_train_nostemstop, y=y_train_nostemstop)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [13]:
# save the model to disk
filename = '../Model/CC_model_nostemstop.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_CC, model_file)

In [16]:
# Refit the same Classifier Chain object on the "trans_full" TF-IDF features.
classifier_CC.fit(X=X_train_trans_full, y=y_train_trans_full)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [17]:
# save the model to disk
filename = '../Model/CC_model_trans_full.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_CC, model_file)

In [18]:
# Refit the same Classifier Chain object on the "trans_nostopword" TF-IDF features.
classifier_CC.fit(X=X_train_trans_nostopword, y=y_train_trans_nostopword)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [19]:
# save the model to disk
filename = '../Model/CC_model_trans_nostopword.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_CC, model_file)

In [20]:
# Refit the same Classifier Chain object on the "trans_nostemstop" TF-IDF features.
classifier_CC.fit(X=X_train_trans_nostemstop, y=y_train_trans_nostemstop)

ClassifierChain(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [21]:
# save the model to disk
filename = '../Model/CC_model_trans_nostemstop.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_CC, model_file)

**2. Label Powerset**

In [5]:
# Train the Label Powerset model on the "full" TF-IDF features. The same estimator
# object is refit for every data variant, so it is pickled right after each fit.
classifier_LP.fit(X=X_train_full, y=y_train_full)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [6]:
# save the model to disk
filename = '../Model/LP_model_full.sav'
# Context manager: the file handle is flushed and closed as soon as the dump
# finishes (or fails), instead of being left open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_LP, model_file)

In [7]:
# Refit the same Label Powerset object on the "nostopword" TF-IDF features.
classifier_LP.fit(X=X_train_nostopword, y=y_train_nostopword)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [8]:
# Serialize the current `classifier_LP` (fitted on X_train_nostopword by the
# preceding cell) to disk so it can be reloaded without retraining.
filename = '../Model/LP_model_nostopword.sav'
# Open via a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_LP, model_file)

In [12]:
# Train the Label Powerset classifier on the "nostemstop" feature/target split.
# NOTE(review): the recorded output for this cell reports n_estimators=600 and
# min_samples_split=5, unlike the other LP fits (800 / 10) -- classifier_LP was
# presumably re-created with different hyperparameters before this run; confirm.
classifier_LP.fit(X=X_train_nostemstop, y=y_train_nostemstop)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=5,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=600, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                 

In [13]:
# Serialize the current `classifier_LP` (fitted on X_train_nostemstop by the
# preceding cell) to disk so it can be reloaded without retraining.
filename = '../Model/LP_model_nostemstop.sav'
# Open via a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_LP, model_file)

In [22]:
# Train the Label Powerset classifier on the "trans_full" feature/target split.
# The call is the cell's last expression, so the fitted estimator's repr is shown.
classifier_LP.fit(X=X_train_trans_full, y=y_train_trans_full)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [23]:
# Serialize the current `classifier_LP` (fitted on X_train_trans_full by the
# preceding cell) to disk so it can be reloaded without retraining.
filename = '../Model/LP_model_trans_full.sav'
# Open via a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_LP, model_file)

In [24]:
# Train the Label Powerset classifier on the "trans_nostopword" feature/target split.
# The call is the cell's last expression, so the fitted estimator's repr is shown.
classifier_LP.fit(X=X_train_trans_nostopword, y=y_train_trans_nostopword)

LabelPowerset(classifier=RandomForestClassifier(bootstrap=False,
                                                class_weight=None,
                                                criterion='gini',
                                                max_depth=None,
                                                max_features='auto',
                                                max_leaf_nodes=None,
                                                min_impurity_decrease=0.0,
                                                min_impurity_split=None,
                                                min_samples_leaf=1,
                                                min_samples_split=10,
                                                min_weight_fraction_leaf=0.0,
                                                n_estimators=800, n_jobs=-1,
                                                oob_score=False,
                                                random_state=None, verbose=0,
                

In [25]:
# Serialize the current `classifier_LP` (fitted on X_train_trans_nostopword by
# the preceding cell) to disk so it can be reloaded without retraining.
filename = '../Model/LP_model_trans_nostopword.sav'
# Open via a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_LP, model_file)

In [None]:
# Train the Label Powerset classifier on the "trans_nostemstop" feature/target split.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# appears never to have been run -- confirm before relying on its model.
classifier_LP.fit(X=X_train_trans_nostemstop, y=y_train_trans_nostemstop)

In [None]:
# Serialize the current `classifier_LP` to disk so it can be reloaded without
# retraining.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# target file may not have been written by this run -- confirm it exists.
filename = '../Model/LP_model_trans_nostemstop.sav'
# Open via a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_LP, model_file)

**3. Binary Relevance (for best scenario only)**

In [7]:
# Train the Binary Relevance classifier on the "nostemstop" feature/target split.
# The call is the cell's last expression, so the fitted estimator's repr is shown.
classifier_BR.fit(X=X_train_nostemstop, y=y_train_nostemstop)

BinaryRelevance(classifier=RandomForestClassifier(bootstrap=False,
                                                  class_weight=None,
                                                  criterion='gini',
                                                  max_depth=None,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=5,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=600, n_jobs=-1,
                                                  oob_score=False,
                                                  random_state=None, 

In [8]:
# Serialize the current `classifier_BR` (fitted on X_train_nostemstop by the
# preceding cell) to disk so it can be reloaded without retraining.
filename = '../Model/BR_model_nostemstop.sav'
# Open via a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_BR, model_file)