In [1]:
import numpy as np
import pandas as pd
from feature_extraction import featurize
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, accuracy_score



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GL65\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
print(train_df.shape, test_df.shape)
embedding_type = 'tfidf_glove'


(291, 2) (45, 2)


In [4]:
train_features, test_features, feature_names = featurize(train_df, test_df, embedding_type)

Remove path from text....
Remove words within bracket....
Contraction....
Text Features....
Punctuation....
NGram Count....
Sentiment Scores....
Stopwords Counter....
TFIDF and Glove.....


In [5]:
print(train_features.toarray().shape)
print(test_features.toarray().shape)

(291, 63)
(45, 63)


In [6]:
label_encoder = preprocessing.LabelEncoder()

y_train = label_encoder.fit_transform(train_df.mitre_technique)
y_test = label_encoder.transform(test_df.mitre_technique)

### Evaluation Score 

In [7]:
def model_evaluation(model, test_features):
    #predict labels on test dataset
    y_pred_test = model.predict(test_features)

    average = "weighted"
    print("precision score  test dataset:\t", round(precision_score(y_test,y_pred_test, average=average)*100, 2), "%")
    print("Recall score  test dataset:\t", round(recall_score(y_test,y_pred_test, average=average)*100, 2), "%")
    print("f1 score  test dataset :\t", round(f1_score(y_test,y_pred_test, average=average)*100, 2), "%")
    print("Accuracy score  test dataset :\t", round(accuracy_score(y_test,y_pred_test)*100, 2), "%")


## LR

In [8]:
from sklearn.linear_model import LogisticRegression

In [61]:
# defining parameter range
param_grid = {'C': [0.1, 1, 5, 10, 20, 30],
            'penalty': ['l1', 'l2']}

grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(train_features, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END ...................C=0.1, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END .................C=0.1, penalty=l2;, score=0.119 total time=   0.0s
[CV 2/5] END .................C=0.1, penalty=l2;, score=0.155 total time=   0.0s
[CV 3/5] END .................C=0.1, penalty=l2;, score=0.172 total time=   0.1s
[CV 4/5] END .................C=0.1, penalty=l2;, score=0.190 total time=   0.0s
[CV 5/5] END .................C=0.1, penalty=l2;, score=0.138 total time=   0.0s
[CV 1/5] END .....................C=1, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .....................C=1, penalty=l

In [62]:
# print best parameter after tuning
print(grid.best_params_)

{'C': 20, 'penalty': 'l2'}


In [9]:
#Building Model again with best params
LR_model=LogisticRegression(C=20,penalty="l2",)
LR_model.fit(train_features, y_train)


In [11]:
model_evaluation(LR_model, test_features)

precision score  test dataset:	 55.93 %
Recall score  test dataset:	 62.22 %
f1 score  test dataset :	 57.04 %
Accuracy score  test dataset :	 62.22 %


# SVM

In [12]:
from sklearn.svm import SVC

In [52]:
# defining parameter range
param_grid = {'C': [0.1, 1, 5, 10],
            'gamma': [1, 0.1, 0.01, 0.001],
            'kernel': ['rbf', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search
grid.fit(train_features, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.051 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.052 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.052 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.052 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.052 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.085 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.138 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.103 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.103 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.103 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.051 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf

[CV 3/5] END ....C=5, gamma=0.01, kernel=linear;, score=0.466 total time=   0.0s
[CV 4/5] END ....C=5, gamma=0.01, kernel=linear;, score=0.517 total time=   0.0s
[CV 5/5] END ....C=5, gamma=0.01, kernel=linear;, score=0.552 total time=   0.0s
[CV 1/5] END ......C=5, gamma=0.001, kernel=rbf;, score=0.051 total time=   0.0s
[CV 2/5] END ......C=5, gamma=0.001, kernel=rbf;, score=0.052 total time=   0.0s
[CV 3/5] END ......C=5, gamma=0.001, kernel=rbf;, score=0.052 total time=   0.0s
[CV 4/5] END ......C=5, gamma=0.001, kernel=rbf;, score=0.052 total time=   0.0s
[CV 5/5] END ......C=5, gamma=0.001, kernel=rbf;, score=0.052 total time=   0.0s
[CV 1/5] END ...C=5, gamma=0.001, kernel=linear;, score=0.441 total time=   0.0s
[CV 2/5] END ...C=5, gamma=0.001, kernel=linear;, score=0.466 total time=   0.0s
[CV 3/5] END ...C=5, gamma=0.001, kernel=linear;, score=0.466 total time=   0.0s
[CV 4/5] END ...C=5, gamma=0.001, kernel=linear;, score=0.517 total time=   0.0s
[CV 5/5] END ...C=5, gamma=0

In [53]:
# print best parameter after tuning
print(grid.best_params_)

{'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [13]:
SVM_model = SVC(C=1, gamma=1, kernel='linear')
SVM_model.fit(train_features, y_train)

model_evaluation(SVM_model, test_features)

precision score  test dataset:	 50.74 %
Recall score  test dataset:	 57.78 %
f1 score  test dataset :	 52.44 %
Accuracy score  test dataset :	 57.78 %


# Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

grid = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid, refit = True, verbose = 3)

grid.fit(train_features, y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV 1/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=200;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=1, n_estimators=200;, score=nan tot

[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.190 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.172 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.276 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.169 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.241 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.224 total time=   0.1s
[CV 4/5] END criterion=gini, max_l

[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.224 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.138 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.241 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.220 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.224 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.259 total time=   0.1s
[CV 4/5] END criterion=gini, max_l

[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.155 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.169 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.207 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.172 total time=   0.1s
[CV 4/5] END criterion=gini, max_l

[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.322 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.293 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.259 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.241 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.237 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.276 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.259 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.224 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.271 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.345 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.322 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.310 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.288 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.345 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.310 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.339 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.466 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.448 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.356 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.448 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.373 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.414 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.466 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.322 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.379 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.414 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.466 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.483 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.390 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.448 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.390 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.379 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.424 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.345 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.1s
[CV 4/5] END criterion=gini

[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.1s
[CV 4/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.448 total time=   0.1s
[CV 1/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.407 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.431 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.2s
[CV 4/5] END 

[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.356 total time=   0.2s
[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.431 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.379 total time=   0.2s
[CV 4/5] END 

[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.466 total time=   0.1s
[CV 1/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.458 total time=   0.1s
[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.431 total time=   0.1s
[CV 4/5] END 

[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 4/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 5/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 1/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.339 total time=   0.2s
[CV 2/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.448 total time=   0.1s
[CV 3/5] END criterion=gini, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.1s
[CV 4/5] END 

[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.155 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.155 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.172 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.169 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.172 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.155 total time=   0.2s
[CV 4/5] END 

[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.172 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.138 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.172 total time=   0.0s
[CV 5/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.241 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.186 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.172 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.190 total time=   0.1s
[CV 4/5] END 

[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.138 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.172 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.190 total time=   0.0s
[CV 5/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.190 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.169 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.190 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.207 total time=   0.1s
[CV 4/5] END 

[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.138 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.172 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 5/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.155 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.119 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.155 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=4, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.207 total time=   0.1s
[CV 4/5] END 

[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.288 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.241 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.293 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.241 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.259 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.0s
[CV 5/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.305 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.259 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.310 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.207 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 5/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.293 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.271 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.276 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.328 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.276 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.271 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.241 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=10, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.345 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.305 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.293 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.276 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.339 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.276 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.322 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.345 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.431 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.276 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.0s
[CV 5/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.276 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.390 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.448 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.339 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.397 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.448 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.356 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.379 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.448 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.448 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.390 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.379 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.305 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.379 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=50, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.448 total time=   0.2s
[CV 4/

[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.483 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.492 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.448 total time=

[CV 1/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.390 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.1s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.1s
[CV 4/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.431 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.322 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200;, score=0.362 total time=

[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.414 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.397 total time=   0.0s
[CV 1/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.390 total time=   0.1s
[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.362 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.397 total time=

[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.328 total time=   0.0s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.362 total time=   0.0s
[CV 4/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.379 total time=   0.1s
[CV 5/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.345 total time=   0.1s
[CV 1/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.339 total time=   0.2s
[CV 2/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.310 total time=   0.2s
[CV 3/5] END criterion=entropy, max_leaf_nodes=None, min_samples_leaf=5, min_samples_split=2, n_estimators=200;, score=0.379 total time=

In [19]:
print(grid.best_params_)

{'criterion': 'gini', 'max_leaf_nodes': 50, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 500}


In [15]:
RF_model = RandomForestClassifier(n_estimators=500,
                                  criterion='gini',
                                  min_samples_split=4,
                                  min_samples_leaf=2,
                                  max_leaf_nodes=50
                                 )
RF_model.fit(train_features, y_train)

model_evaluation(RF_model, test_features)

precision score  test dataset:	 30.07 %
Recall score  test dataset:	 42.22 %
f1 score  test dataset :	 33.74 %
Accuracy score  test dataset :	 42.22 %


# XGBooast

In [16]:
from xgboost import XGBClassifier

In [9]:
param_grid = {
    'n_estimators': [200, 500],
    'learning_rate': [0.01,0.05],
    'booster': ['gbtree'],
    'gamma': [0, 0.5, 1],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [0.5, 1],
    'base_score': [0.2, 0.5]
}


grid = GridSearchCV(XGBClassifier(n_jobs=-1), param_grid, refit = True, verbose = 3, cv=KFold(n_splits=3))

grid.fit(train_features, y_train)

Fitting 3 folds for each of 96 candidates, totalling 288 fits
[CV 1/3] END base_score=0.2, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=200, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.2, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=200, reg_alpha=0, reg_lambda=0.5;, score=0.258 total time=   2.1s
[CV 3/3] END base_score=0.2, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=200, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.2, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=200, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.2, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=200, reg_alpha=0, reg_lambda=1;, score=0.247 total time=   2.2s
[CV 3/3] END base_score=0.2, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=200, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.2, boost

[CV 2/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01, n_estimators=200, reg_alpha=0.5, reg_lambda=0.5;, score=0.227 total time=   2.6s
[CV 3/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01, n_estimators=200, reg_alpha=0.5, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01, n_estimators=200, reg_alpha=0.5, reg_lambda=1;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01, n_estimators=200, reg_alpha=0.5, reg_lambda=1;, score=0.237 total time=   2.6s
[CV 3/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01, n_estimators=200, reg_alpha=0.5, reg_lambda=1;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01, n_estimators=500, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.2, booster=gbtree, gamma=0.5, learning_rate=0.01

[CV 2/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, reg_alpha=0, reg_lambda=0.5;, score=0.216 total time=   6.3s
[CV 3/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, reg_alpha=0, reg_lambda=1;, score=0.216 total time=   6.4s
[CV 3/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, reg_alpha=0.5, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.2, booster=gbtree, gamma=1, learning_rate=0.01, n_estimators=500, re

[CV 2/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=500, reg_alpha=0.5, reg_lambda=1;, score=0.206 total time=   6.4s
[CV 3/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.01, n_estimators=500, reg_alpha=0.5, reg_lambda=1;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.05, n_estimators=200, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.05, n_estimators=200, reg_alpha=0, reg_lambda=0.5;, score=0.247 total time=   1.7s
[CV 3/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.05, n_estimators=200, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.05, n_estimators=200, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.5, booster=gbtree, gamma=0, learning_rate=0.05, n_estimators=200, 

[CV 2/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, n_estimators=200, reg_alpha=0, reg_lambda=1;, score=0.227 total time=   2.2s
[CV 3/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, n_estimators=200, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, n_estimators=200, reg_alpha=0.5, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, n_estimators=200, reg_alpha=0.5, reg_lambda=0.5;, score=0.216 total time=   2.5s
[CV 3/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, n_estimators=200, reg_alpha=0.5, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, n_estimators=200, reg_alpha=0.5, reg_lambda=1;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.5, booster=gbtree, gamma=0.5, learning_rate=0.05, 

[CV 2/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=200, reg_alpha=0.5, reg_lambda=1;, score=0.206 total time=   2.6s
[CV 3/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=200, reg_alpha=0.5, reg_lambda=1;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=500, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=500, reg_alpha=0, reg_lambda=0.5;, score=0.237 total time=   6.0s
[CV 3/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=500, reg_alpha=0, reg_lambda=0.5;, score=nan total time=   0.0s
[CV 1/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=500, reg_alpha=0, reg_lambda=1;, score=nan total time=   0.0s
[CV 2/3] END base_score=0.5, booster=gbtree, gamma=1, learning_rate=0.05, n_estimators=500, 

In [18]:
# print best parameter after tuning
print(grid.best_params_)

{'criterion': 'gini', 'max_leaf_nodes': 50, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 500}


In [17]:
XGB_model = XGBClassifier(base_score= 0.2, booster='gbtree', gamma = 0, learning_rate = 0.01, n_estimators= 200,
                          reg_alpha = 0, reg_lambda =  0.5)

XGB_model.fit(train_features, y_train)

In [18]:
model_evaluation(XGB_model, test_features)

precision score  test dataset:	 30.56 %
Recall score  test dataset:	 40.0 %
f1 score  test dataset :	 33.63 %
Accuracy score  test dataset :	 40.0 %


# Final Model
##### Final  model I am chosing LR as I got better accuracy

In [11]:
LR_model=LogisticRegression(C=20,penalty="l2",)
LR_model.fit(train_features, y_train)

In [12]:
#predict labels on test dataset
y_pred_test = LR_model.predict(test_features)

In [13]:
perdicted_mitre_technique = label_encoder.inverse_transform(y_pred_test)

In [14]:
test_df["perdicted_mitre_technique"] = perdicted_mitre_technique
test_df.head()

Unnamed: 0,attack_description,mitre_technique,perdicted_mitre_technique
0,Adversaries may abuse the Windows command she ...,T1059,T1059
1,Adversaries may use Valid Accounts to log into...,T1021,T1021
2,Adversaries may abuse mshta.exe to proxy execu...,T1218,T1218
3,Adversaries may use hidden windows to conceal ...,T1564,T1564
4,Adversaries may attempt to manipulate the name...,T1036,T1548


In [15]:
test_df.to_csv("test.csv", index=False)