# Random Forest Classifier

In [18]:
import pandas as pd
import numpy as np
import sys
sys.path.append("..")
from Functions.UNSW_DF import *

# importing the random forest classifier and evaluation helpers from scikit-learn (sklearn.ensemble / sklearn.model_selection)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Load the pre-split feature matrices and label vectors via the project helper.
X_train, X_test, y_train, y_test = DF_XY()

# Load the preprocessed train/test frames as well.
# NOTE(review): in the visible cells `train`/`test` are only referenced by
# commented-out code - confirm they are still needed.
train, test = DF_preprocessed_traintest()

( 1 ) Reading Preprocessed CSV files..
	 Training dataset loaded..
	 Testing dataset loaded..

( 2 ) Loading done, splitting into X and Y..
	 ( 2.1 ) x_train Shape:  	 (175341, 53)
	 ( 2.2 ) y_train Shape:  	 (175341,)
	 ( 2.3 ) x_test Shape:  	 (82332, 53)
	 ( 2.4 ) y_test Shape:  	 (82332,)
( 3 ) Done!
PS! Import with: x_train, x_test, y_train, y_test = XY_import()
Reading Preprocessed CSV Files..
	 Train Shape:  	 (175341, 54)
	 Test Shape:  	 (82332, 54)
Dataset Loaded!


In [2]:
# Feature matrix / label vector shared by the experiments below.
# The later cells reference `X` and `y`; binding them here keeps the notebook
# runnable from a fresh kernel (Restart & Run All) instead of relying on
# leftover kernel state.
# Only the training split is used, so X_test / y_test remain unseen data.
# NOTE(review): the recorded cross-validation scores look consistent with
# 10 folds of the 175,341-row training split - confirm this is the intended data.
X, y = X_train, y_train

## Creating the classifier

In [None]:
# Hyper-parameters handed to RandomForestClassifier(**params).
params = dict(
    criterion="entropy",      # split-quality measure
    bootstrap=True,           # each tree is fitted on a bootstrap sample
    n_estimators=200,         # number of trees in the forest
    max_depth=50,             # upper bound on tree depth
    min_samples_split=10,     # samples required to split an internal node
    min_samples_leaf=2,       # samples required at a leaf
    n_jobs=-1,                # use all available cores
)

In [None]:
# Define the model. The constructor already applies every entry of `params`,
# so no separate set_params() call is needed afterwards.
model = RandomForestClassifier(**params)

# Fit the model on X / y.
model.fit(X, y)

# Predict on the data the model was fitted on and on the held-out test split,
# so training and test performance can be compared.
y_pred_train = model.predict(X)
y_pred_test = model.predict(X_test)

In [None]:
# Accuracy on the data the model was fitted on versus the test split.
train_accuracy = round(
    metrics.accuracy_score(y, y_pred_train),
    5,
)
test_accuracy = round(
    metrics.accuracy_score(y_test, y_pred_test),
    5,
)

# Remaining test-split metrics, all rounded to five decimals.
f1 = round(
    metrics.f1_score(y_test, y_pred_test),
    5,
)
precision = round(
    metrics.precision_score(y_test, y_pred_test),
    5,
)
recall = round(
    metrics.recall_score(y_test, y_pred_test),
    5,
)

# One tab-aligned summary of every metric.
print(f"Training accuracy: \t{train_accuracy}\nTest accuracy: \t\t{test_accuracy}\nF1-score: \t\t{f1}\nprecision-score: \t{precision}\nrecall-score: \t\t{recall}\n")

## Classifier Experiment 1

In [3]:
# evaluate random forest algorithm for classification
from numpy import mean
from numpy import std
import time

In [4]:
# Forest configuration for experiment 1 (passed as RandomForestClassifier(**params)).
params = {
    "criterion": "entropy",
    "bootstrap": True,
    "n_estimators": 200,
    "max_depth": 50,
    "min_samples_split": 10,
    "min_samples_leaf": 2,
    "n_jobs": -1,
}

In [5]:
# Define the model. The constructor already applies every entry of `params`,
# so no separate set_params() call is needed afterwards.
model = RandomForestClassifier(**params)

# Evaluate with repeated stratified 10-fold CV (3 repeats -> 30 scores);
# the fixed random_state keeps the fold assignment reproducible.
cv = RepeatedStratifiedKFold(n_splits=10,
                             n_repeats=3,
                             random_state=1)

# error_score='raise' makes a failing fit raise instead of recording a score.
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

In [6]:
# Show the raw accuracy of every CV fold (10 splits x 3 repeats = 30 values).
n_scores

array([0.96116339, 0.96030569, 0.96087601, 0.9629862 , 0.95910802,
       0.96161743, 0.96047679, 0.95791035, 0.95916505, 0.96184556,
       0.96287425, 0.96213072, 0.96002053, 0.96030569, 0.95876583,
       0.96247291, 0.95876583, 0.95967834, 0.96047679, 0.96024866,
       0.95939549, 0.96104711, 0.95813847, 0.96207369, 0.95819551,
       0.96041976, 0.96241588, 0.96144633, 0.96144633, 0.96247291])

In [7]:
# Summarise the fold accuracies as "mean (standard deviation)".
print('Accuracy: %.3f (%.3f)' % tuple(stat(n_scores) for stat in (mean, std)))

Accuracy: 0.961 (0.001)


## Classifier Experiment 2

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [None]:
# Baseline forest for experiment 2: library defaults, with only the
# parallelism setting supplied.
model = RandomForestClassifier(
    n_jobs=-1,
)

In [None]:
# 30-fold cross-validation collecting four metrics per fold; train-side
# scores are returned as well so train and test results can be compared.
scores = cross_validate(
    model,
    X,
    y,
    cv=30,
    scoring=('accuracy', 'precision', 'recall', 'f1'),
    return_train_score=True,
    verbose=1,
)

In [None]:
# Show the raw result returned by cross_validate.
scores

In [None]:
# Tabulate the cross-validation results: one column per key of `scores`.
df = pd.DataFrame(scores)
df

# CROSS VAL 3