In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearnex import patch_sklearn, unpatch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## UNSW-NB15

In [3]:
# Load Data
train_raw = pd.read_csv('../Data/UNSW_NB15/train.csv')
print(train_raw.shape)
test_raw = pd.read_csv('../Data/UNSW_NB15/test.csv')
print(test_raw.shape)
# Seperate label and drop unnecessary features
train_X = train_raw.drop(['id', 'attack_cat', 'label'], axis=1).select_dtypes(include='number')
print(train_X.shape)
train_Y = train_raw['label']
test_X = test_raw.drop(['id', 'attack_cat', 'label'], axis=1).select_dtypes(include='number')
print(test_X.shape)
test_Y = test_raw['label']

(82332, 45)
(175341, 45)
(82332, 39)
(175341, 39)


In [4]:
# Normalize data with min, max of training data
test_X1 = (test_X - train_X.min(axis=0)) / (train_X.max(axis=0) - train_X.min(axis=0))
train_X1 = (train_X - train_X.min(axis=0)) / (train_X.max(axis=0) - train_X.min(axis=0))

test_X1[test_X1 < 0] = 0
test_X1[test_X1 > 1] = 1

train_X1.fillna(0, inplace=True)
test_X1.fillna(0, inplace=True)

del train_X, test_X

## Random Forest

In [9]:
# hyperparameters to search
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

grid = dict(n_estimators=n_estimators)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X1, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.980176 using {'n_estimators': 2000} with all features


KeyboardInterrupt: 

In [11]:
# hyperparameters to search
max_features = ['auto', 'sqrt']

grid = dict(max_features=max_features)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=2000)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X1, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.980176 using {'max_features': 'auto'} with all features


In [12]:
# hyperparameters to search
max_features = grid_result.best_params_['max_features']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]

grid = dict(max_depth=max_depth)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=2000, max_features=max_features)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X1, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.980176 using {'max_depth': 60} with all features


In [13]:
# hyperparameters to search
max_depth = grid_result.best_params_['max_depth']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

grid = dict(min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=2000, max_features=max_features, max_depth=max_depth)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X1, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.980176 using {'min_samples_leaf': 1, 'min_samples_split': 2} with all features


In [14]:
min_samples_split = grid_result.best_params_['min_samples_split']
min_samples_leaf = grid_result.best_params_['min_samples_leaf']
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, 
                                  n_estimators=2000, max_features=max_features, max_depth=max_depth, 
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
model_rf.fit(train_X1, train_Y)
predict = model_rf.predict(test_X1)
print(f1_score(test_Y, predict))

0.9198814650348711


## Correlation Selection

In [6]:
# correlation based feature selection
corr = train_X1.corr().abs()

correlation_threshold = 0.8
corr.values[np.tril_indices_from(corr.values)] = np.nan
redundant = []
for j in corr.columns:
    for i in corr.index:
        if corr.loc[i, j] > correlation_threshold:
            redundant.append((i, j))

train_corr = train_X1.copy()
train_corr['Label'] = train_Y
corr2 = train_corr.corr().abs()

corr3 = corr2['Label'].iloc[:-1].copy()
drop = []

# drop features having lower correlation with label
for i, j in redundant:
    if corr3[i] > corr3[j]:
        if j not in drop:
            drop.append(j)
    elif i not in drop:
        drop.append(i)
print(drop)

del corr, train_corr, corr2, corr3

train_X2 = train_X1.drop(drop, axis=1)
print(train_X2.shape)
test_X2 = test_X1.drop(drop, axis=1)
print(test_X2.shape)

['sbytes', 'dbytes', 'sloss', 'dloss', 'dwin', 'tcprtt', 'ackdat', 'ct_dst_ltm', 'ct_srv_src', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']
(82332, 24)
(175341, 24)


In [17]:
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
grid = dict(n_estimators=n_estimators)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

n_estimators = grid_result.best_params_['n_estimators']
max_features = ['auto', 'sqrt']
grid = dict(max_features=max_features)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

max_features = grid_result.best_params_['max_features']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]

grid = dict(max_depth=max_depth)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=n_estimators, max_features=max_features)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

max_depth = grid_result.best_params_['max_depth']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

grid = dict(min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

min_samples_split = grid_result.best_params_['min_samples_split']
min_samples_leaf = grid_result.best_params_['min_samples_leaf']
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, 
                                  n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, 
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
model_rf.fit(train_X2, train_Y)
predict = model_rf.predict(test_X2)
print(f1_score(test_Y, predict))

Best: 0.944857 using {'n_estimators': 2000} with all features
Best: 0.944857 using {'max_features': 'auto'} with all features
Best: 0.944896 using {'max_depth': 40} with all features


In [8]:
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

grid = dict(min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=2000, max_features='auto', max_depth=40)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

min_samples_split = grid_result.best_params_['min_samples_split']
min_samples_leaf = grid_result.best_params_['min_samples_leaf']
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, 
                                  n_estimators=2000, max_features='auto', max_depth=40, 
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
model_rf.fit(train_X2, train_Y)
predict = model_rf.predict(test_X2)
print(f1_score(test_Y, predict))

Best: 0.944966 using {'min_samples_leaf': 1, 'min_samples_split': 2} with all features
0.9289842632331903


In [None]:
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
grid = dict(n_estimators=n_estimators)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

n_estimators = grid_result.best_params_['n_estimators']
max_features = ['auto', 'sqrt']
grid = dict(max_features=max_features)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

max_features = grid_result.best_params_['max_features']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]

grid = dict(max_depth=max_depth)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=n_estimators, max_features=max_features)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

max_depth = grid_result.best_params_['max_depth']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

grid = dict(min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf)
# grid search
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth)
cv = RepeatedStratifiedKFold(random_state=0)
grid_search = GridSearchCV(estimator=model_rf, param_grid=grid, scoring='f1', cv=cv, error_score=0)
grid_result = grid_search.fit(train_X2, train_Y)
# get the best result
print("Best: %f using %s with all features" % (grid_result.best_score_, grid_result.best_params_))

min_samples_split = grid_result.best_params_['min_samples_split']
min_samples_leaf = grid_result.best_params_['min_samples_leaf']
model_rf = RandomForestClassifier(random_state=0, n_jobs=-1, 
                                  n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, 
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
model_rf.fit(train_X2, train_Y)
predict = model_rf.predict(test_X2)
print(f1_score(test_Y, predict))