In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [47]:
# Datasets available in
# https://www.unsw.adfa.edu.au/australian-centre-for-cyber-security/cybersecurity/ADFA-NB15-Datasets/

train_data = pd.read_csv('data/UNSW_NB15_training-set.csv', index_col='id')
test_data = pd.read_csv('data/UNSW_NB15_testing-set.csv', index_col='id')

In [18]:
train_data.head()

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,254,...,1,2,0,0,0,1,2,0,Normal,0
2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,254,...,1,2,0,0,0,1,2,0,Normal,0
3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,254,...,1,3,0,0,0,1,3,0,Normal,0
4,6e-06,udp,-,INT,2,0,900,0,166666.6608,254,...,1,3,0,0,0,2,3,0,Normal,0
5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,254,...,1,3,0,0,0,2,3,0,Normal,0


In [19]:
train_data.attack_cat.unique()

array(['Normal', 'Reconnaissance', 'Backdoor', 'DoS', 'Exploits',
       'Analysis', 'Fuzzers', 'Worms', 'Shellcode', 'Generic'], dtype=object)

In [53]:
train_data.label.unique()

array([0, 1], dtype=int64)

In [54]:
le = LabelEncoder()
data_sets = [train_data, test_data]
for data_set in data_sets:
    cols = ['proto', 'service', 'state', 'attack_cat']
    for col in cols:
        data_set[col] = le.fit_transform(data_set[col])

In [55]:
train_data.attack_cat.unique()

array([6, 7, 1, 2, 3, 0, 4, 9, 8, 5], dtype=int64)

In [56]:
train_feature_cols = list(set(data.columns) - set(['attack_cat', 'label']))

In [57]:
rf = RandomForestClassifier()

In [58]:
rf.fit(train_data[feature_cols], train_data.label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [59]:
rf.score(test_data[feature_cols], test_data.label)

0.89724023474258729

In [62]:
test_data['predicted'] = rf.predict(test_data[feature_cols])

### Comparison using gridsearch

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

In [68]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200], 'weights':[[2,5,1],[3,4,1]]}


In [73]:
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(train_data[feature_cols], train_data.label)
grid.best_params_

{'lr__C': 1.0, 'rf__n_estimators': 200, 'weights': [2, 5, 1]}

In [74]:
from sklearn.model_selection import cross_val_score
cross_val_score(grid.best_estimator_, train_data[feature_cols], train_data.label , cv=5)

array([ 0.94309832,  0.99240906,  0.94928944,  0.92596866,  0.95141504])