In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Never truncate wide frames: show all columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Root directory that is walked recursively for .csv / .parquet files.
datalocation = './datasets.uwf.edu/data/UWF-ZeekData22/'

# Column labels applied to every loaded file; a file must have exactly
# this many columns to be accepted.
cols = [
    'resp_pkts',
    'service',
    'orig_ip_bytes',
    'local_resp',
    'missed_bytes',
    'protocol',
    'duration',
    'conn_state',
    'dest_ip',
    'orig_pkts',
    'community_id',
    'resp_ip_bytes',
    'dest_port',
    'orig_bytes',
    'local_orig',
    'datetime',
    'history',
    'resp_bytes',
    'uid',
    'src_port',
    'ts',
    'src_ip',
    'mitre_attack_tactics',
]


def load_zeek_frames(location, columns):
    """Load every .csv/.parquet file below ``location`` into one DataFrame.

    Parameters
    ----------
    location : str
        Directory walked recursively with ``os.walk``. A missing directory
        simply yields no files.
    columns : list of str
        Column labels of the result. Files whose column count differs from
        ``len(columns)`` are reported with "wrong shape" and skipped.

    Returns
    -------
    pandas.DataFrame
        All accepted files stacked row-wise with a fresh 0..n-1 index and
        ``columns`` as labels. Has zero rows when no file was accepted.
    """
    frames = []
    for root, _dirs, files in os.walk(location):
        for name in files:
            path = os.path.join(root, name)
            if name.endswith('.csv'):
                dfi = pd.read_csv(path)
            elif name.endswith('.parquet'):
                dfi = pd.read_parquet(path, engine='pyarrow')
            else:
                continue
            if dfi.shape[1] != len(columns):
                print("wrong shape: ", name)
                continue
            print(name)
            if set(dfi.columns) == set(columns):
                # Same names, possibly in another order: align by name so
                # values never end up under the wrong label.
                dfi = dfi[columns]
            else:
                # Names differ: fall back to labelling the columns by position.
                dfi.columns = list(columns)
            frames.append(dfi)
    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat at the end avoids re-copying all previous rows per file and
    # keeps each column's dtype instead of degrading everything to object.
    return pd.concat(frames, ignore_index=True)


df = load_zeek_frames(datalocation, cols)
print(df.shape)
        

In [None]:
# Columns kept for modelling. The target 'mitre_attack_tactics' is included
# here as well, so at this point X still carries the label column.
select = [
    'resp_pkts',
    'service',
    'orig_ip_bytes',
    'local_resp',
    'missed_bytes',
    'protocol',
    'duration',
    'conn_state',
    'orig_pkts',
    'resp_ip_bytes',
    'dest_port',
    'orig_bytes',
    'local_orig',
    'resp_bytes',
    'src_port',
    'mitre_attack_tactics',
]

# Independent copy, so edits to X never write back into df.
X = df.loc[:, select].copy()

# Target as a one-column DataFrame holding the raw tactic values.
y = X['mitre_attack_tactics'].to_frame()

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# The same encoder object is re-fitted for each column in turn. The target
# column must stay LAST: after the loop `le` holds the target's classes,
# which the transform of `y` below relies on.
categorical_columns = (
    'service',
    'local_resp',
    'protocol',
    'conn_state',
    'local_orig',
    'mitre_attack_tactics',
)
for column in categorical_columns:
    encoded = le.fit_transform(X[column])
    X[column] = encoded

# Replace the one-column label frame with its integer class codes.
y = le.transform(y)

In [None]:
# Replace every remaining missing value with 0.
X = X.fillna(0)

In [None]:
# Scale the feature columns to zero mean / unit variance.

from sklearn.preprocessing import StandardScaler

s = StandardScaler()

# The encoded target must not be a model input: left in X, every classifier
# below would be given the answer as a feature. Scale the features only.
label_column = 'mitre_attack_tactics'
feature_columns = [c for c in select if c != label_column]

# fit_transform returns a bare ndarray, so the labels are re-attached here.
# Pass the flat list: wrapping it in another list (columns=[select]) makes
# pandas build MultiIndex columns.
X = pd.DataFrame(s.fit_transform(X[feature_columns]), columns=feature_columns)

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(X)
labels = kmeans.labels_

# K-means numbers its clusters arbitrarily, so a cluster id cannot be compared
# directly with an encoded class id. Assign each cluster the most frequent
# true class among its members and count matches against that assignment.
y_true = np.asarray(y).ravel()
predicted_class = np.empty_like(y_true)
for cluster_id in range(kmeans.n_clusters):
    in_cluster = labels == cluster_id
    if not in_cluster.any():
        continue  # no samples fell in this cluster: nothing to assign
    classes, counts = np.unique(y_true[in_cluster], return_counts=True)
    predicted_class[in_cluster] = classes[counts.argmax()]

# Vectorised count; the builtin sum() would iterate the array in Python.
correct_labels = int((predicted_class == y_true).sum())

print('K-means correctly labeled: ', round(correct_labels/y_true.size, 2))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Fraction of held-out rows whose predicted class equals the true class.
n_correct = (y_test == y_pred).sum()
print('Naïve Bayes correctly classified: ', round(n_correct/X_test.shape[0], 2))

In [None]:
# MLPRegressor is still imported in case a later cell refers to the name;
# the model trained here is the classifier.
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Same 80/20 split and seed as the Naive Bayes cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The target is a set of discrete class codes, so a classifier is required.
# A regressor returns continuous values that almost never equal an integer
# class code exactly, which makes the equality check below report ~0.
mlp = MLPClassifier(random_state = 1)

y_pred = mlp.fit(X_train, y_train).predict(X_test)

print('MLP correctly classified: ', round((y_test == y_pred).sum()/X_test.shape[0], 2))