In [1]:
# Mount Google Drive at /content/drive; the data files read later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
###### Main Module
#####
###########################
###### Packages
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from tabulate import tabulate

import time
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Algos
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

###########################

# Experimentos com ML tradicional

In [3]:
# Seed NumPy's global random state with a fixed value (100).
np.random.seed(100)

In [4]:
# Read the header-less feature CSV from Drive and keep only the underlying
# NumPy array (no DataFrame is retained).
features_path = "/content/drive/MyDrive/IntrusionDetection/CICIDS2017/02-TreatedDataset/CIC2017_features.csv"
features_file = pd.read_csv(features_path, header=None).values

In [5]:
# Read the header-less label CSV from Drive and flatten it to a 1-D array.
labels_path = "/content/drive/MyDrive/IntrusionDetection/CICIDS2017/02-TreatedDataset/CIC2017_labels.csv"
labels_file = pd.read_csv(labels_path, header=None).values.ravel()

In [6]:
# 70/30 train/test split, stratified on the labels, with a fixed random state.
x_train, x_test, y_train, y_test = train_test_split(
    features_file,
    labels_file,
    test_size=0.3,
    random_state=100,
    stratify=labels_file,
)


In [7]:
def print_shapes(x_train, x_test, y_train, y_test):
    """Print the ``.shape`` attribute of each split, one line per array.

    Args:
        x_train: training features (any object exposing ``.shape``).
        x_test: test features (any object exposing ``.shape``).
        y_train: training labels (any object exposing ``.shape``).
        y_test: test labels (any object exposing ``.shape``).

    Returns:
        None. The four lines are written to standard output.
    """
    named_splits = (
        ("x_train", x_train),
        ("x_test", x_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )
    for split_name, split in named_splits:
        print(f"Shape de {split_name}:", split.shape)

# Call the function to print the shapes of the four split arrays
print_shapes(x_train, x_test, y_train, y_test)


Shape de x_train: (1981520, 77)
Shape de x_test: (849223, 77)
Shape de y_train: (1981520,)
Shape de y_test: (849223,)


In [8]:
# Number of distinct label values present in the training split.
distinct_train_labels = np.unique(y_train)
print(len(distinct_train_labels))

15


In [9]:
# Distinct label values present in the training split (np.unique returns them sorted).
train_label_values = np.unique(y_train)
print(train_label_values)

['BENIGN' 'Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web Attack � Brute Force' 'Web Attack � Sql Injection'
 'Web Attack � XSS']


## RF

In [24]:
# Time how long fitting the model takes.

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() follows the wall clock and can jump
# if the system clock is adjusted during the fit.
inicio = time.perf_counter()

# Random forest with 10 trees and a fixed random_state, fitted on the training split.
clf = RandomForestClassifier(n_estimators=10, random_state=1, verbose=0).fit(x_train, y_train)

fim = time.perf_counter()
tempo_total = fim - inicio
print(f"Tempo de execução: {tempo_total} segundos")

Tempo de execução: 121.95485734939575 segundos


In [25]:
# Time the prediction step.

# Only clf.predict() sits inside the timed region. Previously the
# balanced-accuracy computation was timed as well, so the reported
# "prediction time" also included the cost of scoring.
inicio = time.perf_counter()

y_pred = clf.predict(x_test)

fim = time.perf_counter()
tempo_total = fim - inicio

# Score after the timer has stopped.
acc = balanced_accuracy_score(y_test, y_pred)

print(acc)
print(f"Tempo de execução: {tempo_total} segundos")

0.8089674778234054
Tempo de execução: 4.133983135223389 segundos


## Apenas amostras limitadas ao ponto de corte definido

In [9]:
# Load the five saved experiment result objects from Drive.
#
# SECURITY: allow_pickle=True makes np.load unpickle arbitrary Python objects,
# which can execute code. Only load .npy files that come from a trusted source.
EXPERIMENT_DIR = '/content/drive/MyDrive/IntrusionDetection/CICIDS2017/03-Experiments/2024-06-17'


def _load_run(tag):
    """Load ``CIC2017-EN-s<tag>-15p.npy`` from EXPERIMENT_DIR and unwrap it with ``.item()``.

    Args:
        tag: value substituted after the ``s`` in the file name (100, 200, ...).

    Returns:
        The Python object stored in the 0-d object array saved in the file.
    """
    return np.load(f'{EXPERIMENT_DIR}/CIC2017-EN-s{tag}-15p.npy', allow_pickle=True).item()


data1 = _load_run(100)
data2 = _load_run(200)
data3 = _load_run(300)
data4 = _load_run(400)
data5 = _load_run(500)

In [134]:
# Bind the name `data` to the first loaded run (same object as data1, not a copy).
data = data1

In [5]:
# Display the entry stored under key 150030 of the first run.
data1[150030]

{'querying_time': [3.4786675000796095],
 'classification_time': [6.259936099988408],
 'accuracy': [0.9985433743551458],
 'balanced_accuracy': [0.8119526911235802],
 'selected_indices': [[41937,
   454697,
   1001517,
   1133624,
   18417,
   1195505,
   1309770,
   507136,
   317322,
   1207898,
   188016,
   1329551,
   1354872,
   1502785,
   543613,
   1322820,
   1299333,
   971228,
   77439,
   1329662,
   1421974,
   713407,
   377750,
   107159,
   1614063,
   764443,
   957261,
   1279510,
   1422125,
   252235,
   1814545,
   1116604,
   543756,
   1445579,
   971541,
   971551,
   1196133,
   401405,
   1755635,
   136480,
   77064,
   1196158,
   814540,
   136465,
   1134637,
   305264,
   876167,
   1474803,
   1638856,
   1052926,
   1279756,
   390181,
   1388257,
   758118,
   294767,
   1445317,
   947740,
   76889,
   1355910,
   1422577,
   360303,
   1355952,
   663210,
   1102346,
   728712,
   1109991,
   1523130,
   1668229,
   480255,
   1724903,
   1803784,
   

In [10]:
# Aggregate the five loaded experiment runs (data1..data5).
#
# For each metric the cell prints the mean and the population standard
# deviation (np.std default, ddof=0) across the five runs, then prints a table
# with the number of occurrences of each class across the runs' 'classes'
# arrays at the reported entry.

experiment_runs = [data1, data2, data3, data4, data5]

# Key of the per-run entry whose metrics are reported.
# NOTE(review): presumably the cut-off point mentioned in the section heading - confirm.
REPORT_KEY = 150030
# Only the first 300 collected timing values of each run are summed.
MAX_ENTRIES = 300
# Per-run timing totals are divided by 60 before being reported.
# NOTE(review): presumably a seconds -> minutes conversion - confirm.
TIME_DIVISOR = 60


def _first_values(run, field):
    """Collect ``entry[field][0]`` from every usable entry of one run.

    An entry is used only when its key is a Python ``int`` and the entry
    contains a 'balanced_accuracy' key; all other entries are skipped.
    """
    return [
        entry[field][0]
        for key, entry in run.items()
        if isinstance(key, int) and 'balanced_accuracy' in entry
    ]


def _reported(field):
    """Return ``run[REPORT_KEY][field][0]`` for each of the five runs."""
    return [run[REPORT_KEY][field][0] for run in experiment_runs]


def _print_mean_and_std(mean_label, std_label, values, divisor=1):
    """Print the mean and population standard deviation of ``values / divisor``.

    Args:
        mean_label: text printed before the mean.
        std_label: text printed before the standard deviation.
        values: one number per run.
        divisor: constant every value is divided by before reporting.
    """
    per_run = np.array(values)
    print(mean_label, per_run.sum() / len(per_run) / divisor)
    print(std_label, np.std(per_run / divisor))


querying_times = [_first_values(run, 'querying_time') for run in experiment_runs]
classification_times = [_first_values(run, 'classification_time') for run in experiment_runs]

# Keep the per-run list names that this cell has always defined.
(querying_time1, querying_time2, querying_time3,
 querying_time4, querying_time5) = querying_times
(classification_time1, classification_time2, classification_time3,
 classification_time4, classification_time5) = classification_times

_print_mean_and_std(
    "querying_time médio:", "querying_time desvio:",
    [sum(times[0:MAX_ENTRIES]) for times in querying_times],
    divisor=TIME_DIVISOR,
)
_print_mean_and_std(
    "query time iteration média:", "query time iteration desvio:",
    _reported('querying_time'),
)
_print_mean_and_std(
    "classification_time médio:", "classification_time desvio:",
    [sum(times[0:MAX_ENTRIES]) for times in classification_times],
    divisor=TIME_DIVISOR,
)
_print_mean_and_std(
    "classification_time iteration média:", "classification_time iteration desvio:",
    _reported('classification_time'),
)
_print_mean_and_std("Classes média:", "Classes desvio:", _reported('classes_know'))
_print_mean_and_std("Acurácia média:", "Acurácia desvio:", _reported('accuracy'))
_print_mean_and_std(
    "balanced accuracy média:", "balanced accuracy desvio:",
    _reported('balanced_accuracy'),
)


# Expected class names. The strings (including the '�' characters) are kept
# byte-for-byte so they compare equal to the label values stored in the data.
unique_classes = ['BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk',
                  'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator',
                  'Heartbleed', 'Infiltration', 'PortScan', 'SSH-Patator',
                  'Web Attack � Brute Force', 'Web Attack � Sql Injection',
                  'Web Attack � XSS']

# Concatenate the 'classes' arrays of all runs into a single array.
all_classes = np.concatenate([run[REPORT_KEY]['classes'][0] for run in experiment_runs])

# Count the occurrences of each class that was found.
unique_classes_found, class_counts_found = np.unique(all_classes, return_counts=True)
found_counts = dict(zip(unique_classes_found, class_counts_found))

# One count per expected class; expected classes that never occur get 0 and
# classes outside the expected list are ignored. (The former second pass that
# "added zero for missing classes" could never trigger, because every expected
# class was already present in the dictionary.)
class_counts_all = {cls: found_counts.get(cls, 0) for cls in unique_classes}

# Rows for the table, in the order of unique_classes.
table_data = [[cls, class_counts_all[cls]] for cls in unique_classes]

# Print the table using tabulate.
print(tabulate(table_data, headers=['Classe', 'Frequência'], tablefmt='grid'))

querying_time médio: 19.709712779338396
querying_time desvio: 0.49385077683875567
query time iteration média: 3.8520331400213763
query time iteration desvio: 0.3299351225617952
classification_time médio: 11.768705651000296
classification_time desvio: 0.6886675374718828
classification_time iteration média: 5.869737900001928
classification_time iteration desvio: 0.5275025011948499
Classes média: 13.0
Classes desvio: 0.8944271909999159
Acurácia média: 0.998542432317542
Acurácia desvio: 4.837067266409835e-05
balanced accuracy média: 0.8282426608037421
balanced accuracy desvio: 0.016194247348376087
+----------------------------+--------------+
| Classe                     |   Frequência |
| BENIGN                     |            5 |
+----------------------------+--------------+
| Bot                        |            5 |
+----------------------------+--------------+
| DDoS                       |            5 |
+----------------------------+--------------+
| DoS GoldenEye              | 