Skip to content

silvvh/AttackDetectionAI-

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

10 Commits
 
 
 
 
 
 
 
 

Repository files navigation

Open In Colab

# Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, make_scorer, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_svmlight_file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns

#IAs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# Load the intrusion dataset and strip leading/trailing whitespace from every
# column name, so later column lookups do not depend on stray spaces in the
# CSV header.
df = pd.read_csv("cybersecurity_intrusion_data.csv").rename(columns=str.strip)

# Show a summary of the dataset (rendered as the notebook cell output).
df
session_id network_packet_size protocol_type login_attempts session_duration encryption_used ip_reputation_score failed_logins browser_type unusual_time_access attack_detected
0 SID_00001 599 TCP 4 492.983263 DES 0.606818 1 Edge 0 1
1 SID_00002 472 TCP 3 1557.996461 DES 0.301569 0 Firefox 0 0
2 SID_00003 629 TCP 3 75.044262 DES 0.739164 2 Chrome 0 1
3 SID_00004 804 UDP 4 601.248835 DES 0.123267 0 Unknown 0 1
4 SID_00005 453 TCP 5 532.540888 AES 0.054874 1 Firefox 0 0
... ... ... ... ... ... ... ... ... ... ... ...
9532 SID_09533 194 ICMP 3 226.049889 AES 0.517737 3 Chrome 0 1
9533 SID_09534 380 TCP 3 182.848475 NaN 0.408485 0 Chrome 0 0
9534 SID_09535 664 TCP 5 35.170248 AES 0.359200 1 Firefox 0 0
9535 SID_09536 406 TCP 4 86.664703 AES 0.537417 1 Chrome 1 0
9536 SID_09537 340 TCP 6 86.876744 NaN 0.277069 4 Chrome 1 1

9537 rows × 11 columns

#OneHotEncoder
transformers = [
    ('oh_protocol_type', OneHotEncoder(sparse_output=False), ['protocol_type']),
    ('oh_encryption_used', OneHotEncoder(sparse_output=False), ['encryption_used']),
    ('oh_browser_type',OneHotEncoder(sparse_output=False), ['browser_type'] )
]
ct_oh = ColumnTransformer(
    transformers, remainder='passthrough'
)

 # as características: todas as colunas exceto "attack_detected"
X = df.drop(["attack_detected", "session_id"], axis=1)
# y são os rótulos: apenas a coluna "attack_detected"
y = df["attack_detected"]

X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, stratify=y, test_size=0.2, random_state=99)
X_treino, X_val, y_treino, y_val = train_test_split(X_treino, y_treino, stratify=y_treino, test_size=0.2, random_state=100)
def get_model_and_params(model_type):
    """Return an unfitted classifier and its hyperparameter search grid.

    Every grid key carries the ``model__`` prefix, so the grid addresses a
    pipeline step named ``model``.

    Args:
        model_type: One of ``'svm'``, ``'knn'``, ``'tree'`` or ``'mlp'``.

    Returns:
        A ``(model, param_grid)`` tuple: a freshly constructed estimator and
        the dict of candidate hyperparameter values for it.

    Raises:
        ValueError: If ``model_type`` is not one of the supported keys.
    """
    # Builders are stored as callables so that only the requested estimator
    # is actually instantiated.
    builders = {
        'svm': lambda: SVC(),
        'knn': lambda: KNeighborsClassifier(),
        'tree': lambda: DecisionTreeClassifier(),
        'mlp': lambda: MLPClassifier(
            max_iter=2000,
            solver='adam',
            learning_rate_init=0.001,
            early_stopping=True,
        ),
    }
    grids = {
        'svm': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf'],
        },
        'knn': {
            'model__n_neighbors': [3, 5, 7],
            'model__weights': ['uniform', 'distance'],
            'model__metric': ['euclidean', 'manhattan'],
        },
        'tree': {
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5, 10],
        },
        'mlp': {
            'model__hidden_layer_sizes': [(50,), (100, 50), (100, 100, 100)],
            'model__activation': ['relu', 'tanh'],
        },
    }

    if model_type not in builders:
        raise ValueError("Modelo desconhecido")

    return builders[model_type](), grids[model_type]

# Choose the model (change it here)
model_type = 'svm'  # Options: 'svm', 'knn', 'tree', 'mlp'
model, param_grid = get_model_and_params(model_type)

# Build the pipeline: one-hot encoding -> standardization -> classifier
pipeline = Pipeline(steps=[
    ('preprocessing', ct_oh),
    ('scaler', StandardScaler()),
    ('model', model),
])

# Grid search over param_grid, scored by weighted F1 on 5 shuffled
# stratified folds of the training split.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average='weighted')

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=kf,
    n_jobs=-1,
)
grid_search.fit(X_treino, y_treino)

# Score the selected estimator on the validation split and, as the final
# evaluation, on the test split.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_teste)
f1_val = f1_score(y_val, y_val_pred, average='weighted')
f1_test = f1_score(y_teste, y_test_pred, average='weighted')

print("Melhores hiperparâmetros encontrados:")
print(grid_search.best_params_)
print(f"F1-score no conjunto de validação: {f1_val:.4f}")
print(f"F1-score no conjunto de teste: {f1_test:.4f}")
Melhores hiperparâmetros encontrados:
{'model__C': 10, 'model__kernel': 'rbf'}
F1-score no conjunto de validação: 0.8645
F1-score no conjunto de teste: 0.8607
# METRICS
# Weighted F1 is the scoring metric: per-class F1 averaged by class support,
# which accounts for the imbalance between the two classes.
f1_scorer = make_scorer(f1_score, average='weighted')

# Cross-validate the tuned estimator. The `pipeline` object still carries the
# default hyperparameters (GridSearchCV works on clones of it), so scoring
# `pipeline` here would describe a different model than the one selected.
# cross_val_score clones its estimator, so best_model itself is not refit.
# NOTE(review): X/y is the full dataset, so these folds include the rows used
# for tuning and the test rows; treat this as a stability estimate, not as a
# held-out score.
scores = cross_val_score(best_model, X, y, cv=kf, scoring=f1_scorer)

# Results
print(f"F1-score médio (cross-validation): {scores.mean():.4f} ± {scores.std():.4f}")

# Predict on the test split with the tuned estimator. GridSearchCV (default
# refit=True) has already refit best_model on the training split, so no
# further fit is needed and the report below matches the selected model.
y_pred = best_model.predict(X_teste)

# Accuracy
accuracy = accuracy_score(y_teste, y_pred)
print(f"Acurácia: {accuracy:.4f}")

# Classification metrics: F1-score, precision, recall
print("Métricas de Classificação:")
print(classification_report(y_teste, y_pred))
F1-score médio (cross-validation): 0.8590 ± 0.0094
Acurácia: 0.8443
Métricas de Classificação:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87      1055
           1       0.91      0.72      0.81       853

    accuracy                           0.84      1908
   macro avg       0.86      0.83      0.84      1908
weighted avg       0.85      0.84      0.84      1908
# Build the confusion matrix from the test labels and the predictions
cm = confusion_matrix(y_teste, y_pred)

# Draw it as an annotated heatmap (integer cell counts) on an explicit Axes
class_labels = ['Classe 0', 'Classe 1']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Matriz de Confusão com Valores')
ax.set_xlabel('Classes Preditas')
ax.set_ylabel('Classes Reais')
plt.show()

About

A predictive model for cyberattack detection leveraging user behavior analysis and network metrics, including protocols and packet size

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors