In [25]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Location of the X-IIoTID CSV, relative to the notebook's working directory.
path = os.path.join("X-IIoTID dataset.csv")
try:
    df = pd.read_csv(path, low_memory=False)
    print(f"✅ Datos cargados: {df.shape[0]} filas, {df.shape[1]} columnas.")
except FileNotFoundError:
    # Every later cell reads `df`. Report the problem and stop here instead of
    # continuing and failing further down with an unrelated NameError.
    print("❌ Error: No se encontró el archivo.")
    raise

✅ Datos cargados: 820834 filas, 68 columnas.


In [26]:
# Shared seed; it is passed to both train_test_split calls so the splits are reproducible.
random_state = 42

In [27]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820834 entries, 0 to 820833
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Date                         820503 non-null  object
 1   Timestamp                    820537 non-null  object
 2   Scr_IP                       820834 non-null  object
 3   Scr_port                     820834 non-null  object
 4   Des_IP                       820834 non-null  object
 5   Des_port                     820834 non-null  object
 6   Protocol                     820834 non-null  object
 7   Service                      820834 non-null  object
 8   Duration                     820834 non-null  object
 9   Scr_bytes                    820834 non-null  object
 10  Des_bytes                    820834 non-null  object
 11  Conn_state                   820834 non-null  int64 
 12  missed_bytes                 820834 non-null  object
 13  is_syn_only   

In [28]:
from sklearn.model_selection import train_test_split

"""Divide los datos en entrenamiento, validación y prueba."""
# Features: every column except the three label columns.
X = df.drop(columns=['class1', 'class2', 'class3'])
# Binary target; any class3 value other than 'Normal'/'Attack' maps to NaN.
y_class3 = df['class3'].map({'Normal': 0, 'Attack': 1})
y_class2 = df['class2']
y_class1 = df['class1']

# First split: 70% train / 30% held out, stratified on the binary label.
X_train, X_temp, y_train_class3, y_temp_class3, y_train_class2, y_temp_class2, y_train_class1, y_temp_class1 = train_test_split(
    X, y_class3, y_class2, y_class1, test_size=0.3, random_state=random_state, stratify=y_class3, shuffle=True
)

# Second split: the held-out 30% is halved into validation and test (again stratified on class3).
X_val, X_test, y_val_class3, y_test_class3, y_val_class2, y_test_class2, y_val_class1, y_test_class1 = train_test_split(
    X_temp, y_temp_class3, y_temp_class2, y_temp_class1, test_size=0.5, random_state=random_state, stratify=y_temp_class3, shuffle=True
)

# Reset the indices so features and labels of each split share a clean 0..n-1 index
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)  # Optional

y_train_class3 = y_train_class3.reset_index(drop=True)
y_val_class3 = y_val_class3.reset_index(drop=True)
y_test_class3 = y_test_class3.reset_index(drop=True)  # Optional

y_train_class2 = y_train_class2.reset_index(drop=True)
y_val_class2 = y_val_class2.reset_index(drop=True)
y_test_class2 = y_test_class2.reset_index(drop=True)  # Optional

y_train_class1 = y_train_class1.reset_index(drop=True)
y_val_class1 = y_val_class1.reset_index(drop=True)
y_test_class1 = y_test_class1.reset_index(drop=True)  # Optional

In [29]:
import pandas as pd
def fix_dtype(df, umbral_numerico=0.7):
    """Convert bool, int64 and convertible object columns of ``df`` to float, in place.

    Rules, applied per column:
      * bool columns become 0.0 / 1.0;
      * object columns holding the strings ``"true"`` / ``"false"`` are mapped
        to 1.0 / 0.0 (any other value in such a column becomes NaN);
      * other object columns are parsed with ``pd.to_numeric``; the column is
        replaced by the parsed floats only when the fraction of values that
        parse is strictly greater than ``umbral_numerico`` (values that do not
        parse become NaN), otherwise it is left untouched;
      * int64 columns are cast to float.

    Parameters:
        df: DataFrame to normalise. It is modified in place.
        umbral_numerico: minimum fraction (exclusive) of parseable values needed
            to turn an object column into a numeric one.

    Returns:
        The same ``df`` object, for call chaining.
    """
    valores_booleanos = {"true", "false"}

    object_cols = df.select_dtypes(include=['object']).columns
    int_cols = df.select_dtypes(include=['int64']).columns
    bool_cols = df.select_dtypes(include=['bool']).columns

    # Booleans to float
    df[bool_cols] = df[bool_cols].astype(float)

    for col in object_cols:
        valores_unicos = set(df[col].dropna().unique())

        # A column is boolean-like when both literals are present, or when its
        # non-null values are nothing but these literals. The second case covers
        # a split in which only "true" or only "false" occurs, so the column
        # gets the same dtype in every split.
        es_booleana = bool(valores_unicos) and (
            valores_booleanos <= valores_unicos or valores_unicos <= valores_booleanos
        )

        if es_booleana:
            df[col] = df[col].map({'true': 1, 'false': 0}).astype(float)
        else:
            converted = pd.to_numeric(df[col], errors='coerce')
            if converted.notna().mean() > umbral_numerico:
                df[col] = converted.astype(float)

    for col in int_cols:
        df[col] = df[col].astype(float)

    return df

def delete_ip_port(df):
    """Return ``df`` without the address, port and traffic-volume columns.

    Besides the source/destination IP and port columns, the byte, packet,
    IP-byte and ratio columns are removed as well. ``DataFrame.drop`` raises
    ``KeyError`` if any of the listed columns is absent. The input frame is
    not modified.
    """
    direcciones = ['Scr_IP', 'Scr_port', 'Des_IP', 'Des_port']
    volumen = ['Scr_bytes', 'Des_bytes', 'Scr_pkts', 'Des_pkts', 'Scr_ip_bytes', 'Des_ip_bytes']
    proporciones = ['Scr_packts_ratio', 'Des_pkts_ratio', 'Scr_bytes_ratio', 'Des_bytes_ratio']

    columnas_descartadas = direcciones + volumen + proporciones
    return df.drop(columns=columnas_descartadas)

In [30]:
import numpy as np
# Reemplazos comunes de valores
# Placeholder strings treated as "no value"; each one is replaced by NaN.
common_replacements = dict.fromkeys(('-', '?', 'nan'), np.nan)

def replace_common_values(df):
    """Turn the placeholder strings '-', '?' and 'nan' into NaN in every object column.

    Only object-dtype columns are visited. ``df`` is modified in place and
    also returned.
    """
    columnas_texto = df.select_dtypes(include=['object']).columns
    for nombre in columnas_texto:
        serie_limpia = df[nombre].replace(common_replacements)
        df[nombre] = serie_limpia
    return df

def fix_mayus(df):
    """Lower-case the string values of every object column of ``df``, in place.

    Only values that are ``str`` instances are changed. Numbers, booleans,
    ``None`` and NaN stored in an object column are kept as they are; the
    ``.str`` accessor would turn them into NaN, or raise when the column holds
    no strings at all. Columns keep their object dtype.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.select_dtypes(include=['object']).columns:
        en_minusculas = df[col].map(lambda valor: valor.lower() if isinstance(valor, str) else valor)
        # map() re-infers the dtype; force object so the column type is stable.
        df[col] = en_minusculas.astype(object)
    return df


In [31]:
from sklearn.impute import KNNImputer, SimpleImputer
# StandardScaler belongs to sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works because that module happens to
# import it for its own use.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler

# Available imputers, grouped by the kind of column they are meant for.
# NOTE(review): KNNImputer is listed under 'categorical' but is not selected in
# the cells visible here; confirm it can handle string categories before using it.
imputers = {
    'categorical': {
        'most_frequent': SimpleImputer(strategy='most_frequent'),
        'knn': KNNImputer(n_neighbors=5)
    },
    'numeric': {
        'mean': SimpleImputer(strategy='mean'),
        'median': SimpleImputer(strategy='median')
    }
}

# Available scalers for the numeric columns.
scalers = {
    "robust": RobustScaler(),
    "standard": StandardScaler()
}

# Available encoders for the categorical columns. Both are configured so that
# categories unseen during fit do not raise at transform time.
encoders = {
    "one_hot": OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    "ordinal": OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
}

In [32]:
# Pick the concrete preprocessing components used by the rest of the notebook.
imputador_cat= imputers['categorical']['most_frequent']
imputador_num = imputers['numeric']['mean']
normalizacion = scalers['robust']
# Despite the name ("decoder"), this is the one-hot *encoder*.
decodificador = encoders['one_hot']

In [33]:
import numpy as np
from sklearn.decomposition import PCA
from collections import Counter

def matriz_correlacion(df):
    """Return the pairwise correlation matrix of the float64/int64 columns of ``df``."""
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    return df[numeric_cols].corr()

def correlacion_pares(df, umbral):
    """List the columns that are highly correlated with an earlier column.

    The correlation matrix of the numeric columns is reduced to its strict
    upper triangle, so each pair is looked at once. A column is reported when
    its correlation with any column that precedes it is greater than
    ``umbral``; of each such pair, the later column is the one reported.

    Parameters:
        df: DataFrame with the candidate features.
        umbral: correlation threshold (exclusive).

    Returns:
        List of column names, in the order they appear in the matrix.

    NOTE(review): the comparison is on the signed correlation, so strongly
    negative correlations are not reported — confirm that this is intended.
    """
    corr = matriz_correlacion(df)

    # Keep only the part above the diagonal to avoid duplicates and self-correlation.
    upper_tri = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    # Vectorised per-column check; NaN cells compare as False.
    supera_umbral = (upper_tri > umbral).any(axis=0)
    return supera_umbral[supera_umbral].index.tolist()

def correlacion_respecto_objetivo(df, target, umbral):
    """List the numeric columns whose absolute correlation with ``target`` is below ``umbral``.

    Parameters:
        df: DataFrame with the candidate features (only float64/int64 columns are used).
        target: Series correlated against each column; it is aligned with ``df`` by index.
        umbral: columns with absolute correlation strictly below this value are returned.

    Returns:
        List of column names ordered by increasing absolute correlation.
        Columns whose correlation is NaN are never returned.
    """
    numericas = df.select_dtypes(include=['float64', 'int64'])

    # Absolute correlation of each feature with the target, weakest first.
    corr_abs = numericas.corrwith(target).abs().sort_values(ascending=True)

    # Keep the names of the features that fall below the threshold.
    debiles = corr_abs.loc[lambda serie: serie < umbral]
    return debiles.index.tolist()

def seleccionar_variables_pca(X_train, X_val, n_components, num_top_features):
    """Use PCA loadings to choose features, returning the original (untransformed) columns.

    A PCA is fitted on ``X_train`` only. For every principal component the
    ``num_top_features`` columns with the largest absolute loading are taken,
    and the selection is the union of those columns over all components,
    ordered by how many components picked them.

    Parameters:
        X_train: training DataFrame; the PCA is fitted on it.
        X_val: validation DataFrame; it is only filtered, never used for fitting.
        n_components: forwarded to ``PCA(n_components=...)``.
        num_top_features: number of columns taken from each component.

    Returns:
        (X_train_filtrado, X_val_filtrado): both frames restricted to the
        selected columns, in the same column order.
    """
    # The PCA is fitted only to read its loadings; the data is not projected.
    modelo_pca = PCA(n_components=n_components, random_state=42)
    modelo_pca.fit(X_train)

    nombres = np.array(X_train.columns)

    # Count in how many components each column ranks among the top loadings.
    frecuencia = Counter()
    for cargas in modelo_pca.components_:
        orden = np.argsort(np.abs(cargas))
        frecuencia.update(nombres[orden[-num_top_features:]])

    # Most frequently selected columns first.
    seleccionadas = [nombre for nombre, _ in frecuencia.most_common()]

    return X_train[seleccionadas], X_val[seleccionadas]

In [34]:
def calculo_varianza(df):
    """Return the names of the columns of ``df`` whose variance is exactly zero.

    Columns whose variance is NaN are not reported.
    """
    varianzas = df.var()
    sin_varianza = varianzas[varianzas == 0]
    return sin_varianza.index.tolist()


In [35]:
# Clean the training features: placeholders -> NaN, lower-case text,
# normalise dtypes, then drop the address/port/volume columns.
X_train = replace_common_values(X_train)
X_train = fix_mayus(X_train)
X_train = fix_dtype(X_train)
X_train = delete_ip_port(X_train)

# Re-select the labels by the feature index.
# NOTE(review): none of the steps above removes rows, so this looks like a no-op — confirm.
y_train_class3 = y_train_class3.loc[X_train.index]
y_train_class2 = y_train_class2.loc[X_train.index]
y_train_class1 = y_train_class1.loc[X_train.index]

# Same cleaning for the validation split.
X_val = replace_common_values(X_val)
X_val = fix_mayus(X_val)
X_val = fix_dtype(X_val)
X_val = delete_ip_port(X_val)

y_val_class3 = y_val_class3.loc[X_val.index]
y_val_class2 = y_val_class2.loc[X_val.index]
y_val_class1 = y_val_class1.loc[X_val.index]

In [36]:
# Flag the rows that have no missing value in any column (1 = complete, 0 = incomplete).
X_train['Instancia_completa'] = X_train.notnull().all(axis=1).astype(int)
X_val['Instancia_completa'] = X_val.notnull().all(axis=1).astype(int)

completas = X_train['Instancia_completa'].sum()
incompletas = len(X_train) - completas
print(f"✅ Instancias completas: {completas}, incompletas: {incompletas}")
# Per-row weights: 3 for complete rows, 1 for incomplete ones.
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
sample_weight_train = X_train['Instancia_completa'].replace({1: 3, 0: 1})

# Despite its name, this is the list of columns that ARE checked for zero variance:
# every non-object column except the date/time columns and the completeness flag.
columnas_no_comprobar = [col for col in X_train.columns if col not in ['Timestamp', 'Date', 'Instancia_completa'] and X_train[col].dtypes != 'object']
variables_con_varianza_cero = calculo_varianza(X_train[columnas_no_comprobar])
# All column selections are decided on the training split and mirrored on validation.
X_train = X_train.drop(columns=variables_con_varianza_cero)
X_val = X_val.drop(columns=variables_con_varianza_cero)

# Date/time columns and the helper flag are not used as features.
X_train = X_train.drop(columns=['Timestamp', 'Date', 'Instancia_completa'], errors='ignore')
X_val = X_val.drop(columns=['Timestamp', 'Date', 'Instancia_completa'], errors='ignore')

# Drop one column of every pair with correlation above 0.97.
alta_corr_pares = correlacion_pares(X_train, 0.97)
X_train = X_train.drop(columns=alta_corr_pares)
X_val = X_val.drop(columns=alta_corr_pares)

# Drop the columns whose absolute correlation with the binary target is below 0.025.
baja_corr_respecto_obj = correlacion_respecto_objetivo(X_train, y_train_class3, 0.025)
X_train = X_train.drop(columns=baja_corr_respecto_obj)
X_val = X_val.drop(columns=baja_corr_respecto_obj)

# Remember the surviving raw columns so the test split can be reduced to the same set.
caracteritisticas_seleccionadas = X_train.columns.tolist()

# Missing protocol becomes its own explicit category.
X_train['Protocol'] = X_train['Protocol'].fillna("missing")
X_val['Protocol'] = X_val['Protocol'].fillna("missing")

✅ Instancias completas: 416933, incompletas: 157650


In [37]:
 # Identificar columnas categóricas, numéricas y booleanas
# Split the remaining columns by dtype: object columns are treated as categorical.
categorical_cols = X_train.select_dtypes(include=['object']).columns
boolean_cols = X_train.select_dtypes(include=['bool']).columns
if boolean_cols.any():  # If there are boolean columns
    X_train[boolean_cols] = X_train[boolean_cols].astype(float)  # POSSIBLY UNNECESSARY
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

##############################################################################
# Imputation: fitted on the training split only, then applied to validation.

X_train[categorical_cols] = imputador_cat.fit_transform(X_train[categorical_cols])
X_val[categorical_cols] = imputador_cat.transform(X_val[categorical_cols])

X_train[numerical_cols] = imputador_num.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = imputador_num.transform(X_val[numerical_cols])

##############################################################################
# Scaling of the numeric columns (fit on train, transform on validation).

X_train_scaled = normalizacion.fit_transform(X_train[numerical_cols])
X_val_scaled = normalizacion.transform(X_val[numerical_cols])

# Wrap the scaled arrays in DataFrames; columns get a "_scaled" suffix
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_train.index)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_val.index)

##############################################################################
# Encoding of the categorical columns (fit on train, transform on validation).

X_train_encoded = decodificador.fit_transform(X_train[categorical_cols])
X_val_encoded = decodificador.transform(X_val[categorical_cols])

# Names of the new encoded columns
encoded_cols = decodificador.get_feature_names_out(categorical_cols)

# Wrap the encoded arrays in DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_cols, index=X_train.index)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=encoded_cols, index=X_val.index)

##############################################################################

# Combine the scaled numeric features with the encoded categorical features
X_train_processed = pd.concat([X_train_scaled_df, X_train_encoded_df], axis=1)
X_val_processed = pd.concat([X_val_scaled_df, X_val_encoded_df], axis=1)

# Optional: alphabetically ordered copies, rebound to X_train / X_val.
# The *_processed frames keep the unsorted column order.
X_train = X_train_processed.reindex(sorted(X_train_processed.columns), axis=1)
X_val = X_val_processed.reindex(sorted(X_val_processed.columns), axis=1)

In [38]:
# Keep, for each PCA component (95% variance retained), its 20 highest-loading columns.
X_train, X_val = seleccionar_variables_pca(X_train_processed, X_val_processed, n_components=0.95, num_top_features=20)
# Final model input columns; the test split is later reduced to this same list.
caracteritisticas_procesadas = X_train.columns.tolist()

In [39]:
# Sanity check: shapes of both splits and total count of remaining missing values.
print(X_train.shape, X_val.shape)
print(X_train.isnull().sum().sum(), X_val.isnull().sum().sum())

(574583, 34) (123125, 34)
0 0


In [40]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the class2 labels; the encoder is fitted on the training labels only.
label_encoder_class2 = LabelEncoder()
y_train_class2 = label_encoder_class2.fit_transform(y_train_class2)
y_val_class2   = label_encoder_class2.transform(y_val_class2)
y_test_class2  = label_encoder_class2.transform(y_test_class2)

# code -> class2 name
mapping_class2 = dict(enumerate(label_encoder_class2.classes_))

# Same procedure for the class1 labels.
label_encoder_class1 = LabelEncoder()
y_train_class1 = label_encoder_class1.fit_transform(y_train_class1)
y_val_class1   = label_encoder_class1.transform(y_val_class1)
y_test_class1  = label_encoder_class1.transform(y_test_class1)

# code -> class1 name
mapping_class1 = dict(enumerate(label_encoder_class1.classes_))


In [41]:
# Show the code -> name mappings produced by the two label encoders.
print(mapping_class2)
print(mapping_class1)

{0: 'C&C', 1: 'Exfiltration', 2: 'Exploitation', 3: 'Lateral _movement', 4: 'Normal', 5: 'RDOS', 6: 'Reconnaissance', 7: 'Tampering', 8: 'Weaponization', 9: 'crypto-ransomware'}
{0: 'BruteForce', 1: 'C&C', 2: 'Dictionary', 3: 'Discovering_resources', 4: 'Exfiltration', 5: 'Fake_notification', 6: 'False_data_injection', 7: 'Generic_scanning', 8: 'MQTT_cloud_broker_subscription', 9: 'MitM', 10: 'Modbus_register_reading', 11: 'Normal', 12: 'RDOS', 13: 'Reverse_shell', 14: 'Scanning_vulnerability', 15: 'TCP Relay', 16: 'crypto-ransomware', 17: 'fuzzing', 18: 'insider_malcious'}


In [42]:
input_dim = X_train.shape[1]  # Number of input features
num_classes_2 = len(set(y_train_class2))  # Number of distinct class2 labels in the training split
num_classes_1 = len(set(y_train_class1))  # Number of distinct class1 labels in the training split

In [43]:
# Make sure the encoded class2 labels are flat 1-D arrays.
y_train_class2 = y_train_class2.ravel()
y_val_class2 = y_val_class2.ravel()

In [44]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Input
input_layer = keras.Input(shape=(input_dim,))

# Hidden layers: three fully connected layers of 200 ReLU units
x = layers.Dense(200, activation='relu')(input_layer)
x = layers.Dense(200, activation='relu')(x)
x = layers.Dense(200, activation='relu')(x)

# Output
output_class2 = layers.Dense(num_classes_2, activation='softmax', name="output_class2")(x)  # Multiclass: category (class2)

# Single-output model (class2 only)
model_class2 = keras.Model(inputs=input_layer, outputs=output_class2)

# Compile the model; the sparse loss takes the integer-encoded labels directly
model_class2.compile(optimizer=keras.optimizers.RMSprop(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# EarlyStopping callback: stop after 3 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model_class2.fit(X_train,
                    y_train_class2,
                    batch_size=250,
                    epochs=10,
                    validation_data=(X_val, y_val_class2),
                   callbacks=[early_stopping])

# File name (without extension) for the saved model
model_name = f"dnn_class2"

# Create the folder if it does not exist
carpeta_modelos = "modelos_dnn_opcion3"
os.makedirs(carpeta_modelos, exist_ok=True)

# Save the model inside the folder
ruta_modelo = os.path.join(carpeta_modelos, f"{model_name}.h5")
model_class2.save(ruta_modelo)

print(f"Modelo guardado en: {ruta_modelo}")


Epoch 1/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9492 - loss: 0.1957 - val_accuracy: 0.9826 - val_loss: 0.0652
Epoch 2/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9828 - loss: 0.0551 - val_accuracy: 0.9821 - val_loss: 0.0620
Epoch 3/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.9846 - loss: 0.0471 - val_accuracy: 0.9856 - val_loss: 0.0456
Epoch 4/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.9861 - loss: 0.0422 - val_accuracy: 0.9859 - val_loss: 0.0411
Epoch 5/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9868 - loss: 0.0389 - val_accuracy: 0.9878 - val_loss: 0.0374
Epoch 6/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9882 - loss: 0.0358 - val_accuracy: 0.9875 - val_loss: 0.0370
Epoch 7/10



Modelo guardado en: modelos_dnn_opcion3\dnn_class2.h5


In [45]:
# Class2 prediction on the validation split: take the most probable class per row
y_pred_class2 = model_class2.predict(X_val)
y_pred_class2 = np.argmax(y_pred_class2, axis=1)
print(y_pred_class2.shape)

[1m3848/3848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
(123125,)


In [46]:
# Inverse of mapping_class2: class2 name -> integer code.
class_name_to_number = {value: key for key, value in mapping_class2.items()}

In [47]:
# Exploratory check: select the attack rows in two ways and compare the counts.
# 1) rows whose binary class3 label is 1 (attack)
indices_train = np.where(y_train_class3 == 1)[0]
X_train1 = X_train.iloc[indices_train]
y_train_class11 = y_train_class1[indices_train].ravel()
y_train_class21 = y_train_class2[indices_train].ravel()
print(indices_train.shape)
# 2) rows whose class2 label is not 'Normal'
indices_train = np.where(y_train_class2 != class_name_to_number['Normal'])[0]
X_train1 = X_train.iloc[indices_train]
y_train_class11 = y_train_class1[indices_train].ravel()
y_train_class21 = y_train_class2[indices_train].ravel()
print(indices_train.shape)


(279592,)
(279592,)


In [48]:
# Training split: keep the rows whose TRUE class2 label is not 'Normal'.
# NOTE: this cell rebinds X_train / X_val and the label arrays, so it must not be re-run
# without re-running the cells that build them.
indices_train = np.where(y_train_class2 != class_name_to_number['Normal'])[0]
X_train = X_train.iloc[indices_train]
y_train_class1 = y_train_class1[indices_train].ravel()
y_train_class2 = y_train_class2[indices_train].ravel()

# Validation split: keep the rows whose PREDICTED class2 label is not 'Normal'.
indices_val = np.where(y_pred_class2 != class_name_to_number['Normal'])[0]
X_val = X_val.iloc[indices_val]
y_val_class1 = y_val_class1[indices_val].ravel()
y_val_class2 = y_val_class2[indices_val].ravel()

# Keep the predictions positionally aligned with the filtered validation rows.
y_pred_class2 = y_pred_class2[indices_val]

In [49]:
modelos = []  # List that collects the (name, model) pairs

# One class1 model per class2 category present in the (attack-only) training labels.
class_names = np.unique(y_train_class2)
for name in class_names:

    # Training rows of this category, selected by the true class2 label
    indices_train_cat = np.where(y_train_class2 == name)[0]
    X_train_cat = X_train.iloc[indices_train_cat]
    y_train_class1_cat = y_train_class1[indices_train_cat]

    # A category with a single class1 label needs no model
    if np.unique(y_train_class1_cat).size == 1:
        continue  

    # Validation rows of this category, selected by the PREDICTED class2 label
    indices_val_cat = np.where(y_pred_class2 == name)[0]
    X_val_cat = X_val.iloc[indices_val_cat]
    y_val_class1_cat = y_val_class1[indices_val_cat]

    # Input
    input_layer = keras.Input(shape=(input_dim,))

    # Hidden layers
    x = layers.Dense(200, activation='relu')(input_layer)
    x = layers.Dense(200, activation='relu')(x)
    x = layers.Dense(200, activation='relu')(x)

    # Output: sized for all class1 codes, so the global label encoding can be used as is
    output_class1 = layers.Dense(num_classes_1, activation='softmax', name="output_class1")(x)  # Multiclass: class1

    # Single-output model (class1 only)
    model = keras.Model(inputs=input_layer, outputs=output_class1)

    # Compile the model
    model.compile(optimizer=keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # EarlyStopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

    # Train the model
    history = model.fit(X_train_cat,
                        y_train_class1_cat,
                        batch_size=64,
                        epochs=15,
                        validation_data=(X_val_cat, y_val_class1_cat),
                        callbacks=[early_stopping])

    # File name derived from the class2 category name
    nombre = mapping_class2[name]
    model_name = f"{nombre}_class1"

    # Create the folder if it does not exist
    carpeta_modelos = "modelos_dnn_opcion3"
    os.makedirs(carpeta_modelos, exist_ok=True)

    # Save the model inside the folder
    ruta_modelo = os.path.join(carpeta_modelos, f"{model_name}.h5")
    model.save(ruta_modelo)

    print(f"Modelo guardado en: {ruta_modelo}")


    # Add the model to the list
    modelos.append((model_name, model))  # Stores the name and the model

    print("*" * 50)

Epoch 1/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.6976 - loss: 1.2355 - val_accuracy: 0.9580 - val_loss: 0.3777
Epoch 2/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9694 - loss: 0.1286 - val_accuracy: 0.9720 - val_loss: 0.4154
Epoch 3/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9944 - loss: 0.0364 - val_accuracy: 0.9720 - val_loss: 0.4717
Epoch 4/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9983 - loss: 0.0162 - val_accuracy: 0.9720 - val_loss: 0.5186
Epoch 5/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0056 - val_accuracy: 0.9790 - val_loss: 0.5204
Epoch 6/15
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 1.0000 - loss: 0.0021 - val_accuracy: 0.9790 - val_loss: 0.5458
Epoch 7/15
[1m13/13[0m [32m━━━━━━━



Modelo guardado en: modelos_dnn_opcion3\Exploitation_class1.h5
**************************************************
Epoch 1/15
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9508 - loss: 0.3268 - val_accuracy: 0.9889 - val_loss: 0.1869
Epoch 2/15
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9966 - loss: 0.0714 - val_accuracy: 0.9884 - val_loss: 0.1927
Epoch 3/15
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9982 - loss: 0.0250 - val_accuracy: 0.9893 - val_loss: 0.1810
Epoch 4/15
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9986 - loss: 0.0222 - val_accuracy: 0.9891 - val_loss: 0.1964
Epoch 5/15
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9988 - loss: 0.0264 - val_accuracy: 0.9889 - val_loss: 0.2000
Epoch 6/15
[1m345/345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m



Modelo guardado en: modelos_dnn_opcion3\Lateral _movement_class1.h5
**************************************************
Epoch 1/15
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9568 - loss: 0.1386 - val_accuracy: 0.9876 - val_loss: 0.2998
Epoch 2/15
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9993 - loss: 0.0054 - val_accuracy: 0.9875 - val_loss: 0.3241
Epoch 3/15
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9998 - loss: 0.0065 - val_accuracy: 0.9876 - val_loss: 0.3084
Epoch 4/15
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9996 - loss: 0.0027 - val_accuracy: 0.9876 - val_loss: 0.3228
Epoch 5/15
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9999 - loss: 4.4699e-04 - val_accuracy: 0.9876 - val_loss: 0.3234
Epoch 6/15
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0



Modelo guardado en: modelos_dnn_opcion3\Reconnaissance_class1.h5
**************************************************
Epoch 1/15
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9151 - loss: 0.5013 - val_accuracy: 0.9712 - val_loss: 14.5006
Epoch 2/15
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0017 - val_accuracy: 0.9738 - val_loss: 14.9617
Epoch 3/15
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 1.7814e-04 - val_accuracy: 0.9738 - val_loss: 16.8154
Epoch 4/15
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 7.4156e-05 - val_accuracy: 0.9738 - val_loss: 17.1003
Epoch 5/15
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 3.5590e-05 - val_accuracy: 0.9738 - val_loss: 17.2975
Epoch 6/15
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1



Modelo guardado en: modelos_dnn_opcion3\Tampering_class1.h5
**************************************************
Epoch 1/15
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9796 - loss: 0.0808 - val_accuracy: 0.9999 - val_loss: 0.0025
Epoch 2/15
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 7.2807e-06 - val_accuracy: 0.9999 - val_loss: 0.0027
Epoch 3/15
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 1.0000 - loss: 5.1221e-06 - val_accuracy: 0.9999 - val_loss: 0.0027
Epoch 4/15
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 2.6241e-06 - val_accuracy: 0.9999 - val_loss: 0.0028
Epoch 5/15
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 2.0596e-06 - val_accuracy: 0.9999 - val_loss: 0.0028
Epoch 6/15
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[



Modelo guardado en: modelos_dnn_opcion3\Weaponization_class1.h5
**************************************************


In [50]:
# Clean the test features with the same steps used for train and validation.
X_test = replace_common_values(X_test)
X_test = fix_mayus(X_test)
X_test = fix_dtype(X_test)
X_test = delete_ip_port(X_test)

# Label-based re-selection, written with .loc like the train/validation cell.
y_test_class3 = y_test_class3.loc[X_test.index]

# Keep only the raw columns selected on the training split. The explicit copy
# makes the column assignment below act on an independent frame, which avoids
# pandas' SettingWithCopyWarning.
X_test = X_test[caracteritisticas_seleccionadas].copy()

# Missing protocol becomes its own explicit category, as in train/validation.
X_test['Protocol'] = X_test['Protocol'].fillna("missing")

In [51]:
# Identify the categorical, numeric and boolean columns
categorical_cols = X_test.select_dtypes(include=['object']).columns
boolean_cols = X_test.select_dtypes(include=['bool']).columns
if boolean_cols.any():  # If there are boolean columns
    X_test[boolean_cols] = X_test[boolean_cols].astype(float) # POSSIBLY UNNECESSARY
numerical_cols = X_test.select_dtypes(include=['float64', 'int64']).columns

##############################################################################
# Only transform() is called below: the imputers, scaler and encoder were fitted earlier.

X_test[categorical_cols] = imputador_cat.transform(X_test[categorical_cols])

X_test[numerical_cols] = imputador_num.transform(X_test[numerical_cols])

##############################################################################

X_test_scaled = normalizacion.transform(X_test[numerical_cols])

# Wrap the scaled array in a DataFrame; columns get a "_scaled" suffix
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_test.index)

##############################################################################

X_test_encoded = decodificador.transform(X_test[categorical_cols])

# Names of the new encoded columns
encoded_cols = decodificador.get_feature_names_out(categorical_cols)

# Wrap the encoded array in a DataFrame
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_cols, index=X_test.index)

##############################################################################

# Combine the scaled numeric features with the encoded categorical features
X_test_processed = pd.concat([X_test_scaled_df, X_test_encoded_df], axis=1)

# Optional: reorder the columns alphabetically
X_test_processed = X_test_processed.reindex(sorted(X_test_processed.columns), axis=1)

# Select, by name, the columns the models were trained on (in training order)
X_test = X_test_processed[caracteritisticas_procesadas]

In [52]:
# Make sure the encoded class2 test labels are a flat 1-D array.
y_test_class2 = y_test_class2.ravel()

In [53]:
# Class2 prediction on the test split: take the most probable class per row
y_pred_class2 = model_class2.predict(X_test)
y_pred_class2 = np.argmax(y_pred_class2, axis=1)
# Back from integer codes to class2 names
y_pred_class2_names = label_encoder_class2.inverse_transform(y_pred_class2)
# Persist the predicted names, one row per test sample
predicciones_df = pd.DataFrame({'Predicciones_Class2': y_pred_class2_names})    
ruta_directorio = '../../predicciones'
os.makedirs(ruta_directorio, exist_ok=True)
predicciones_df.to_csv(os.path.join(ruta_directorio, 'arq3.csv'), index=False)
    

[1m3848/3848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 994us/step


In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Class2 metrics on the test split; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test_class2, y_pred_class2)
print(f'📈 Accuracy (Test): {accuracy:.4f}')
precision = precision_score(y_test_class2, y_pred_class2, average='macro')
print(f'📈 Precision (Test): {precision:.4f}')
recall = recall_score(y_test_class2, y_pred_class2, average='macro')
print(f'📈 Recall (Test): {recall:.4f}')
f1 = f1_score(y_test_class2, y_pred_class2, average='macro')
print(f'📈 F1 (Test): {f1:.4f}')

📈 Accuracy (Test): 0.9893
📈 Precision (Test): 0.9807
📈 Recall (Test): 0.9458
📈 F1 (Test): 0.9618


Clase2 Mapping: {0: 'C&C', 1: 'Exfiltration', 2: 'Exploitation', 3: 'Lateral _movement', 4: 'Normal', 5: 'RDOS', 6: 'Reconnaissance', 7: 'Tampering', 8: 'Weaponization', 9: 'crypto-ransomware'}


Clase1 Mapping: {0: 'BruteForce', 1: 'C&C', 2: 'Dictionary', 3: 'Discovering_resources', 4: 'Exfiltration', 5: 'Fake_notification', 6: 'False_data_injection', 7: 'Generic_scanning', 8: 'MQTT_cloud_broker_subscription', 9: 'MitM', 10: 'Modbus_register_reading', 11: 'Normal', 12: 'RDOS', 13: 'Reverse_shell', 14: 'Scanning_vulnerability', 15: 'TCP Relay', 16: 'crypto-ransomware', 17: 'fuzzing', 18: 'insider_malcious'}

In [55]:
# Positions of the test rows that the class2 model did NOT predict as "Normal".
indices_anomalia_test = np.where(y_pred_class2_names != "Normal")[0]
# From here on these three objects hold only the predicted-attack rows, in the same order.
# NOTE: X_test itself is not filtered in this cell, so positions computed on the
# filtered arrays do not index X_test directly.
X_test_processed = X_test_processed.iloc[indices_anomalia_test]
y_pred_class2= y_pred_class2[indices_anomalia_test]
y_test_class1 = y_test_class1[indices_anomalia_test].ravel()    

In [56]:
from tensorflow.keras.models import load_model

# One class1 prediction per test row flagged as an attack; "Normal" until overwritten.
y_pred_class1_total = ["Normal"] * len(indices_anomalia_test)
ruta_directorio = '../../predicciones'
predicciones_df = pd.read_csv(os.path.join(ruta_directorio, 'arq3.csv'))

# class2 categories that have a dedicated class1 model saved on disk.
class_names = ['Exploitation', 'Lateral _movement', 'Reconnaissance', 'Tampering', 'Weaponization']

for name in class_names:

    print(name)

    class_number = class_name_to_number[name]
    # Positions, WITHIN the attack-only arrays, of the rows predicted as this category.
    indices_test_cat = np.where(y_pred_class2 == class_number)[0]
    if indices_test_cat.size == 0:
        # Nothing was predicted as this category; there is nothing to classify.
        continue

    # X_test still holds every test row, while indices_test_cat refers to the
    # attack-only arrays. Translate back to positions in the full test set so
    # the features match the rows whose predictions are being filled in.
    X_test_processed_cat = X_test.iloc[indices_anomalia_test[indices_test_cat]]
    y_test_class1_cat = y_test_class1[indices_test_cat]

    # Load the class1 model trained for this category
    model = load_model(os.path.join("modelos_dnn_opcion3", f"{name}_class1.h5"))

    # Predict and convert the integer codes back to class1 names
    y_pred_class1 = model.predict(X_test_processed_cat)
    y_pred_class1 = np.argmax(y_pred_class1, axis=1)

    y_pred_class1_names = label_encoder_class1.inverse_transform(y_pred_class1)

    # Store the predictions at their positions in the attack-only list
    for idx, pred in zip(indices_test_cat, y_pred_class1_names):
        y_pred_class1_total[idx] = pred

    # accuracy = accuracy_score(y_test_class1_cat, y_pred_class1)
    # print(f'📈 Accuracy: {accuracy:.4f}')
    # precision = precision_score(y_test_class1_cat, y_pred_class1, average='macro', zero_division=0)
    # print(f'📈 Precision: {precision:.4f}')
    # recall = recall_score(y_test_class1_cat, y_pred_class1, average='macro')
    # print(f'📈 Recall: {recall:.4f}')
    # f1 = f1_score(y_test_class1_cat, y_pred_class1, average='macro')
    # print(f'📈 F1: {f1:.4f}')
    # print("*" * 50 + "\n")

# Write the class1 predictions once, after all categories are done. The frame
# read from CSV has a 0..n-1 index, so the test positions are valid labels.
predicciones_df.loc[indices_anomalia_test, 'Predicciones_Class1'] = y_pred_class1_total

# Categories without a dedicated model (and "Normal"): class1 is set to the class2 name.
for etiqueta in ["RDOS", "Exfiltration", "C&C", "crypto-ransomware", "Normal"]:
    filas = predicciones_df['Predicciones_Class2'] == etiqueta
    predicciones_df.loc[filas, 'Predicciones_Class1'] = etiqueta

os.makedirs(ruta_directorio, exist_ok=True)
predicciones_df.to_csv(os.path.join(ruta_directorio, 'arq3.csv'), index=False)



Exploitation
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step




Lateral _movement
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step




Reconnaissance
[1m584/584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Tampering




[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Weaponization




[1m318/318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
