In [119]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the X-IIoTID dataset from the current working directory.
path = "X-IIoTID dataset.csv"
try:
    df = pd.read_csv(path, low_memory=False)
except FileNotFoundError:
    # Every later cell needs `df`; report the problem and stop here instead of
    # letting the notebook continue and fail later with an unrelated NameError.
    print("❌ Error: No se encontró el archivo.")
    raise
print(f"✅ Datos cargados: {df.shape[0]} filas, {df.shape[1]} columnas.")

✅ Datos cargados: 820834 filas, 68 columnas.


In [120]:
# Seed passed to both train_test_split calls so the partitions are reproducible.
random_state = 42

In [121]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820834 entries, 0 to 820833
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Date                         820503 non-null  object
 1   Timestamp                    820537 non-null  object
 2   Scr_IP                       820834 non-null  object
 3   Scr_port                     820834 non-null  object
 4   Des_IP                       820834 non-null  object
 5   Des_port                     820834 non-null  object
 6   Protocol                     820834 non-null  object
 7   Service                      820834 non-null  object
 8   Duration                     820834 non-null  object
 9   Scr_bytes                    820834 non-null  object
 10  Des_bytes                    820834 non-null  object
 11  Conn_state                   820834 non-null  int64 
 12  missed_bytes                 820834 non-null  object
 13  is_syn_only   

In [122]:
from sklearn.model_selection import train_test_split

# Split the data into train / validation / test partitions.
# The three label columns are separated from the features; class3 is mapped to
# 0 (Normal) / 1 (Attack), class2 and class1 are kept as their original labels.
"""Divide los datos en entrenamiento, validación y prueba."""
X = df.drop(columns=['class1', 'class2', 'class3'])
y_class3 = df['class3'].map({'Normal': 0, 'Attack': 1})
y_class2 = df['class2']
y_class1 = df['class1']

# First split: 70% train, 30% held out, stratified on the binary label.
X_train, X_temp, y_train_class3, y_temp_class3, y_train_class2, y_temp_class2, y_train_class1, y_temp_class1 = train_test_split(
    X, y_class3, y_class2, y_class1, test_size=0.3, random_state=random_state, stratify=y_class3, shuffle=True
)

# Second split: the held-out 30% is halved into validation and test (15% each).
X_val, X_test, y_val_class3, y_test_class3, y_val_class2, y_test_class2, y_val_class1, y_test_class1 = train_test_split(
    X_temp, y_temp_class3, y_temp_class2, y_temp_class1, test_size=0.5, random_state=random_state, stratify=y_temp_class3, shuffle=True
)

# Reset indices so features and labels of each partition share a 0..n-1 index
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)  # Optional

y_train_class3 = y_train_class3.reset_index(drop=True)
y_val_class3 = y_val_class3.reset_index(drop=True)
y_test_class3 = y_test_class3.reset_index(drop=True)  # Optional

y_train_class2 = y_train_class2.reset_index(drop=True)
y_val_class2 = y_val_class2.reset_index(drop=True)
y_test_class2 = y_test_class2.reset_index(drop=True)  # Optional

y_train_class1 = y_train_class1.reset_index(drop=True)
y_val_class1 = y_val_class1.reset_index(drop=True)
y_test_class1 = y_test_class1.reset_index(drop=True)  # Optional

In [123]:
import pandas as pd
def fix_dtype(df, umbral_numerico=0.7):
    """Coerce bool, int64 and convertible text columns of ``df`` to float.

    The frame is modified in place and also returned.

    * ``bool`` and ``int64`` columns are cast to float.
    * An object column is treated as boolean text when its non-null values
      contain both ``'true'`` and ``'false'``, or consist only of those tokens
      (so a column that happens to hold just ``'true'`` in one split is
      converted the same way as in the other splits). Tokens map to 1.0 / 0.0;
      any other value becomes NaN.
    * Any other object column is parsed with ``pd.to_numeric``; it is replaced
      by the parsed floats only when the fraction of successfully parsed
      values is strictly greater than ``umbral_numerico``.

    Parameters:
        df: DataFrame to convert.
        umbral_numerico: minimum fraction (exclusive) of parseable values
            required to turn a text column into a numeric one.

    Returns:
        The same DataFrame object, with converted columns.
    """
    tokens_booleanos = {'true': 1, 'false': 0}

    # Capture the column groups up front, before any dtype is changed.
    object_cols = df.select_dtypes(include=['object']).columns
    int_cols = df.select_dtypes(include=['int64']).columns
    bool_cols = df.select_dtypes(include=['bool']).columns

    # Native booleans become 0.0 / 1.0
    for col in bool_cols:
        df[col] = df[col].astype(float)

    for col in object_cols:
        valores_unicos = set(df[col].dropna().unique())

        # Boolean text: both tokens present, or nothing but the tokens.
        es_booleana = bool(valores_unicos) and (
            set(tokens_booleanos) <= valores_unicos
            or valores_unicos <= set(tokens_booleanos)
        )
        if es_booleana:
            df[col] = df[col].map(tokens_booleanos).astype(float)
        else:
            converted = pd.to_numeric(df[col], errors='coerce')
            # Only accept the numeric view when enough values parsed.
            if converted.notna().mean() > umbral_numerico:
                df[col] = converted.astype(float)

    for col in int_cols:
        df[col] = df[col].astype(float)

    return df

def delete_ip_port(df):
    """Return ``df`` without the endpoint and raw traffic-volume columns.

    Besides the source/destination IP and port columns, the byte, packet and
    ratio columns listed below are dropped as well. ``DataFrame.drop`` raises
    ``KeyError`` if any of them is absent.
    """
    columnas_descartadas = (
        'Scr_IP', 'Scr_port', 'Des_IP', 'Des_port',
        'Scr_bytes', 'Des_bytes',
        'Scr_pkts', 'Des_pkts',
        'Scr_ip_bytes', 'Des_ip_bytes',
        'Scr_packts_ratio', 'Des_pkts_ratio',
        'Scr_bytes_ratio', 'Des_bytes_ratio',
    )
    return df.drop(columns=list(columnas_descartadas))

In [124]:
import numpy as np
# Placeholder strings that stand for "missing" and are turned into NaN
common_replacements = {
    '-': np.nan,
    '?': np.nan,
    'nan': np.nan,
}

def replace_common_values(df):
    """Replace the placeholders '-', '?' and 'nan' with NaN in every object column.

    The frame is modified in place and returned.
    """
    columnas_texto = df.select_dtypes(include=['object']).columns
    for nombre in columnas_texto:
        limpia = df[nombre].replace(common_replacements)
        df[nombre] = limpia
    return df

def fix_mayus(df):
    """Lower-case the string values of every object column of ``df``.

    Only ``str`` elements are touched; NaN/None and any non-string value stored
    in an object column are kept as they are. (The ``.str.lower()`` accessor
    would replace non-string values with NaN, and raise on an object column
    that holds no strings at all.)

    The frame is modified in place and returned.
    """
    for col in df.select_dtypes(include=['object']).columns:
        dtype_original = df[col].dtype
        en_minusculas = df[col].map(
            lambda valor: valor.lower() if isinstance(valor, str) else valor
        )
        # map() may infer a narrower dtype (e.g. float for an all-NaN column);
        # cast back so the column keeps the dtype it came in with.
        df[col] = en_minusculas.astype(dtype_original)
    return df


In [125]:
from sklearn.impute import KNNImputer, SimpleImputer
# StandardScaler is imported from its public home, sklearn.preprocessing,
# rather than through sklearn.discriminant_analysis, which only re-exports it
# as a side effect of its own imports.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler

# Candidate imputers, grouped by the kind of column they are meant for.
# NOTE(review): KNNImputer operates on numeric data, so the 'knn' entry under
# 'categorical' would need already-encoded categories; it is not selected in
# the cells visible here.
imputers = {
    'categorical': {
        'most_frequent': SimpleImputer(strategy='most_frequent'),
        'knn': KNNImputer(n_neighbors=5)
    },
    'numeric': {
        'mean': SimpleImputer(strategy='mean'),
        'median': SimpleImputer(strategy='median')
    }
}

# Candidate scalers for the numeric columns
scalers = {
    "robust": RobustScaler(),
    "standard": StandardScaler()
}

# Candidate encoders for the categorical columns. Both tolerate categories
# unseen at fit time: one-hot emits all zeros, ordinal emits -1.
encoders = {
    "one_hot": OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    "ordinal": OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
}

In [126]:
# Pick the concrete preprocessing objects used by the following cells.
imputador_cat= imputers['categorical']['most_frequent']  # SimpleImputer(strategy='most_frequent')
imputador_num = imputers['numeric']['mean']  # SimpleImputer(strategy='mean')
normalizacion = scalers['robust']  # RobustScaler
decodificador = encoders['one_hot']  # despite the name, this is the one-hot *encoder*

In [127]:
import numpy as np
from sklearn.decomposition import PCA
from collections import Counter

def matriz_correlacion(df):
    """Return the pairwise correlation matrix of the float64/int64 columns of ``df``."""
    columnas_numericas = df.select_dtypes(include=['float64', 'int64']).columns
    return df.loc[:, columnas_numericas].corr()

def correlacion_pares(df, umbral):
    """Return the numeric columns that are highly correlated with an earlier column.

    The correlation matrix of ``df`` is computed with ``matriz_correlacion`` and
    only its strict upper triangle is inspected, so each pair is looked at once
    and, of every pair whose correlation exceeds ``umbral``, the column that
    appears later in the matrix is the one reported.

    The comparison uses the signed correlation (no absolute value is taken), so
    strongly negative correlations are not reported.

    Parameters:
        df: DataFrame with the candidate features.
        umbral: correlation value a pair must exceed (strictly) to be reported.

    Returns:
        List of column names, in matrix order, suggested for removal.
    """
    corr = matriz_correlacion(df)

    # Mask the diagonal and lower triangle; masked cells become NaN.
    mascara_superior = np.triu(np.ones(corr.shape), k=1).astype(bool)
    upper_tri = corr.where(mascara_superior)

    # NaN cells compare False, so only real pairs above the threshold count.
    supera_umbral = (upper_tri > umbral).any(axis=0)
    return supera_umbral[supera_umbral].index.tolist()

def correlacion_respecto_objetivo(df, target, umbral):
    """Return the numeric columns whose absolute correlation with ``target`` is below ``umbral``.

    The names come back ordered from the weakest correlation upwards. Columns
    whose correlation is NaN are never reported, since NaN does not compare
    below the threshold.
    """
    columnas_numericas = df.select_dtypes(include=['float64', 'int64']).columns

    # Absolute correlation of each feature with the target, weakest first
    fuerza = df.loc[:, columnas_numericas].corrwith(target).abs().sort_values(ascending=True)

    # Features under the threshold are the candidates for removal
    return [nombre for nombre, valor in fuerza.items() if valor < umbral]

def seleccionar_variables_pca(X_train, X_val, n_components, num_top_features):
    """
    Use PCA loadings to pick influential features while keeping the original data.

    A PCA is fitted on ``X_train`` only. For every principal component the
    ``num_top_features`` features with the largest absolute loading are
    collected; the final selection is the union of those features, ordered by
    how many components they appeared in.

    Parameters:
        - X_train: training DataFrame
        - X_val: validation DataFrame
        - n_components: float/int, number of components or fraction of variance to retain
        - num_top_features: int, number of top-loading features taken per component

    Returns:
        - X_train restricted to the selected columns
        - X_val restricted to the selected columns
    """
    # Fit only: the projected data is never used, just the component loadings.
    modelo_pca = PCA(n_components=n_components, random_state=42).fit(X_train)

    nombres = np.array(X_train.columns)

    # Count in how many components each feature ranks among the top loadings
    conteo = Counter()
    for pesos in modelo_pca.components_:
        orden = np.argsort(np.abs(pesos))
        conteo.update(nombres[orden[-num_top_features:]])

    # Most frequently selected features first
    variables_pca = [nombre for nombre, _ in conteo.most_common()]

    X_train_filtrado = X_train[variables_pca]
    X_val_filtrado = X_val[variables_pca]
    print("Características seleccionadas por PCA:", variables_pca)
    return X_train_filtrado, X_val_filtrado

In [128]:
def calculo_varianza(df):
    """Return the names of the columns of ``df`` whose variance is exactly zero."""
    varianzas = df.var()
    sin_variacion = varianzas == 0
    return varianzas[sin_variacion].index.tolist()


In [129]:
# Clean the training features: placeholders -> NaN, lower-case text,
# numeric coercion, then drop the endpoint / traffic-volume columns.
X_train = (
    X_train
    .pipe(replace_common_values)
    .pipe(fix_mayus)
    .pipe(fix_dtype)
    .pipe(delete_ip_port)
)

# Re-select the training labels by the feature index to keep them aligned.
y_train_class3, y_train_class2, y_train_class1 = (
    etiquetas.loc[X_train.index]
    for etiquetas in (y_train_class3, y_train_class2, y_train_class1)
)

# Same cleaning steps for the validation features.
X_val = (
    X_val
    .pipe(replace_common_values)
    .pipe(fix_mayus)
    .pipe(fix_dtype)
    .pipe(delete_ip_port)
)

# Re-select the validation labels by the feature index to keep them aligned.
y_val_class3, y_val_class2, y_val_class1 = (
    etiquetas.loc[X_val.index]
    for etiquetas in (y_val_class3, y_val_class2, y_val_class1)
)

In [130]:

# Flag rows that have no missing value in any column (1 = complete, 0 = incomplete).
X_train['Instancia_completa'] = X_train.notnull().all(axis=1).astype(int)
X_val['Instancia_completa'] = X_val.notnull().all(axis=1).astype(int)

completas = X_train['Instancia_completa'].sum()
incompletas = len(X_train) - completas
print(f"✅ Instancias completas: {completas}, incompletas: {incompletas}")
# Per-row weights: 3 for complete rows, 1 for incomplete ones.
# NOTE(review): not referenced again in the cells visible here.
sample_weight_train = X_train['Instancia_completa'].replace({1: 3, 0: 1})

# Non-object columns (excluding Timestamp, Date and the flag) that are checked
# for zero variance; the variable name notwithstanding, these ARE the checked columns.
columnas_no_comprobar = [col for col in X_train.columns if col not in ['Timestamp', 'Date', 'Instancia_completa'] and X_train[col].dtypes != 'object']
variables_con_varianza_cero = calculo_varianza(X_train[columnas_no_comprobar])
# Columns found constant on the training set are dropped from both partitions.
X_train = X_train.drop(columns=variables_con_varianza_cero)
X_val = X_val.drop(columns=variables_con_varianza_cero)

# Drop the time columns and the helper flag (ignored if already absent).
X_train = X_train.drop(columns=['Timestamp', 'Date', 'Instancia_completa'], errors='ignore')
X_val = X_val.drop(columns=['Timestamp', 'Date', 'Instancia_completa'], errors='ignore')

# Drop one column of every pair correlated above 0.97 (computed on train only).
alta_corr_pares = correlacion_pares(X_train, 0.97)
X_train = X_train.drop(columns=alta_corr_pares)
X_val = X_val.drop(columns=alta_corr_pares)

# Drop columns whose absolute correlation with the binary label is below 0.025.
baja_corr_respecto_obj = correlacion_respecto_objetivo(X_train, y_train_class3, 0.025)
X_train = X_train.drop(columns=baja_corr_respecto_obj)
X_val = X_val.drop(columns=baja_corr_respecto_obj)

# Remember the surviving columns so the test set can be reduced to the same ones.
caracteritisticas_seleccionadas = X_train.columns.tolist()

# Missing protocol values get their own explicit category.
X_train['Protocol'] = X_train['Protocol'].fillna("missing")
X_val['Protocol'] = X_val['Protocol'].fillna("missing")

✅ Instancias completas: 416933, incompletas: 157650


In [131]:
 # Identificar columnas categóricas, numéricas y booleanas
# Column groups are derived from the training frame's dtypes only.
categorical_cols = X_train.select_dtypes(include=['object']).columns
boolean_cols = X_train.select_dtypes(include=['bool']).columns
# NOTE(review): Index.any() tests the truthiness of the column labels, not the
# number of columns; it is False for an empty Index. Only X_train is converted.
if boolean_cols.any():  # If there are boolean columns
    X_train[boolean_cols] = X_train[boolean_cols].astype(float)  # possibly unnecessary
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

##############################################################################
    
# Imputation: statistics are fitted on train and reused on validation.
X_train[categorical_cols] = imputador_cat.fit_transform(X_train[categorical_cols])
X_val[categorical_cols] = imputador_cat.transform(X_val[categorical_cols])

X_train[numerical_cols] = imputador_num.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = imputador_num.transform(X_val[numerical_cols])

##############################################################################

# Scaling of the numeric columns, again fitted on train only.
X_train_scaled = normalizacion.fit_transform(X_train[numerical_cols])
X_val_scaled = normalizacion.transform(X_val[numerical_cols])

# Wrap the scaled arrays in DataFrames; columns get a "_scaled" suffix
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_train.index)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_val.index)

##############################################################################

# Encoding of the categorical columns, fitted on train only.
X_train_encoded = decodificador.fit_transform(X_train[categorical_cols])
X_val_encoded = decodificador.transform(X_val[categorical_cols])

# Names of the new encoded columns
encoded_cols = decodificador.get_feature_names_out(categorical_cols)

# Wrap the encoded arrays in DataFrames
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_cols, index=X_train.index)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=encoded_cols, index=X_val.index)

##############################################################################

# Combine the scaled numeric features with the encoded categorical ones
X_train_processed = pd.concat([X_train_scaled_df, X_train_encoded_df], axis=1)
X_val_processed = pd.concat([X_val_scaled_df, X_val_encoded_df], axis=1)

# Optional: reorder the columns alphabetically.
# NOTE(review): the next cell reads X_train_processed / X_val_processed and
# reassigns X_train / X_val, so this sorted order is not what the models see.
X_train = X_train_processed.reindex(sorted(X_train_processed.columns), axis=1)
X_val = X_val_processed.reindex(sorted(X_val_processed.columns), axis=1)

In [132]:
# Reduce both partitions to the features picked from the PCA loadings
# (components explaining 95% of the variance, top 20 loadings per component).
X_train, X_val = seleccionar_variables_pca(X_train_processed, X_val_processed, n_components=0.95, num_top_features=20)
# Remember the final column list so the test set can be reduced to the same features.
caracteritisticas_procesadas = X_train.columns.tolist()

Características seleccionadas por PCA: ['is_syn_only_scaled', 'Avg_ideal_time_scaled', 'Avg_system_time_scaled', 'std_num_cswch/s_scaled', 'paket_rate_scaled', 'Avg_num_cswch/s_scaled', 'total_bytes_scaled', 'Protocol_tcp', 'Is_SYN_ACK_scaled', 'Std_user_time_scaled', 'Avg_nice_time_scaled', 'Std_system_time_scaled', 'read_write_physical.process_scaled', 'Avg_user_time_scaled', 'Avg_kbmemused_scaled', 'Duration_scaled', 'Protocol_udp', 'Service_coap', 'Service_dns', 'Avg_iowait_time_scaled', 'total_packet_scaled', 'Avg_tps_scaled', 'Service_http', 'Avg_ldavg_1_scaled', 'Std_nice_time_scaled', 'Service_websocket', 'OSSEC_alert_scaled', 'Avg_rtps_scaled', 'Service_mqtt', 'is_with_payload_scaled', 'anomaly_alert_scaled', 'File_activity_scaled', 'Login_attempt_scaled', 'Std_wtps_scaled']


In [133]:
# Sanity checks: partition shapes, total remaining NaNs, and the class balance
# of the binary label in train and validation.
print(X_train.shape, X_val.shape)
print(X_train.isnull().sum().sum(), X_val.isnull().sum().sum())
print(y_train_class3.value_counts(normalize=True))
print(y_val_class3.value_counts(normalize=True))

(574583, 34) (123125, 34)
0 0
class3
0    0.5134
1    0.4866
Name: proportion, dtype: float64
class3
0    0.513405
1    0.486595
Name: proportion, dtype: float64


In [134]:
from sklearn.preprocessing import LabelEncoder

# Encode each label level as integers. Every encoder is fitted on the training
# labels and reused for validation and test; the labels become NumPy arrays.
# class3 was already mapped to 0/1 when the data was split.
label_encoder_class3 = LabelEncoder()
y_train_class3 = label_encoder_class3.fit_transform(y_train_class3)
y_val_class3   = label_encoder_class3.transform(y_val_class3)
y_test_class3  = label_encoder_class3.transform(y_test_class3)

# integer code -> original label
mapping_class3 = dict(enumerate(label_encoder_class3.classes_))


label_encoder_class2 = LabelEncoder()
y_train_class2 = label_encoder_class2.fit_transform(y_train_class2)
y_val_class2   = label_encoder_class2.transform(y_val_class2)
y_test_class2  = label_encoder_class2.transform(y_test_class2)

# integer code -> original label
mapping_class2 = dict(enumerate(label_encoder_class2.classes_))

label_encoder_class1 = LabelEncoder()
y_train_class1 = label_encoder_class1.fit_transform(y_train_class1)
y_val_class1   = label_encoder_class1.transform(y_val_class1)
y_test_class1  = label_encoder_class1.transform(y_test_class1)

# integer code -> original label
mapping_class1 = dict(enumerate(label_encoder_class1.classes_))


In [135]:
# Show the integer-code -> label mapping for each label level.
print(mapping_class3)
print(mapping_class2)
print(mapping_class1)

{0: 0, 1: 1}
{0: 'C&C', 1: 'Exfiltration', 2: 'Exploitation', 3: 'Lateral _movement', 4: 'Normal', 5: 'RDOS', 6: 'Reconnaissance', 7: 'Tampering', 8: 'Weaponization', 9: 'crypto-ransomware'}
{0: 'BruteForce', 1: 'C&C', 2: 'Dictionary', 3: 'Discovering_resources', 4: 'Exfiltration', 5: 'Fake_notification', 6: 'False_data_injection', 7: 'Generic_scanning', 8: 'MQTT_cloud_broker_subscription', 9: 'MitM', 10: 'Modbus_register_reading', 11: 'Normal', 12: 'RDOS', 13: 'Reverse_shell', 14: 'Scanning_vulnerability', 15: 'TCP Relay', 16: 'crypto-ransomware', 17: 'fuzzing', 18: 'insider_malcious'}


In [136]:
input_dim = X_train.shape[1]  # Number of input features
num_classes_2 = len(set(y_train_class2))  # Number of distinct class2 labels in the training set
num_classes_1 = len(set(y_train_class1))  # Number of distinct class1 labels in the training set

In [137]:
# Each sample is presented to the GRU as a sequence of length 1.
timesteps = 1
# Convert the DataFrames to NumPy and reshape to (samples, timesteps, features)
X_train = np.array(X_train).reshape((X_train.shape[0], timesteps, X_train.shape[1]))
X_val = np.array(X_val).reshape((X_val.shape[0], timesteps, X_val.shape[1]))

In [138]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Stage 1 of the cascade: binary Normal/Attack classifier.
# Input
input_layer = keras.Input(shape=(timesteps, input_dim))  # sequential input of shape (timesteps, features)

# Hidden layers: three stacked GRUs
x = layers.GRU(200, return_sequences=True)(input_layer)
x = layers.GRU(200, return_sequences=True)(x)
x = layers.GRU(200)(x)  # Last layer returns only the final state

# Output
output_class3 = layers.Dense(1, activation='sigmoid', name="output_class3")(x)  # Binary

# Single-output model. Note the variable is called model_class1 although it
# predicts the class3 (binary) label.
model_class1 = keras.Model(inputs=input_layer, outputs=output_class3)

# Compile the model
model_class1.compile(optimizer=keras.optimizers.RMSprop(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# EarlyStopping callback on validation loss
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# NOTE(review): with epochs=1 the patience of 3 can never trigger.
history = model_class1.fit(X_train,
                    y_train_class3,
                    batch_size=250,
                    epochs=1,
                    validation_data=(X_val, y_val_class3),
                   callbacks=[early_stopping])

# File name (without extension) for the saved model
model_name = f"modelo_class1"

# Create the output folder if it does not exist
carpeta_modelos = "modelos_cascada_gru"
os.makedirs(carpeta_modelos, exist_ok=True)

# Save the model inside the folder
ruta_modelo = os.path.join(carpeta_modelos, f"{model_name}.h5")
model_class1.save(ruta_modelo)

print(f"Modelo guardado en: {ruta_modelo}")



[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 20ms/step - accuracy: 0.9575 - loss: 0.1276 - val_accuracy: 0.9789 - val_loss: 0.0627




Modelo guardado en: modelos_cascada_gru\modelo_class1.h5


In [139]:
# Stage-1 prediction on validation: sigmoid outputs thresholded at 0.5.
# The result keeps the (samples, 1) shape returned by predict.
y_pred_class3 = model_class1.predict(X_val)
y_pred_class3 = (y_pred_class3 >= 0.5).astype(int)
print(y_pred_class3.shape)

[1m3848/3848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step
(123125, 1)


In [140]:
# Keep only the training rows whose TRUE binary label is 1 (Attack) for the
# later stages. X_train and y_train_class2 are overwritten, so re-running this
# cell without re-running the earlier ones filters already-filtered data.
indices_train = np.where(y_train_class3 == 1)[0]
X_train = X_train[indices_train]
y_train_class2 = y_train_class2[indices_train]
y_train_class2 = y_train_class2.ravel()

In [141]:
# Keep only the validation rows that stage 1 PREDICTED as attack (unlike train,
# which is filtered by the true label). np.where on the (samples, 1) prediction
# array returns row and column indices; [0] keeps the row indices.
indices_val = np.where(y_pred_class3 == 1)[0]
X_val = X_val[indices_val]
y_val_class2 = y_val_class2[indices_val].ravel()

print(X_val.shape, y_val_class2.shape)

(58217, 1, 34) (58217,)


In [142]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

# Stage 2 of the cascade: multi-class classifier over the class2 labels,
# trained on the rows kept by the attack filters of the previous cells.
# Input
input_layer = keras.Input(shape=(timesteps, input_dim))  # sequential input of shape (timesteps, features)

# Hidden layers: three stacked GRUs
x = layers.GRU(200, return_sequences=True)(input_layer)
x = layers.GRU(200, return_sequences=True)(x)
x = layers.GRU(200)(x)  # Last layer returns only the final state

# Output: one softmax unit per class2 label
output_class2 = layers.Dense(num_classes_2, activation='softmax', name="output_class2")(x)

# Single-output model
model_class2 = keras.Model(inputs=input_layer, outputs=output_class2)

# Compile the model (labels are integer codes, hence the sparse loss)
model_class2.compile(optimizer=keras.optimizers.RMSprop(),
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

# EarlyStopping callback on validation loss
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model.
# NOTE(review): with epochs=1 the patience of 3 can never trigger.
history = model_class2.fit(X_train,
                           y_train_class2,
                           batch_size=250,
                           epochs=1,
                           validation_data=(X_val, y_val_class2),
                           callbacks=[early_stopping])

# File name (without extension) for the saved model
model_name = "modelo_class2"

# Create the output folder if it does not exist
carpeta_modelos = "modelos_cascada_gru"
os.makedirs(carpeta_modelos, exist_ok=True)

# Save THIS model inside the folder (previously model_class1 was written to
# the modelo_class2.h5 path, so the stage-2 weights were never persisted).
ruta_modelo = os.path.join(carpeta_modelos, f"{model_name}.h5")
model_class2.save(ruta_modelo)

print(f"Modelo guardado en: {ruta_modelo}")



[1m1119/1119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 19ms/step - accuracy: 0.9390 - loss: 0.2109 - val_accuracy: 0.9871 - val_loss: 0.1024




Modelo guardado en: modelos_cascada_gru\modelo_class2.h5


In [143]:
# Stage-2 prediction on the filtered validation rows: the predicted class2
# code is the index of the largest softmax output.
y_pred_class2 = model_class2.predict(X_val)
y_pred_class2 = np.argmax(y_pred_class2, axis=1)
print(y_pred_class2)

[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step
[3 6 8 ... 8 5 5]


In [144]:
# Apply the same row filters used for the features to the class1 labels:
# true attacks for train, predicted attacks for validation.
y_train_class1 = y_train_class1[indices_train].ravel()
y_val_class1 = y_val_class1[indices_val].ravel()

In [145]:
# Stage 3 of the cascade: one class1 classifier per class2 category.
modelos = []  # Collects (model_name, model) for every trained sub-model

class_names = np.unique(y_train_class2)
for name in class_names:

    # Training rows of this category are chosen by their TRUE class2 code.
    indices_train_cat = np.where(y_train_class2 == name)[0]
    X_train_cat = X_train[indices_train_cat]
    y_train_class1_cat = y_train_class1[indices_train_cat]
    
    # A category with a single class1 subtype needs no classifier.
    if np.unique(y_train_class1_cat).size == 1:
        continue  

    # Validation rows of this category are chosen by the PREDICTED class2 code.
    indices_val_cat = np.where(y_pred_class2 == name)[0]
    X_val_cat = X_val[indices_val_cat]
    y_val_class1_cat = y_val_class1[indices_val_cat]
    
    # Input
    input_layer = keras.Input(shape=(timesteps, input_dim))  # sequential input of shape (timesteps, features)

    # Hidden layers: three stacked GRUs
    x = layers.GRU(200, return_sequences=True)(input_layer)
    x = layers.GRU(200, return_sequences=True)(x)
    x = layers.GRU(200)(x)  # Last layer returns only the final state

    # Output: every sub-model has num_classes_1 softmax units, i.e. it predicts
    # the global class1 codes, not codes local to the category.
    output_class1 = layers.Dense(num_classes_1, activation='softmax', name="output_class1")(x)

    # Single-output model
    model= keras.Model(inputs=input_layer, outputs=output_class1)

    # Compile the model
    model.compile(optimizer=keras.optimizers.RMSprop(),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

    # EarlyStopping callback on validation loss
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model
    history = model.fit(X_train_cat,
                        y_train_class1_cat,
                        batch_size=250,
                        epochs=10,
                        validation_data=(X_val_cat, y_val_class1_cat),
                        callbacks=[early_stopping])

    # The file name is built from the class2 label of this category
    nombre = mapping_class2[name]
    model_name = f"modelo_{nombre}"
    
    # Create the output folder if it does not exist
    carpeta_modelos = "modelos_cascada_gru"
    os.makedirs(carpeta_modelos, exist_ok=True)

    # Save the model inside the folder
    ruta_modelo = os.path.join(carpeta_modelos, f"{model_name}.h5")
    model.save(ruta_modelo)

    print(f"Modelo guardado en: {ruta_modelo}")

    
    # Keep the trained model, paired with its name
    modelos.append((model_name, model))
    
    print("*" * 50)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 273ms/step - accuracy: 0.4588 - loss: 2.6648 - val_accuracy: 0.9224 - val_loss: 0.7409
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.8948 - loss: 0.8536 - val_accuracy: 0.9224 - val_loss: 0.4805
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.8909 - loss: 0.4860 - val_accuracy: 0.9224 - val_loss: 0.5520
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.8935 - loss: 0.3371 - val_accuracy: 0.9224 - val_loss: 0.6255
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.9009 - loss: 0.2487 - val_accuracy: 0.9224 - val_loss: 0.6800




Modelo guardado en: modelos_cascada_gru\modelo_Exploitation.h5
**************************************************
Epoch 1/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 27ms/step - accuracy: 0.8722 - loss: 0.7564 - val_accuracy: 0.9722 - val_loss: 0.2509
Epoch 2/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.9949 - loss: 0.0187 - val_accuracy: 0.9724 - val_loss: 0.2807
Epoch 3/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9982 - loss: 0.0080 - val_accuracy: 0.9724 - val_loss: 0.2959
Epoch 4/10
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9990 - loss: 0.0057 - val_accuracy: 0.9722 - val_loss: 0.3034




Modelo guardado en: modelos_cascada_gru\modelo_Lateral _movement.h5
**************************************************
Epoch 1/10
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 31ms/step - accuracy: 0.8264 - loss: 0.5394 - val_accuracy: 0.9524 - val_loss: 0.3638
Epoch 2/10
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.9842 - loss: 0.0542 - val_accuracy: 0.9714 - val_loss: 0.3671
Epoch 3/10
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.9963 - loss: 0.0137 - val_accuracy: 0.9713 - val_loss: 0.4013
Epoch 4/10
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.9979 - loss: 0.0078 - val_accuracy: 0.9714 - val_loss: 0.4243




Modelo guardado en: modelos_cascada_gru\modelo_Reconnaissance.h5
**************************************************
Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 79ms/step - accuracy: 0.8104 - loss: 1.5594 - val_accuracy: 0.9890 - val_loss: 0.0868
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9947 - loss: 0.0388 - val_accuracy: 0.9890 - val_loss: 0.0899
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9954 - loss: 0.0257 - val_accuracy: 0.9890 - val_loss: 0.0973
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9943 - loss: 0.0242 - val_accuracy: 0.9890 - val_loss: 0.1055




Modelo guardado en: modelos_cascada_gru\modelo_Tampering.h5
**************************************************
Epoch 1/10
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 22ms/step - accuracy: 0.9206 - loss: 0.3933 - val_accuracy: 0.9978 - val_loss: 0.0246
Epoch 2/10
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 1.0000 - loss: 3.2830e-04 - val_accuracy: 0.9978 - val_loss: 0.0260
Epoch 3/10
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 1.0000 - loss: 1.1112e-04 - val_accuracy: 0.9978 - val_loss: 0.0270
Epoch 4/10
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 1.0000 - loss: 6.0044e-05 - val_accuracy: 0.9978 - val_loss: 0.0276




Modelo guardado en: modelos_cascada_gru\modelo_Weaponization.h5
**************************************************


In [146]:
predicciones = {}  # model name -> predicted class1 codes on the validation rows

for model_name, model in modelos:
    # The sub-models output class1 probabilities. They are kept in their own
    # variable so that y_pred_class2 (the stage-2 predictions used to route
    # validation rows to the sub-models) is not overwritten by this cell.
    # NOTE(review): every sub-model is applied to all of X_val, not only to the
    # rows routed to its category.
    probabilidades_class1 = model.predict(X_val)
    predicciones[model_name] = np.argmax(probabilidades_class1, axis=1)

# Show the predictions of each sub-model
for name, pred in predicciones.items():
    print(f"Predicciones para {name}:")
    print(pred)

[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step
[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step
Predicciones para modelo_Exploitation:
[13 13 13 ... 13 13 13]
Predicciones para modelo_Lateral _movement:
[10 15 15 ... 15 10 10]
Predicciones para modelo_Reconnaissance:
[17 14 14 ... 14 14  3]
Predicciones para modelo_Tampering:
[ 6  6  6 ...  6 13 14]
Predicciones para modelo_Weaponization:
[2 0 0 ... 0 0 0]


In [147]:
# Apply to the test features the same cleaning steps used for train/validation.
X_test = replace_common_values(X_test)
X_test = fix_mayus(X_test)
X_test = fix_dtype(X_test)
X_test = delete_ip_port(X_test)

# y_test_class3 is a NumPy array at this point (label-encoded earlier), so this
# is positional indexing with the reset 0..n-1 feature index.
y_test_class3 = y_test_class3[X_test.index]

# Keep only the columns that survived feature filtering on the training set.
X_test = X_test[caracteritisticas_seleccionadas] 

# Missing protocol values get the same explicit category used for train/validation.
X_test['Protocol'] = X_test['Protocol'].fillna("missing")

In [148]:
# Identify categorical, boolean and numeric columns from the test frame's dtypes.
# NOTE(review): the groups are recomputed here rather than reused from the
# training cell; they must match what the fitted transformers saw — confirm.
categorical_cols = X_test.select_dtypes(include=['object']).columns
boolean_cols = X_test.select_dtypes(include=['bool']).columns
if boolean_cols.any():  # If there are boolean columns
    X_test[boolean_cols] = X_test[boolean_cols].astype(float) # possibly unnecessary
numerical_cols = X_test.select_dtypes(include=['float64', 'int64']).columns

##############################################################################

# Imputation with the statistics already fitted on the training set.
X_test[categorical_cols] = imputador_cat.transform(X_test[categorical_cols])

X_test[numerical_cols] = imputador_num.transform(X_test[numerical_cols])

##############################################################################

# Scaling with the already fitted scaler.
X_test_scaled = normalizacion.transform(X_test[numerical_cols])

# Wrap the scaled array in a DataFrame; columns get a "_scaled" suffix
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_test.index)

##############################################################################

# Encoding with the already fitted encoder.
X_test_encoded = decodificador.transform(X_test[categorical_cols])

# Names of the new encoded columns
encoded_cols = decodificador.get_feature_names_out(categorical_cols)

# Wrap the encoded array in a DataFrame
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_cols, index=X_test.index)

##############################################################################

# Combine the scaled numeric features with the encoded categorical ones
X_test_processed = pd.concat([X_test_scaled_df, X_test_encoded_df], axis=1)

# Optional: reorder the columns alphabetically
X_test_processed = X_test_processed.reindex(sorted(X_test_processed.columns), axis=1)

# Select the PCA-chosen features in the same order used for training.
X_test = X_test_processed[caracteritisticas_procesadas]

In [149]:
# Convert the DataFrame to NumPy and reshape to (samples, timesteps, features)
X_test = np.array(X_test).reshape((X_test.shape[0], timesteps, X_test.shape[1]))

In [150]:
# Stage-1 prediction on the test set: sigmoid outputs thresholded at 0.5.
y_pred_class3 = model_class1.predict(X_test)
y_pred_class3 = (y_pred_class3 >= 0.5).astype(int)
print(y_pred_class3)

[1m3848/3848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step
[[0]
 [1]
 [0]
 ...
 [1]
 [1]
 [1]]


In [151]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose thresholded class3 prediction matches the label
accuracy = accuracy_score(y_true=y_test_class3, y_pred=y_pred_class3)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9794


In [152]:
# Row positions of the samples whose class3 prediction is 1
indices_test = np.nonzero(y_pred_class3 == 1)[0]
# Keep only those samples, and the matching class2 labels flattened to 1-D.
# NOTE: X_test and y_test_class2 are rebound here, so this cell is not
# re-runnable without re-creating them.
X_test = X_test[indices_test]
y_test_class2 = np.ravel(y_test_class2[indices_test])

In [153]:
# Prediction for the class2 output: take the index of the highest score per sample
y_pred_class2 = np.argmax(model_class2.predict(X_test), axis=1)

[1m1820/1820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step


In [154]:
from sklearn.metrics import accuracy_score
# Fraction of the retained samples whose class2 prediction matches the label
accuracy = accuracy_score(y_true=y_test_class2, y_pred=y_pred_class2)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9872


In [155]:
# Restrict the class1 labels to the rows selected by indices_test and flatten
# them to 1-D. NOTE: this rebinds y_test_class1, so the cell is not re-runnable.
y_test_class1 = np.ravel(y_test_class1[indices_test])

In [156]:
# Invert mapping_class2 to obtain a {name: class number} dictionary
class_name_to_number = dict(zip(mapping_class2.values(), mapping_class2.keys()))

print("Mapeo de nombres a números:", class_name_to_number)


Mapeo de nombres a números: {'C&C': 0, 'Exfiltration': 1, 'Exploitation': 2, 'Lateral _movement': 3, 'Normal': 4, 'RDOS': 5, 'Reconnaissance': 6, 'Tampering': 7, 'Weaponization': 8, 'crypto-ransomware': 9}


In [158]:
from tensorflow.keras.models import load_model

class_names = [
    'Exploitation',
    'Lateral _movement',
    'Reconnaissance',
    'Tampering',
    'Weaponization',
]

# For each category name, evaluate that category's saved model on the samples
# whose class2 prediction equals the category's numeric label.
for name in class_names:
    # Look up the numeric label for this name; skip names absent from the mapping
    if name in class_name_to_number:
        class_number = class_name_to_number[name]
    else:
        print(f"Clase {name} no encontrada en el mapeo, saltando...")
        continue

    # Positions of the samples predicted as this category
    indices_test_cat = np.nonzero(y_pred_class2 == class_number)[0]

    # Nothing to evaluate when no sample was predicted as this category
    if indices_test_cat.size == 0:
        print(f"No hay datos para la clase {name}, saltando...")
        continue

    X_test_processed_cat = X_test[indices_test_cat]
    y_test_class1_cat = y_test_class1[indices_test_cat]

    print(f"Clase {name}, Valores únicos en y_test_class1_cat:", np.unique(y_test_class1_cat, return_counts=True))

    # Load the model stored for this category
    model = load_model(os.path.join("modelos_cascada_gru", f"modelo_{name}.h5"))

    # Predict and keep the index of the highest score per sample
    y_pred_class1 = model.predict(X_test_processed_cat).argmax(axis=1)

    accuracy = accuracy_score(y_test_class1_cat, y_pred_class1)
    print(f"Accuracy para {name}: {accuracy:.4f}")

    print("*" * 50 + "\n")



Clase Exploitation, Valores únicos en y_test_class1_cat: (array([ 5, 11, 13, 15]), array([ 2,  1, 88,  6], dtype=int64))
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 211ms/step
Accuracy para Exploitation: 0.9072
**************************************************

Clase Lateral _movement, Valores únicos en y_test_class1_cat: (array([ 6,  7,  8, 10, 11, 13, 14, 15, 16]), array([   8,    8, 3361,  840,   88,   14,    2,  212,    1], dtype=int64))




[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step




Accuracy para Lateral _movement: 0.9711
**************************************************

Clase Reconnaissance, Valores únicos en y_test_class1_cat: (array([ 0,  1,  2,  3,  4,  7,  8, 10, 11, 12, 13, 14, 15, 17]), array([  31,    5,    9, 1987,    2, 7509,  162,    3,  268,    6,   19,
       7907,   12,   59], dtype=int64))
[1m562/562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step
Accuracy para Reconnaissance: 0.9537
**************************************************

Clase Tampering, Valores únicos en y_test_class1_cat: (array([6, 8]), array([722,  10], dtype=int64))




[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step




Accuracy para Tampering: 0.9863
**************************************************

Clase Weaponization, Valores únicos en y_test_class1_cat: (array([ 0,  2,  5,  8, 11, 14, 18]), array([7121,  364,    1,    1,   10,    3, 2638], dtype=int64))
[1m317/317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Accuracy para Weaponization: 0.9985
**************************************************

