In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Path of the dataset CSV, relative to the notebook's working directory.
path = os.path.join("X-IIoTID dataset.csv")
try:
    df = pd.read_csv(path, low_memory=False)
    print(f"✅ Datos cargados: {df.shape[0]} filas, {df.shape[1]} columnas.")
except FileNotFoundError:
    print("❌ Error: No se encontró el archivo.")
    # Re-raise: every later cell needs `df`, so continuing would only fail
    # further down with a confusing NameError instead of the real cause.
    raise

✅ Datos cargados: 820834 filas, 68 columnas.


In [2]:
# Seed passed to both train_test_split calls so the splits are reproducible.
random_state = 42

In [3]:
# Print column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820834 entries, 0 to 820833
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Date                         820503 non-null  object
 1   Timestamp                    820537 non-null  object
 2   Scr_IP                       820834 non-null  object
 3   Scr_port                     820834 non-null  object
 4   Des_IP                       820834 non-null  object
 5   Des_port                     820834 non-null  object
 6   Protocol                     820834 non-null  object
 7   Service                      820834 non-null  object
 8   Duration                     820834 non-null  object
 9   Scr_bytes                    820834 non-null  object
 10  Des_bytes                    820834 non-null  object
 11  Conn_state                   820834 non-null  int64 
 12  missed_bytes                 820834 non-null  object
 13  is_syn_only   

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into training (70%), validation (15%) and test (15%) sets.
# Both splits are stratified on the binary class3 label.
X = df.drop(columns=['class1', 'class2', 'class3'])
y_class1 = df['class1']
y_class2 = df['class2']
y_class3 = df['class3'].map({'Normal': 0, 'Attack': 1})

# First split: 70% train, 30% held out.
(
    X_train, X_temp,
    y_train_class3, y_temp_class3,
    y_train_class2, y_temp_class2,
    y_train_class1, y_temp_class1,
) = train_test_split(
    X, y_class3, y_class2, y_class1,
    test_size=0.3,
    random_state=random_state,
    stratify=y_class3,
    shuffle=True,
)

# Second split: the held-out 30% is halved into validation and test.
(
    X_val, X_test,
    y_val_class3, y_test_class3,
    y_val_class2, y_test_class2,
    y_val_class1, y_test_class1,
) = train_test_split(
    X_temp, y_temp_class3, y_temp_class2, y_temp_class1,
    test_size=0.5,
    random_state=random_state,
    stratify=y_temp_class3,
    shuffle=True,
)

# Give every subset a fresh 0..n-1 index so features and labels stay
# positionally aligned.
X_train, X_val, X_test = (
    part.reset_index(drop=True) for part in (X_train, X_val, X_test)
)
y_train_class3, y_val_class3, y_test_class3 = (
    part.reset_index(drop=True)
    for part in (y_train_class3, y_val_class3, y_test_class3)
)
y_train_class2, y_val_class2, y_test_class2 = (
    part.reset_index(drop=True)
    for part in (y_train_class2, y_val_class2, y_test_class2)
)
y_train_class1, y_val_class1, y_test_class1 = (
    part.reset_index(drop=True)
    for part in (y_train_class1, y_val_class1, y_test_class1)
)

In [5]:
import pandas as pd
def fix_dtype(df, umbral_numerico=0.7):
    """Normalise column dtypes to float where the content allows it.

    The frame is modified in place and also returned.

    Rules, applied per column:
      * ``bool`` columns become float (True -> 1.0, False -> 0.0).
      * ``object`` columns whose non-null values are the tokens ``"true"`` /
        ``"false"`` become float (1.0 / 0.0). A column holding only one of
        the two tokens is converted as well, so the resulting dtype does not
        depend on which values happen to be present in a given data split.
      * Any other ``object`` column is converted with ``pd.to_numeric`` when
        more than ``umbral_numerico`` of its rows parse as numbers;
        otherwise it is left untouched.
      * ``int64`` columns become float.

    Parameters:
        df: DataFrame to fix (mutated).
        umbral_numerico: minimum fraction of rows (0-1) that must be numeric
            for an object column to be converted.

    Returns:
        The same DataFrame object, with dtypes adjusted.
    """
    # Capture the column groups up front, before any dtype changes.
    object_cols = df.select_dtypes(include=['object']).columns
    int_cols = df.select_dtypes(include=['int64']).columns
    bool_cols = df.select_dtypes(include=['bool']).columns

    # Convert booleans to float.
    if len(bool_cols) > 0:
        df[bool_cols] = df[bool_cols].astype(float)

    bool_tokens = {"true", "false"}
    bool_map = {'true': 1, 'false': 0}

    for col in object_cols:
        valores_unicos = set(df[col].dropna().unique())

        # Boolean-like when the values are only true/false tokens (either or
        # both). The superset case is kept from the original behaviour: when
        # both tokens are present, any other value is mapped to NaN.
        es_booleana = bool(valores_unicos) and (
            valores_unicos <= bool_tokens or bool_tokens <= valores_unicos
        )

        if es_booleana:
            df[col] = df[col].map(bool_map).astype(float)
        else:
            converted = pd.to_numeric(df[col], errors='coerce')
            # NOTE(review): the ratio is taken over all rows, so rows that
            # were already missing count against the conversion.
            if converted.notna().mean() > umbral_numerico:
                df[col] = converted.astype(float)

    for col in int_cols:
        df[col] = df[col].astype(float)

    return df

def delete_ip_port(df):
    """Return ``df`` without the address, port and traffic-volume columns.

    Despite the name, the byte, packet and ratio counters are dropped along
    with the source/destination IP and port columns. The input frame is not
    modified. ``DataFrame.drop`` raises ``KeyError`` if any listed column is
    absent.
    """
    columnas_descartadas = (
        # endpoints
        'Scr_IP', 'Scr_port', 'Des_IP', 'Des_port',
        # byte / packet counters
        'Scr_bytes', 'Des_bytes', 'Scr_pkts', 'Des_pkts',
        'Scr_ip_bytes', 'Des_ip_bytes',
        # ratios
        'Scr_packts_ratio', 'Des_pkts_ratio',
        'Scr_bytes_ratio', 'Des_bytes_ratio',
    )
    return df.drop(columns=list(columnas_descartadas))

In [6]:
import numpy as np
# Placeholder tokens that stand for "no value"; each one maps to NaN.
common_replacements = dict.fromkeys(('-', '?', 'nan'), np.nan)


def replace_common_values(df):
    """Replace the placeholder tokens '-', '?' and 'nan' with NaN.

    Only object-dtype columns are touched. The frame is modified in place
    and also returned.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        cleaned = df[name].replace(common_replacements)
        df[name] = cleaned
    return df

def fix_mayus(df):
    """Lower-case the string values of every object-dtype column.

    The frame is modified in place and also returned. Only ``str`` elements
    are changed; any other value stored in an object column (numbers, None,
    NaN, ...) is kept as is. The ``.str.lower()`` accessor used previously
    replaced such non-string values with NaN, and raised when a column held
    no strings at all.
    """
    for col in df.select_dtypes(include=['object']).columns:
        lowered = df[col].map(lambda v: v.lower() if isinstance(v, str) else v)
        # Keep object dtype, as `.str.lower()` did, so later dtype-based
        # steps see the same column kinds.
        df[col] = lowered.astype(object)
    return df


In [7]:
from sklearn.impute import KNNImputer, SimpleImputer
# StandardScaler is part of sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only worked because that module happens to
# import it internally.
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    RobustScaler,
    StandardScaler,
)

# Available imputers, grouped by the kind of column they are meant for.
imputers = {
    'categorical': {
        'most_frequent': SimpleImputer(strategy='most_frequent'),
        # NOTE(review): KNNImputer works on numeric arrays; using this entry
        # on raw string categories would need a prior encoding step. Confirm
        # before selecting it.
        'knn': KNNImputer(n_neighbors=5),
        'constant': SimpleImputer(strategy='constant', fill_value='missing')
    },
    'numeric': {
        'mean': SimpleImputer(strategy='mean'),
        'median': SimpleImputer(strategy='median')
    }
}

# Available scalers.
scalers = {
    "robust": RobustScaler(),
    "standard": StandardScaler()
}

# Available categorical encoders. Unknown categories are encoded as all
# zeros (one-hot) or as -1 (ordinal).
encoders = {
    "one_hot": OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    "ordinal": OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
}

In [8]:
# Components used by this run, picked from the imputers/scalers/encoders
# dictionaries.
imputador_cat= imputers['categorical']['constant']  # fills missing categories with 'missing'
imputador_num = imputers['numeric']['mean']  # fills missing numbers with the column mean
normalizacion = scalers['robust']
decodificador = encoders['one_hot']  # despite the name, this is the encoder

In [9]:
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier


def matriz_correlacion(df):
    """Return the pairwise correlation matrix of the float64/int64 columns of ``df``."""
    solo_numericas = df.select_dtypes(include=['float64', 'int64'])
    return df[solo_numericas.columns].corr()

def correlacion_pares(df, umbral, absoluta=False):
    """List the columns to drop because they are highly correlated with an earlier column.

    Only the strict upper triangle of the correlation matrix is inspected, so
    for every correlated pair the column that appears later in the frame is
    the one reported.

    Parameters:
        df: DataFrame with the features.
        umbral: correlation threshold; a column is reported when its
            correlation with any earlier column is greater than this value.
        absoluta: when True, compare the absolute correlation so strongly
            negative pairs are reported too. Defaults to False, which keeps
            the signed comparison.

    Returns:
        List of column names to drop.
    """
    corr = matriz_correlacion(df)
    if absoluta:
        corr = corr.abs()

    # Keep only the part above the diagonal to avoid duplicate pairs and
    # self-correlation.
    mascara_superior = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper_tri = corr.where(mascara_superior)

    # From every correlated pair, the column-side member is dropped.
    alta_corr_pares = [
        col for col in upper_tri.columns if (upper_tri[col] > umbral).any()
    ]

    return alta_corr_pares

def correlacion_respecto_objetivo(df, target, umbral):
    """List the numeric columns whose absolute correlation with ``target`` is below ``umbral``.

    Only float64/int64 columns are considered. The result is ordered from
    the weakest to the strongest correlation. Columns whose correlation is
    NaN are never listed.
    """
    columnas_numericas = df.select_dtypes(include=['float64', 'int64']).columns

    # Absolute correlation of each feature with the target, weakest first.
    corr_abs = df[columnas_numericas].corrwith(target).abs()
    corr_abs = corr_abs.sort_values(ascending=True)

    # Features below the threshold are the candidates for removal.
    return [nombre for nombre, valor in corr_abs.items() if valor < umbral]

def seleccionar_variables_pca(X_train, X_val, n_components, num_top_features):
    """Use PCA loadings to pick influential original features.

    PCA is fitted on ``X_train`` only to inspect its components; the data is
    not projected. For every component the ``num_top_features`` features with
    the largest absolute loading are counted, and every feature counted at
    least once is kept, ordered by how often it appeared.

    Parameters:
        X_train: training DataFrame.
        X_val: validation DataFrame.
        n_components: int or float passed to PCA (number of components or
            fraction of variance to retain).
        num_top_features: number of top-loading features taken per component.

    Returns:
        (X_train_filtrado, X_val_filtrado): both frames restricted to the
        selected original columns.
    """
    # Fit only; the transformed data is never used.
    modelo_pca = PCA(n_components=n_components, random_state=42)
    modelo_pca.fit(X_train)

    nombres = np.array(X_train.columns)
    apariciones = Counter()

    for cargas in modelo_pca.components_:
        # argsort is ascending, so the tail holds the largest |loading|.
        orden = np.argsort(np.abs(cargas))
        apariciones.update(nombres[orden[-num_top_features:]])

    # Most frequently selected features first.
    variables_pca = [nombre for nombre, _veces in apariciones.most_common()]

    return X_train[variables_pca], X_val[variables_pca]

def seleccionar_variables_rfe(X_train, X_val, y_train, num_features):
    """Select ``num_features`` columns with recursive feature elimination.

    A decision tree is used as the ranking estimator and three features are
    eliminated per iteration.

    Parameters:
        X_train: training DataFrame the selector is fitted on.
        X_val: validation DataFrame, filtered to the same columns.
        y_train: training labels.
        num_features: number of features to keep.

    Returns:
        (X_train_filtrado, X_val_filtrado): both frames restricted to the
        selected columns.
    """
    estimador_base = DecisionTreeClassifier(random_state=42)

    rfe = RFE(estimator=estimador_base, n_features_to_select=num_features, step=3)
    # Only the support mask is needed, so fit without building the
    # transformed arrays that were previously computed and discarded.
    rfe.fit(X_train, y_train)

    selected_features = X_train.columns[rfe.support_]

    X_train_filtrado = X_train[selected_features]
    X_val_filtrado = X_val[selected_features]

    return X_train_filtrado, X_val_filtrado


def seleccionar_variables_randomForest(X_train, X_val, y_train, sample_weight_train, num_features):
    """Keep the ``num_features`` columns ranked highest by a random forest.

    A 100-tree forest is fitted on the training data with the given sample
    weights, and its ``feature_importances_`` are used for the ranking.

    Returns:
        (X_train_processed, X_val_processed): both frames restricted to the
        top-ranked columns, most important first.
    """
    bosque = RandomForestClassifier(n_estimators=100, random_state=42)
    bosque.fit(X_train, y_train, sample_weight=sample_weight_train)

    # Rank features by importance, highest first.
    ranking = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': bosque.feature_importances_
    })
    ranking = ranking.sort_values(by='Importance', ascending=False)

    caracteristicas_imp_rf = ranking.head(num_features).Feature.to_list()

    return X_train[caracteristicas_imp_rf], X_val[caracteristicas_imp_rf]


def proyectar_tsne(X_train, X_val, n_components, perplexity, max_iter, sample_size, random_state):
    """Project the data to a lower-dimensional space with t-SNE.

    Training and validation rows are embedded together (t-SNE has no
    separate transform step), optionally on a random sample, and then split
    back into their original sets.

    Parameters:
        X_train: training DataFrame.
        X_val: validation DataFrame.
        n_components: int, number of embedding dimensions.
        perplexity: float, t-SNE perplexity.
        max_iter: int, number of optimisation iterations.
        sample_size: int, maximum number of rows embedded; when the combined
            data is larger, this many rows are sampled.
        random_state: int, seed for the sampling and for t-SNE.

    Returns:
        (X_train_tsne, X_val_tsne): embeddings of the sampled training and
        validation rows, indexed by the original row labels, with columns
        ``tsne_1`` ... ``tsne_n``.
    """
    # Tag every row with its origin. The two frames may share index labels
    # (e.g. both have a 0..n-1 index), so splitting back by label alone would
    # mix training and validation rows.
    X_all = pd.concat([X_train, X_val], keys=['train', 'val'])
    if X_all.shape[0] > sample_size:
        X_all_sample = X_all.sample(n=sample_size, random_state=random_state)
    else:
        X_all_sample = X_all.copy()

    # Fit and embed in one step.
    tsne = TSNE(n_components=n_components, perplexity=perplexity, max_iter=max_iter, random_state=random_state)
    X_all_tsne = tsne.fit_transform(X_all_sample)
    df_tsne = pd.DataFrame(
        X_all_tsne,
        index=X_all_sample.index,
        columns=[f'tsne_{i+1}' for i in range(n_components)],
    )

    # Split by origin tag and restore the original row labels.
    origen = df_tsne.index.get_level_values(0)
    X_train_tsne = df_tsne[origen == 'train'].droplevel(0)
    X_val_tsne = df_tsne[origen == 'val'].droplevel(0)

    return X_train_tsne, X_val_tsne

In [10]:
def calculo_varianza(df):
    """Return the names of the columns of ``df`` whose variance is exactly zero."""
    varianzas = df.var()
    # Boolean selection keeps the original column order; NaN variances
    # compare unequal to zero and are therefore not reported.
    constantes = varianzas[varianzas == 0]
    return list(constantes.index)


In [11]:
# Clean the training features: placeholders -> NaN, lower-case strings,
# numeric dtypes, then drop the endpoint and traffic-volume columns.
X_train = (
    X_train
    .pipe(replace_common_values)
    .pipe(fix_mayus)
    .pipe(fix_dtype)
    .pipe(delete_ip_port)
)

# Re-select the labels by the feature index (none of the cleaning steps
# removes rows).
y_train_class3, y_train_class2, y_train_class1 = (
    etiquetas.loc[X_train.index]
    for etiquetas in (y_train_class3, y_train_class2, y_train_class1)
)

# Same cleaning for the validation features.
X_val = (
    X_val
    .pipe(replace_common_values)
    .pipe(fix_mayus)
    .pipe(fix_dtype)
    .pipe(delete_ip_port)
)

y_val_class3, y_val_class2, y_val_class1 = (
    etiquetas.loc[X_val.index]
    for etiquetas in (y_val_class3, y_val_class2, y_val_class1)
)

In [12]:
# Flag rows that have no missing value in any column (1 = complete).
X_train['Instancia_completa'] = X_train.notnull().all(axis=1).astype(int)
X_val['Instancia_completa'] = X_val.notnull().all(axis=1).astype(int)

completas = X_train['Instancia_completa'].sum()
incompletas = len(X_train) - completas
print(f"✅ Instancias completas: {completas}, incompletas: {incompletas}")
# Per-row training weights: complete rows weigh 3, incomplete rows weigh 1.
# NOTE(review): no cell visible in this notebook passes these weights to a
# model; confirm whether they are still needed.
sample_weight_train = X_train['Instancia_completa'].replace({1: 3, 0: 1})

# Despite its name, this list holds the columns that ARE checked for zero
# variance: every non-object column except the date/time and helper columns.
columnas_no_comprobar = [col for col in X_train.columns if col not in ['Timestamp', 'Date', 'Instancia_completa'] and X_train[col].dtypes != 'object']
variables_con_varianza_cero = calculo_varianza(X_train[columnas_no_comprobar])
# The columns to drop are decided on the training set and applied to both sets.
X_train = X_train.drop(columns=variables_con_varianza_cero)
X_val = X_val.drop(columns=variables_con_varianza_cero)

# Drop the date/time columns and the helper flag (ignored if already absent).
X_train = X_train.drop(columns=['Timestamp', 'Date', 'Instancia_completa'], errors='ignore')
X_val = X_val.drop(columns=['Timestamp', 'Date', 'Instancia_completa'], errors='ignore')

# Drop one column out of every pair with correlation above 0.97.
alta_corr_pares = correlacion_pares(X_train, 0.97)
X_train = X_train.drop(columns=alta_corr_pares)
X_val = X_val.drop(columns=alta_corr_pares)

# Drop numeric columns whose absolute correlation with the binary class3
# label is below 0.025.
baja_corr_respecto_obj = correlacion_respecto_objetivo(X_train, y_train_class3, 0.025)
X_train = X_train.drop(columns=baja_corr_respecto_obj)
X_val = X_val.drop(columns=baja_corr_respecto_obj)

# Remember the surviving raw columns so the test set can be filtered the same way.
caracteritisticas_seleccionadas = X_train.columns.tolist()

# Missing protocol values get an explicit 'missing' category.
X_train['Protocol'] = X_train['Protocol'].fillna("missing")
X_val['Protocol'] = X_val['Protocol'].fillna("missing")

✅ Instancias completas: 416933, incompletas: 157650


In [13]:
 # Identify categorical, boolean and numeric columns from the training frame.
categorical_cols = X_train.select_dtypes(include=['object']).columns
boolean_cols = X_train.select_dtypes(include=['bool']).columns
# Test for "any bool column present" by length: Index.any() would look at the
# truthiness of the column labels instead. Both sets are converted so their
# dtypes stay consistent.
if len(boolean_cols) > 0:
    X_train[boolean_cols] = X_train[boolean_cols].astype(float)
    X_val[boolean_cols] = X_val[boolean_cols].astype(float)
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

##############################################################################
# Imputation: fitted on train, applied to validation.

X_train[categorical_cols] = imputador_cat.fit_transform(X_train[categorical_cols])
X_val[categorical_cols] = imputador_cat.transform(X_val[categorical_cols])

X_train[numerical_cols] = imputador_num.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = imputador_num.transform(X_val[numerical_cols])

##############################################################################
# Scaling of the numeric columns: fitted on train, applied to validation.

X_train_scaled = normalizacion.fit_transform(X_train[numerical_cols])
X_val_scaled = normalizacion.transform(X_val[numerical_cols])

# Wrap the scaled arrays in DataFrames; columns get a "_scaled" suffix.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_train.index)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_val.index)

##############################################################################
# Encoding of the categorical columns: fitted on train, applied to validation.

X_train_encoded = decodificador.fit_transform(X_train[categorical_cols])
X_val_encoded = decodificador.transform(X_val[categorical_cols])

# Names of the new encoded columns.
encoded_cols = decodificador.get_feature_names_out(categorical_cols)

# Wrap the encoded arrays in DataFrames.
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_cols, index=X_train.index)
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=encoded_cols, index=X_val.index)

##############################################################################

# Combine the scaled numeric features with the encoded categorical features.
X_train_processed = pd.concat([X_train_scaled_df, X_train_encoded_df], axis=1)
X_val_processed = pd.concat([X_val_scaled_df, X_val_encoded_df], axis=1)

# Optional: put the columns in alphabetical order.
X_train = X_train_processed.reindex(sorted(X_train_processed.columns), axis=1)
X_val = X_val_processed.reindex(sorted(X_val_processed.columns), axis=1)

In [14]:
# PCA-based feature selection. The input is the unsorted *_processed frames,
# so X_train / X_val are replaced by the selected columns here.
X_train, X_val = seleccionar_variables_pca(X_train_processed, X_val_processed, n_components=0.95, num_top_features=20)
# Remember the final model input columns (and their order) for the test set.
caracteritisticas_procesadas = X_train.columns.tolist()

In [15]:
# Sanity check: final shapes and total number of remaining missing values.
print(X_train.shape, X_val.shape)
print(X_train.isnull().sum().sum(), X_val.isnull().sum().sum())

(574583, 34) (123125, 34)
0 0


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode the class1 labels as integers. The encoder is fitted on the training
# labels only and then applied to validation and test.
# NOTE(review): the splits are stratified on class3, not class1, so a rare
# class1 label could be missing from the training split; presumably
# transform() would then fail on it. Confirm.
label_encoder_class1 = LabelEncoder()
y_train_class1 = label_encoder_class1.fit_transform(y_train_class1)
y_val_class1   = label_encoder_class1.transform(y_val_class1)
y_test_class1  = label_encoder_class1.transform(y_test_class1)

# Integer code -> original class name.
mapping_class1 = dict(enumerate(label_encoder_class1.classes_))


In [17]:
# Show the integer-code -> class-name mapping.
print(mapping_class1)

{0: 'BruteForce', 1: 'C&C', 2: 'Dictionary', 3: 'Discovering_resources', 4: 'Exfiltration', 5: 'Fake_notification', 6: 'False_data_injection', 7: 'Generic_scanning', 8: 'MQTT_cloud_broker_subscription', 9: 'MitM', 10: 'Modbus_register_reading', 11: 'Normal', 12: 'RDOS', 13: 'Reverse_shell', 14: 'Scanning_vulnerability', 15: 'TCP Relay', 16: 'crypto-ransomware', 17: 'fuzzing', 18: 'insider_malcious'}


In [18]:
# Number of input features.
input_dim = X_train.shape[1]
# Number of class1 categories, read from the fitted encoder rather than by
# building a Python set over every training label. The encoder was fitted on
# the training labels, so the count is the same.
num_classes_1 = len(label_encoder_class1.classes_)

In [19]:
# Flatten the encoded labels to 1-D arrays.
y_train_class1 = y_train_class1.ravel()
y_val_class1 = y_val_class1.ravel()

In [20]:
from tensorflow import keras
from tensorflow.keras import layers

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable across runs.
keras.utils.set_random_seed(random_state)

# Input
input_layer = keras.Input(shape=(input_dim,))

# Hidden layers: three fully connected layers of 200 ReLU units.
x = layers.Dense(200, activation='relu')(input_layer)
x = layers.Dense(200, activation='relu')(x)
x = layers.Dense(200, activation='relu')(x)

# Output: one softmax unit per class1 category (multiclass).
output_class1 = layers.Dense(num_classes_1, activation='softmax', name="output_class1")(x)

# Single-output model.
model = keras.Model(inputs=input_layer, outputs=[output_class1])

# Compile: integer labels, hence the sparse categorical cross-entropy.
model.compile(optimizer=keras.optimizers.RMSprop(),
              loss=['sparse_categorical_crossentropy'],
              metrics=['accuracy'])

# Train
history = model.fit(X_train,
                    y_train_class1,
                    batch_size=250,
                    epochs=10,
                    validation_data=(X_val, y_val_class1))


# File name (without extension) under which the model is saved.
model_name = "dnn_class1"

# Create the output folder if it does not exist.
carpeta_modelos = "modelos_dnn_opcion2"
os.makedirs(carpeta_modelos, exist_ok=True)

# Save the model inside the folder.
ruta_modelo = os.path.join(carpeta_modelos, f"{model_name}.h5")
model.save(ruta_modelo)

print(f"Modelo guardado en: {ruta_modelo}")


Epoch 1/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.9426 - loss: 0.2364 - val_accuracy: 0.9831 - val_loss: 0.0550
Epoch 2/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9826 - loss: 0.0559 - val_accuracy: 0.9848 - val_loss: 0.0479
Epoch 3/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9849 - loss: 0.0460 - val_accuracy: 0.9827 - val_loss: 0.0505
Epoch 4/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9857 - loss: 0.0428 - val_accuracy: 0.9863 - val_loss: 0.0441
Epoch 5/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9870 - loss: 0.0395 - val_accuracy: 0.9872 - val_loss: 0.0389
Epoch 6/10
[1m2299/2299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9878 - loss: 0.0375 - val_accuracy: 0.9873 - val_loss: 0.0379
Epoch 7/10
[



Modelo guardado en: modelos_dnn_opcion2\dnn_class1.h5


In [21]:
# Apply to the test features the same cleaning used for train/validation.
X_test = replace_common_values(X_test)
X_test = fix_mayus(X_test)
X_test = fix_dtype(X_test)
X_test = delete_ip_port(X_test)

# Label-based selection, consistent with the train/validation cells.
y_test_class3 = y_test_class3.loc[X_test.index]

# Keep only the columns that survived feature filtering on the training set.
# The explicit copy makes X_test an independent frame, so the assignment
# below cannot hit pandas' chained-assignment (SettingWithCopy) problem.
X_test = X_test[caracteritisticas_seleccionadas].copy()

X_test['Protocol'] = X_test['Protocol'].fillna("missing")

In [22]:
# Identify categorical, boolean and numeric columns of the test frame.
# NOTE(review): the fitted transformers below expect the same column groups
# as in training; this assumes fix_dtype produced the same dtypes on the test
# split. Confirm.
categorical_cols = X_test.select_dtypes(include=['object']).columns
boolean_cols = X_test.select_dtypes(include=['bool']).columns
# Test for "any bool column present" by length: Index.any() would look at the
# truthiness of the column labels instead.
if len(boolean_cols) > 0:
    X_test[boolean_cols] = X_test[boolean_cols].astype(float)
numerical_cols = X_test.select_dtypes(include=['float64', 'int64']).columns

##############################################################################
# Imputation with the imputers already fitted on the training set.

X_test[categorical_cols] = imputador_cat.transform(X_test[categorical_cols])

X_test[numerical_cols] = imputador_num.transform(X_test[numerical_cols])

##############################################################################
# Scaling with the scaler already fitted on the training set.

X_test_scaled = normalizacion.transform(X_test[numerical_cols])

# Wrap the scaled array in a DataFrame; columns get a "_scaled" suffix.
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=[f"{col}_scaled" for col in numerical_cols], index=X_test.index)

##############################################################################
# Encoding with the encoder already fitted on the training set.

X_test_encoded = decodificador.transform(X_test[categorical_cols])

# Names of the new encoded columns.
encoded_cols = decodificador.get_feature_names_out(categorical_cols)

# Wrap the encoded array in a DataFrame.
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_cols, index=X_test.index)

##############################################################################

# Combine the scaled numeric features with the encoded categorical features.
X_test_processed = pd.concat([X_test_scaled_df, X_test_encoded_df], axis=1)

# Optional: put the columns in alphabetical order.
X_test_processed = X_test_processed.reindex(sorted(X_test_processed.columns), axis=1)

# Keep exactly the model's input columns, in training order.
X_test = X_test_processed[caracteritisticas_procesadas]

Clase1 Mapping: {0: 'BruteForce', 1: 'C&C', 2: 'Dictionary', 3: 'Discovering_resources', 4: 'Exfiltration', 5: 'Fake_notification', 6: 'False_data_injection', 7: 'Generic_scanning', 8: 'MQTT_cloud_broker_subscription', 9: 'MitM', 10: 'Modbus_register_reading', 11: 'RDOS', 12: 'Reverse_shell', 13: 'Scanning_vulnerability', 14: 'TCP Relay', 15: 'crypto-ransomware', 16: 'fuzzing', 17: 'insider_malcious'}

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score

y_test_class1 = y_test_class1.ravel()

# Predict class1 on the test set: take the most probable class per row and
# map the integer codes back to the original class names.
y_pred_class1 = model.predict(X_test)
y_pred_class1 = np.argmax(y_pred_class1, axis=1) 
y_pred_class1_names = label_encoder_class1.inverse_transform(y_pred_class1)

# Write the predicted class names to a CSV file. The directory is relative to
# the notebook's working directory and is created if missing.
predicciones_df = pd.DataFrame({'Predicciones_Class1': y_pred_class1_names.ravel()})  
ruta_directorio = '../../predicciones'
os.makedirs(ruta_directorio, exist_ok=True)
predicciones_df.to_csv(os.path.join(ruta_directorio, 'arq2.csv'), index=False)

# Test metrics; precision, recall and F1 are macro-averaged, so every class
# counts equally regardless of its support.
accuracy = accuracy_score(y_test_class1, y_pred_class1)
print(f"📈 Accuracy (Test): {accuracy:.4f}")
precision = precision_score(y_test_class1, y_pred_class1, average='macro')
print(f"📈 Precision (Test): {precision:.4f}")
recall = recall_score(y_test_class1, y_pred_class1, average='macro')
print(f"📈 Recall (Test): {recall:.4f}")
f1 = f1_score(y_test_class1, y_pred_class1, average='macro')
print(f"📈 F1 (Test): {f1:.4f}")

[1m3848/3848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
📈 Accuracy (Test): 0.9894
📈 Precision (Test): 0.9589
📈 Recall (Test): 0.9167
📈 F1 (Test): 0.9347


                                BruteForce  C&C  Dictionary  Discovering_resources  ...  TCP Relay  crypto-ransomware  fuzzing  insider_malcious
BruteForce                            7152    0           0                      0  ...          0                  0        0                 0
C&C                                      0  372           0                      1  ...          0                  0        0                 0
Dictionary                               0    0         375                      0  ...          0                  0        0                 0
Discovering_resources                    0    2           0                   2897  ...          0                  0        0                 0
Exfiltration                             0    1           1                      0  ...          0                  0        0                 0
Fake_notification                        0    0           0                      0  ...          0                  0        0                 0
False_data_injection                     0    0           0                      0  ...          0                  0        0                 0
Generic_scanning                         0    0           0                      0  ...          1                  0        0                 0
MQTT_cloud_broker_subscription           0    0           0                      0  ...          1                  0        0                 0
MitM                                     0    0           0                      0  ...          0                  0        0                 0
Modbus_register_reading                  0    0           0                      0  ...          0                  0        0                 0
Normal                                   0   60           0                    236  ...         46                  1       12                 3
RDOS                                     0    0           0                      4  ...          0                  0        0                 0
Reverse_shell                            0    0           0                      0  ...          9                  0        0                 0
Scanning_vulnerability                   0    0           5                      0  ...          2                  0        0                 0
TCP Relay                                0    0           0                      0  ...        234                  0        0                 0
crypto-ransomware                        0    0           0                      0  ...          0                 72        0                 0
fuzzing                                  0    0           0                      0  ...          0                  0      137                 0
insider_malcious                         0    0           0                      0  ...          0                  0        0              2638

[19 rows x 19 columns]
                                precision    recall  f1-score   support

                    BruteForce     1.0000    1.0000    1.0000      7152
                           C&C     0.8552    0.8513    0.8532       437
                    Dictionary     0.9843    0.9973    0.9908       376
         Discovering_resources     0.9232    0.8404    0.8799      3447
                  Exfiltration     0.9997    0.9988    0.9992      3291
             Fake_notification     1.0000    0.6667    0.8000         3
          False_data_injection     0.9540    0.9947    0.9739       751
              Generic_scanning     0.9966    0.9978    0.9972      7559
MQTT_cloud_broker_subscription     0.9958    0.9938    0.9948      3546
                          MitM     0.9333    0.8750    0.9032        16
       Modbus_register_reading     0.9979    0.9968    0.9973       933
                        Normal     0.9875    0.9933    0.9904     63213
                          RDOS     0.9997    0.9993    0.9995     21095
                 Reverse_shell     0.8905    0.8356    0.8622       146
        Scanning_vulnerability     0.9980    0.9984    0.9982      7921
                     TCP Relay     0.7986    0.7178    0.7561       326
             crypto-ransomware     0.9863    0.9863    0.9863        73
                       fuzzing     0.9195    0.6749    0.7784       203
              insider_malcious     0.9989    1.0000    0.9994      2638

                      accuracy                         0.9894    123126
                     macro avg     0.9589    0.9167    0.9347    123126
                  weighted avg     0.9892    0.9894    0.9892    123126






                   C&C  Exfiltration  Exploitation  Lateral _movement  Normal   RDOS  Reconnaissance  Tampering  Weaponization  crypto-ransomware
C&C                372             0             0                  0      64      0               1          0              0                  0
Exfiltration         1          3287             0                  0       0      0               2          0              1                  0
Exploitation         0             0           136                 12       4      0              10          0              0                  0
Lateral _movement    0             0             8               4689      90      0               8         10              0                  0
Normal              60             0             8                 51   62791      7             266         26              3                  1
RDOS                 0             0             0                  0      10  21081               4          0              0                  0
Reconnaissance       2             1             0                  8     627      0           18487          0              5                  0
Tampering            0             0             0                  4       1      0               0        749              0                  0
Weaponization        0             0             0                  0       0      0               1          0          10165                  0
crypto-ransomware    0             0             0                  0       1      0               0          0              0                 72
                   precision    recall  f1-score   support

              C&C     0.8552    0.8513    0.8532       437
     Exfiltration     0.9997    0.9988    0.9992      3291
     Exploitation     0.8947    0.8395    0.8662       162
Lateral _movement     0.9843    0.9759    0.9800      4805
           Normal     0.9875    0.9933    0.9904     63213
             RDOS     0.9997    0.9993    0.9995     21095
   Reconnaissance     0.9845    0.9664    0.9753     19130
        Tampering     0.9541    0.9934    0.9734       754
    Weaponization     0.9991    0.9999    0.9995     10166
crypto-ransomware     0.9863    0.9863    0.9863        73

         accuracy                         0.9895    123126
        macro avg     0.9645    0.9604    0.9623    123126
     weighted avg     0.9895    0.9895    0.9894    123126