In [1]:
from google.colab import drive  # Importing the library to mount Google Drive
drive.mount('/content/drive')  # Mounting Google Drive in Colab environment

Mounted at /content/drive


In [3]:
%%capture
!pip install pandas numpy scikit-learn nltk

In [17]:
import pandas as pd

# File paths
train_df_file = "/content/drive/My Drive/MOE_DGA/train_wl.csv"

train_df = pd.read_csv(train_df_file)

#train_df = train_df.rename(columns={"label": "Label"})


print(train_df)

                       domain    family   label
0         nailconsiderable.ru  suppobox     dga
1            stilldelight.net  suppobox     dga
2       kimberleekatheryn.net  suppobox     dga
3                soilbeen.net  suppobox     dga
4               visitform.net  suppobox     dga
...                       ...       ...     ...
159995             dhuhaa.com     legit  notdga
159996        sdmetalcrew.org     legit  notdga
159997  melbcampcontuligol.ga     legit  notdga
159998      pl-enthusiast.net     legit  notdga
159999            rd-forum.ru     legit  notdga

[160000 rows x 3 columns]


In [18]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import nltk
from nltk.corpus import words, wordnet
from collections import defaultdict

# Descargar recursos de NLTK si no están disponibles
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Construir diccionarios más completos
def build_comprehensive_dictionaries():
    """
    Construye diccionarios más completos basados en NLTK y listas comunes
    """
    # Diccionario inglés completo (equivalente a los 58k palabras mencionadas)
    english_words = set(words.words())
    english_dict = {word.lower() for word in english_words if len(word) >= 2}

    # Palabras comunes más frecuentes para sustantivos, verbos y adjetivos
    common_nouns = {
        "time", "year", "people", "way", "day", "man", "thing", "woman", "life",
        "child", "world", "school", "state", "family", "student", "group", "country",
        "problem", "hand", "part", "place", "case", "week", "company", "system",
        "program", "question", "work", "government", "number", "night", "point",
        "home", "water", "room", "mother", "area", "money", "story", "fact",
        "month", "lot", "right", "study", "book", "eye", "job", "word", "business",
        "issue", "side", "kind", "head", "house", "service", "friend", "father",
        "power", "hour", "game", "line", "end", "member", "law", "car", "city",
        "community", "name", "president", "team", "minute", "idea", "kid", "body",
        "information", "back", "parent", "face", "others", "level", "office",
        "door", "health", "person", "art", "war", "history", "party", "result",
        "change", "morning", "reason", "research", "girl", "guy", "moment", "air",
        "teacher", "force", "education"
    }

    common_verbs = {
        "be", "have", "do", "say", "get", "make", "go", "know", "take", "see",
        "come", "think", "look", "want", "give", "use", "find", "tell", "ask",
        "work", "seem", "feel", "try", "leave", "call", "need", "move", "would",
        "could", "should", "might", "will", "can", "must", "shall", "may",
        "put", "mean", "keep", "let", "begin", "seem", "help", "talk", "turn",
        "start", "show", "hear", "play", "run", "move", "like", "live", "believe",
        "hold", "bring", "happen", "write", "provide", "sit", "stand", "lose",
        "pay", "meet", "include", "continue", "set", "learn", "change", "lead",
        "understand", "watch", "follow", "stop", "create", "speak", "read",
        "allow", "add", "spend", "grow", "open", "walk", "win", "offer",
        "remember", "love", "consider", "appear", "buy", "wait", "serve",
        "die", "send", "expect", "build", "stay", "fall", "cut", "reach", "kill",
        "remain", "suggest", "raise", "pass", "sell", "require", "report"
    }

    common_adjectives = {
        "good", "new", "first", "last", "long", "great", "little", "own", "other",
        "old", "right", "big", "high", "different", "small", "large", "next",
        "early", "young", "important", "few", "public", "bad", "same", "able",
        "local", "sure", "united", "real", "best", "better", "less", "far",
        "much", "water", "very", "social", "only", "national", "political",
        "special", "hard", "international", "health", "human", "common", "short",
        "general", "strong", "particular", "community", "whole", "private",
        "recent", "available", "major", "personal", "current", "left", "least",
        "possible", "business", "economic", "white", "late", "difficult", "red",
        "close", "fine", "higher", "western", "financial", "certain", "free",
        "military", "original", "successful", "low", "activity", "critical",
        "environmental", "global", "eastern", "hard", "popular", "traditional",
        "main", "simple", "physical", "medical", "full", "federal", "blue",
        "democratic", "dark", "various", "entire", "close", "legal", "religious",
        "cold", "final", "main", "green", "nice", "huge", "popular", "serious",
        "ready", "easy", "official", "foreign", "fine", "civil", "lower"
    }

    return english_dict, common_nouns, common_verbs, common_adjectives

# Construir diccionarios DGA y privados desde dominios DGA
def build_dga_dicts(df, english_dict):
    """
    Construye diccionarios DGA y privados a partir de los dominios DGA del dataset
    """
    if 'label' not in df.columns or 'domain' not in df.columns:
        raise ValueError("DataFrame debe tener columnas 'label' y 'domain'")

    dga_domains = df[df['label'] == 'dga']['domain']
    dga_words = set()
    private_words = set()

    for domain in dga_domains:
        # Separar por puntos y guiones
        parts = re.split(r'[-.]', domain.lower())
        for word in parts:
            if word and len(word) >= 2:  # Ignorar partes muy cortas
                dga_words.add(word)
                # Si la palabra no está en el diccionario inglés, es "privada"
                if word not in english_dict:
                    private_words.add(word)

    return dga_words, private_words

# Extraer características mejoradas del dominio
def extract_features(domain, dga_dict, private_dict, english_dict, noun_dict, verb_dict, adj_dict):
    """
    Extrae las 16 características mencionadas en el paper
    """
    domain = domain.lower()
    vowels = "aeiou"
    digits_and_dash = string.digits + "-"

    # f1: Longitud del dominio
    domain_len = len(domain)

    # f2: Suma ASCII de todos los caracteres
    ascii_sum = sum(ord(c) for c in domain)

    # f3: Número de vocales
    vowel_count = sum(1 for c in domain if c in vowels)

    # f4: Distribución de vocales
    vowel_dist = vowel_count / domain_len if domain_len > 0 else 0

    # f5: Número de dígitos y guiones
    digit_dash_count = sum(1 for c in domain if c in digits_and_dash)

    # f6: Distribución de dígitos y guiones
    digit_dash_dist = digit_dash_count / domain_len if domain_len > 0 else 0

    # Extraer palabras del dominio (separadas por . y -)
    parts = re.split(r'[-.]', domain)
    words = [w for w in parts if w and len(w) >= 2]

    # f7: Palabras en diccionario inglés
    word_norm = sum(1 for w in words if w in english_dict)

    # f8: Palabras en diccionario DGA
    word_dga = sum(1 for w in words if w in dga_dict)

    # f9: Sustantivos
    noun_count = sum(1 for w in words if w in noun_dict)

    # f10: Verbos
    verb_count = sum(1 for w in words if w in verb_dict)

    # f11: Adjetivos
    adj_count = sum(1 for w in words if w in adj_dict)

    # f12: Palabras privadas (DGA que no están en inglés)
    private_count = sum(1 for w in words if w in private_dict)

    # f13: Ratio entre palabras DGA y palabras normales
    ratio_dga_norm = word_dga / word_norm if word_norm > 0 else (word_dga if word_dga > 0 else 0)

    # f14: Longitud de la palabra más larga
    word_lengths = [len(w) for w in words]
    max_len_word = max(word_lengths) if word_lengths else 0

    # f15: Longitud de la palabra más corta
    min_len_word = min(word_lengths) if word_lengths else 0

    # f16: Ratio entre caracteres de palabras y longitud total
    total_word_chars = sum(word_lengths)
    word_char_ratio = total_word_chars / domain_len if domain_len > 0 else 0

    return [
        domain_len, ascii_sum, vowel_count, vowel_dist,
        digit_dash_count, digit_dash_dist, word_norm, word_dga,
        noun_count, verb_count, adj_count, private_count,
        ratio_dga_norm, max_len_word, min_len_word, word_char_ratio
    ]

def train_dga_classifier(df, test_size=0.01, random_state=42):
    """
    Función principal para entrenar el clasificador DGA
    """
    print("Construyendo diccionarios...")
    english_dict, noun_dict, verb_dict, adj_dict = build_comprehensive_dictionaries()

    print(f"Diccionario inglés: {len(english_dict)} palabras")
    print(f"Sustantivos: {len(noun_dict)} palabras")
    print(f"Verbos: {len(verb_dict)} palabras")
    print(f"Adjetivos: {len(adj_dict)} palabras")

    print("Construyendo diccionarios DGA...")
    dga_dict, private_dict = build_dga_dicts(df, english_dict)

    print(f"Palabras DGA: {len(dga_dict)}")
    print(f"Palabras privadas: {len(private_dict)}")

    print("Extrayendo características...")
    # Extraer características
    features = df['domain'].apply(
        lambda d: extract_features(d, dga_dict, private_dict, english_dict,
                                 noun_dict, verb_dict, adj_dict)
    )

    X = np.array(features.tolist())
    y = df['label'].map({'notdga': 0, 'dga': 1}).values

    print(f"Forma de X: {X.shape}")
    print(f"Distribución de clases: {np.bincount(y)}")

    # División train/test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print("Entrenando modelo Random Forest...")
    # Entrenar modelo Random Forest (como en el paper)
    model = RandomForestClassifier(
        n_estimators=100,  # Aumentado para mejor rendimiento
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=random_state,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Predicción y evaluación
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.4f}")
    print("\nReporte de clasificación:")
    print(classification_report(y_test, y_pred, target_names=["notdga", "dga"]))

    print("\nMatriz de confusión:")
    print(confusion_matrix(y_test, y_pred))

    # Importancia de características
    feature_names = [
        'domain_len', 'ascii_sum', 'vowel_count', 'vowel_dist',
        'digit_dash_count', 'digit_dash_dist', 'word_norm', 'word_dga',
        'noun_count', 'verb_count', 'adj_count', 'private_count',
        'ratio_dga_norm', 'max_len_word', 'min_len_word', 'word_char_ratio'
    ]

    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nImportancia de características:")
    print(feature_importance)

    return model, (dga_dict, private_dict, english_dict, noun_dict, verb_dict, adj_dict)

# Función para clasificar nuevos dominios
def classify_domain(domain, model, dictionaries):
    """
    Clasifica un dominio individual
    """
    dga_dict, private_dict, english_dict, noun_dict, verb_dict, adj_dict = dictionaries

    features = extract_features(domain, dga_dict, private_dict, english_dict,
                              noun_dict, verb_dict, adj_dict)
    features_array = np.array([features])

    prediction = model.predict(features_array)[0]
    probability = model.predict_proba(features_array)[0]

    return {
        'domain': domain,
        'prediction': 'dga' if prediction == 1 else 'notdga',
        'dga_probability': probability[1],
        'notdga_probability': probability[0]
    }

# === EJEMPLO DE USO ===
if __name__ == "__main__":
    # Ejemplo de cómo usar el código
    # Asume que tienes un DataFrame 'df' con columnas 'domain' y 'label'

    # Crear datos de ejemplo si no tienes un dataset
    sample_data = {
        'domain': [
            'google.com', 'facebook.com', 'microsoft.com', 'amazon.com',
            'xkvbpqr.com', 'mnbvcxz.net', 'qwertyuiop.org', 'asdfghjkl.info',
            'randomstring123.com', 'anotherfakedom.net'
        ],
        'label': [
            'notdga', 'notdga', 'notdga', 'notdga',
            'dga', 'dga', 'dga', 'dga', 'dga', 'dga'
        ]
    }

    df_example = pd.DataFrame(sample_data)

    print("Ejemplo con datos sintéticos:")
    print("Para usar con tus datos reales, carga tu DataFrame con columnas 'domain' y 'label'")
    print("Donde 'label' contiene 'dga' o 'notdga'")

    model, dictionaries = train_dga_classifier(train_df)

    # Ejemplo de clasificación de un dominio individual
    # result = classify_domain('suspicious-domain.com', model, dictionaries)
    # print(f"\nResultado para 'suspicious-domain.com': {result}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Ejemplo con datos sintéticos:
Para usar con tus datos reales, carga tu DataFrame con columnas 'domain' y 'label'
Donde 'label' contiene 'dga' o 'notdga'
Construyendo diccionarios...
Diccionario inglés: 234351 palabras
Sustantivos: 100 palabras
Verbos: 106 palabras
Adjetivos: 110 palabras
Construyendo diccionarios DGA...
Palabras DGA: 77834
Palabras privadas: 74121
Extrayendo características...
Forma de X: (160000, 16)
Distribución de clases: [80000 80000]
Entrenando modelo Random Forest...

Accuracy: 0.9900

Reporte de clasificación:
              precision    recall  f1-score   support

      notdga       0.99      0.99      0.99       800
         dga       0.99      0.99      0.99       800

    accuracy                           0.99      1600
   macro avg       0.99      0.99      0.99      1600
weighted avg       0.99      0.99      0.99      1600


Matriz de confusión:
[[790  10]
 [  6 794]]

Importancia de características:
             feature  importance
12    ratio_dga_norm  

In [19]:
result = classify_domain('suspicious-domain.com', model, dictionaries)
print(f"\nResultado para 'suspicious-domain.com': {result}")


Resultado para 'suspicious-domain.com': {'domain': 'suspicious-domain.com', 'prediction': 'notdga', 'dga_probability': np.float64(0.019480393789305103), 'notdga_probability': np.float64(0.980519606210695)}


In [20]:
result['prediction']

'notdga'

In [21]:
import pandas as pd
import numpy as np
import time
import os
from pathlib import Path
import gzip

# Importar las funciones del modelo DGA que creamos anteriormente
# (Asume que ya tienes entrenado el modelo y las funciones disponibles)

def classify_domains_batch(domains, model, dictionaries):
    """
    Clasifica múltiples dominios y mide el tiempo de procesamiento

    Args:
        domains: Lista o array de dominios
        model: Modelo entrenado
        dictionaries: Tupla con los diccionarios necesarios

    Returns:
        dict con predicciones, probabilidades y tiempos
    """
    dga_dict, private_dict, english_dict, noun_dict, verb_dict, adj_dict = dictionaries

    predictions = []
    probabilities_dga = []
    probabilities_notdga = []
    processing_times = []

    for domain in domains:
        start_time = time.time()

        # Extraer características
        features = extract_features(domain, dga_dict, private_dict, english_dict,
                                  noun_dict, verb_dict, adj_dict)
        features_array = np.array([features])

        # Hacer predicción
        prediction = model.predict(features_array)[0]
        probability = model.predict_proba(features_array)[0]

        end_time = time.time()
        query_time = end_time - start_time

        predictions.append('dga' if prediction == 1 else 'notdga')
        probabilities_dga.append(probability[1])
        probabilities_notdga.append(probability[0])
        processing_times.append(query_time)

    return {
        'predictions': predictions,
        'probabilities_dga': probabilities_dga,
        'probabilities_notdga': probabilities_notdga,
        'processing_times': processing_times
    }

def evaluate_dga_families(model, dictionaries, base_path='/content/drive/My Drive/Familias_Test/',
                         results_path='/content/drive/My Drive/results/', runs=30, chunk_size=50):
    """
    Evalúa el modelo DGA con diferentes familias de malware

    Args:
        model: Modelo entrenado
        dictionaries: Diccionarios del modelo
        base_path: Ruta base donde están los archivos de familias
        results_path: Ruta donde guardar los resultados
        runs: Número de ejecuciones por familia
        chunk_size: Tamaño del chunk para procesar
    """

    families = [
        'matsnu.gz',
        'suppobox.gz',
        'charbot.gz',
        'gozi.gz',
        'manuelita.gz',
        'rovnix.gz',
        'deception.gz',
        'nymaim.gz'
    ]

    # Crear directorio de resultados si no existe
    Path(results_path).mkdir(parents=True, exist_ok=True)

    # Verificar que el archivo legit existe
    legit_file = os.path.join(base_path, 'legit.gz')
    if not os.path.exists(legit_file):
        raise FileNotFoundError(f"Archivo legit no encontrado: {legit_file}")

    # Procesar cada familia
    for family in families:
        print(f"🔍 Procesando familia: {family}")

        family_file = os.path.join(base_path, family)
        if not os.path.exists(family_file):
            print(f"❌ Archivo no encontrado: {family_file}")
            continue

        # Estadísticas para la familia
        family_stats = {
            'total_domains': 0,
            'total_time': 0,
            'avg_time_per_domain': 0,
            'runs_completed': 0
        }

        try:
            # Crear lectores de chunks
            dga_reader = pd.read_csv(family_file, chunksize=chunk_size)

            for run in range(runs):
                print(f" ▶️ Run {run+1}/{runs}", end="\r")

                try:
                    # Leer chunks
                    dga_chunk = next(dga_reader)

                    # Reiniciar el lector de legit para cada run
                    legit_reader = pd.read_csv(legit_file, chunksize=chunk_size)
                    legit_chunk = next(legit_reader)

                    # Combinar chunks
                    df_chunk = pd.concat([dga_chunk, legit_chunk]).reset_index(drop=True)

                    # Asegurar que la columna domain existe
                    if 'domain' not in df_chunk.columns:
                        # Si no existe, asumir que la primera columna son los dominios
                        df_chunk.columns = ['domain'] + list(df_chunk.columns[1:])

                    # Crear etiquetas si no existen
                    if 'label' not in df_chunk.columns:
                        # Primeros len(dga_chunk) son DGA, el resto son legit
                        labels = ['dga'] * len(dga_chunk) + ['notdga'] * len(legit_chunk)
                        df_chunk['label'] = labels

                    # Medir tiempo total para el batch
                    batch_start_time = time.time()

                    # Obtener predicciones y tiempos individuales
                    results = classify_domains_batch(df_chunk["domain"].values, model, dictionaries)

                    batch_end_time = time.time()
                    batch_total_time = batch_end_time - batch_start_time

                    # Agregar resultados al DataFrame
                    df_chunk["pred"] = results['predictions']
                    df_chunk["prob_dga"] = results['probabilities_dga']
                    df_chunk["prob_notdga"] = results['probabilities_notdga']
                    df_chunk["query_time"] = results['processing_times']
                    df_chunk["batch_time"] = batch_total_time
                    df_chunk["run"] = run

                    # Calcular métricas adicionales
                    df_chunk["correct"] = (df_chunk["label"] == df_chunk["pred"]).astype(int)

                    # Actualizar estadísticas
                    family_stats['total_domains'] += len(df_chunk)
                    family_stats['total_time'] += batch_total_time
                    family_stats['runs_completed'] += 1

                    # Guardar resultados
                    output_file = os.path.join(
                        results_path,
                        f"results_RandomForest_{family.replace('.gz', '')}_{run}.csv.gz"
                    )

                    df_chunk.to_csv(
                        output_file,
                        index=False,
                        compression="gzip"
                    )

                except StopIteration:
                    print(f"\n⚠️ No hay más datos disponibles para {family} en run {run+1}")
                    break
                except Exception as e:
                    print(f"\n❌ Error en run {run+1} para {family}: {str(e)}")
                    continue

            # Calcular estadísticas finales para la familia
            if family_stats['runs_completed'] > 0:
                family_stats['avg_time_per_domain'] = family_stats['total_time'] / family_stats['total_domains']

                print(f"\n✅ {family} completado:")
                print(f"   Runs completados: {family_stats['runs_completed']}/{runs}")
                print(f"   Total dominios procesados: {family_stats['total_domains']}")
                print(f"   Tiempo total: {family_stats['total_time']:.4f}s")
                print(f"   Tiempo promedio por dominio: {family_stats['avg_time_per_domain']:.6f}s")
                print(f"   Dominios por segundo: {family_stats['total_domains']/family_stats['total_time']:.2f}")

                # Guardar estadísticas de la familia
                stats_df = pd.DataFrame([family_stats])
                stats_df['family'] = family
                stats_file = os.path.join(results_path, f"stats_{family.replace('.gz', '')}.csv")
                stats_df.to_csv(stats_file, index=False)

        except Exception as e:
            print(f"\n❌ Error procesando familia {family}: {str(e)}")
            continue

        print(f"\n" + "="*50)

def analyze_results(results_path='/content/drive/My Drive/results/'):
    """
    Analiza los resultados guardados y genera un resumen
    """
    print("📊 Analizando resultados...")

    results_files = [f for f in os.listdir(results_path) if f.startswith('results_RandomForest_')]

    if not results_files:
        print("❌ No se encontraron archivos de resultados")
        return

    all_results = []

    for file in results_files:
        try:
            df = pd.read_csv(os.path.join(results_path, file))
            family = file.split('_')[2]  # Extraer nombre de familia
            run = file.split('_')[3].replace('.csv.gz', '')

            # Calcular métricas
            accuracy = df['correct'].mean()
            avg_time = df['query_time'].mean()

            all_results.append({
                'family': family,
                'run': run,
                'accuracy': accuracy,
                'avg_query_time': avg_time,
                'total_domains': len(df),
                'dga_domains': len(df[df['label'] == 'dga']),
                'legit_domains': len(df[df['label'] == 'notdga'])
            })

        except Exception as e:
            print(f"⚠️ Error procesando {file}: {str(e)}")

    if all_results:
        summary_df = pd.DataFrame(all_results)

        # Resumen por familia
        family_summary = summary_df.groupby('family').agg({
            'accuracy': ['mean', 'std'],
            'avg_query_time': ['mean', 'std'],
            'total_domains': 'sum'
        }).round(4)

        print("\n📈 Resumen por familia:")
        print(family_summary)

        # Guardar resumen
        summary_file = os.path.join(results_path, 'evaluation_summary.csv')
        family_summary.to_csv(summary_file)

        print(f"\n💾 Resumen guardado en: {summary_file}")

        return summary_df

    return None

# === EJEMPLO DE USO ===
if __name__ == "__main__":
    # Asume que ya tienes el modelo entrenado y los diccionarios
    # model, dictionaries = train_dga_classifier(your_training_data)

    print("🏃‍♂️ Iniciando evaluación de familias DGA...")
    print("📋 Parámetros:")
    print(f"   - Familias: 8")
    print(f"   - Runs por familia: 30")
    print(f"   - Chunk size: 50")
    print("\n" + "="*50)

    # Ejecutar evaluación
    evaluate_dga_families(model, dictionaries)

    # Analizar resultados
    results_summary = analyze_results()

    print("\n✅ Evaluación completada!")


🏃‍♂️ Iniciando evaluación de familias DGA...
📋 Parámetros:
   - Familias: 8
   - Runs por familia: 30
   - Chunk size: 50

🔍 Procesando familia: matsnu.gz

✅ matsnu.gz completado:
   Runs completados: 30/30
   Total dominios procesados: 3000
   Tiempo total: 137.7776s
   Tiempo promedio por dominio: 0.045926s
   Dominios por segundo: 21.77

🔍 Procesando familia: suppobox.gz

✅ suppobox.gz completado:
   Runs completados: 30/30
   Total dominios procesados: 3000
   Tiempo total: 138.4538s
   Tiempo promedio por dominio: 0.046151s
   Dominios por segundo: 21.67

🔍 Procesando familia: charbot.gz

✅ charbot.gz completado:
   Runs completados: 30/30
   Total dominios procesados: 3000
   Tiempo total: 138.0810s
   Tiempo promedio por dominio: 0.046027s
   Dominios por segundo: 21.73

🔍 Procesando familia: gozi.gz

✅ gozi.gz completado:
   Runs completados: 30/30
   Total dominios procesados: 3000
   Tiempo total: 138.6897s
   Tiempo promedio por dominio: 0.046230s
   Dominios por segundo: 21

In [16]:
path = f'/content/drive/My Drive/results/results_RandomForest_matsnu_20.csv.gz'
df1 = pd.read_csv(path)
df1

Unnamed: 0.1,Unnamed: 0,domain,family,subfamily,label,pred,prob_dga,prob_notdga,query_time,batch_time,run,correct
0,1049306,smokeform-camera.com,matsnu,matsnu,dga,notdga,0.169708,0.830292,0.045187,4.562825,20,0
1,1049307,duty-differ-shoulder.com,matsnu,matsnu,dga,dga,0.977685,0.022315,0.044914,4.562825,20,1
2,1049308,clerkbottle-head.com,matsnu,matsnu,dga,notdga,0.200283,0.799717,0.045159,4.562825,20,0
3,1049309,dog-black-back.com,matsnu,matsnu,dga,dga,0.976948,0.023052,0.044837,4.562825,20,1
4,1049310,key-string-project.com,matsnu,matsnu,dga,dga,0.986510,0.013490,0.045016,4.562825,20,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,3218513,airbus-carpool.com,legit,tranco,notdga,notdga,0.000518,0.999482,0.044965,4.562825,20,1
96,3218514,xn--80aa9bg.xn--p1ai,legit,tranco,notdga,notdga,0.192380,0.807620,0.046172,4.562825,20,1
97,3218515,ultraval.net,legit,tranco,notdga,notdga,0.000311,0.999689,0.044628,4.562825,20,1
98,3218516,essen-nutrition.com,legit,tranco,notdga,notdga,0.001977,0.998023,0.044798,4.562825,20,1


In [25]:
import pandas as pd
import numpy as np
import time
import os
from pathlib import Path
import gzip

def evaluate_new_dga_families(model, dictionaries,
                             new_families_path='/content/drive/My Drive/New_Families/',
                             legit_file_path='/content/drive/My Drive/Familias_Test/legit.gz',
                             results_path='/content/drive/My Drive/results/',
                             runs=30, chunk_size=50, skip_legit_chunks=30):
    """
    Evalúa nuevas familias DGA con el modelo entrenado

    Args:
        model: Modelo DGA cargado
        dictionaries: Diccionarios del modelo
        new_families_path: Ruta de las nuevas familias
        legit_file_path: Ruta del archivo de dominios legítimos
        results_path: Ruta donde guardar resultados
        runs: Número de ejecuciones por familia
        chunk_size: Tamaño del chunk
        skip_legit_chunks: Cuántos chunks de legit saltar (para continuar donde se quedó)
    """

    # Nuevas familias a evaluar
    families = [
        'bigviktor.gz',
        'pizd.gz',
        'ngioweb.gz'
    ]

    # Crear directorio de resultados si no existe
    Path(results_path).mkdir(parents=True, exist_ok=True)

    # Verificar que el archivo legit existe
    if not os.path.exists(legit_file_path):
        raise FileNotFoundError(f"Archivo legit no encontrado: {legit_file_path}")

    print(f"🆕 Evaluando nuevas familias DGA...")
    print(f"📂 Ruta familias: {new_families_path}")
    print(f"📂 Archivo legit: {legit_file_path}")
    print(f"⏭️ Saltando {skip_legit_chunks} chunks de legit")
    print(f"🎯 Runs por familia: {runs}")
    print(f"📦 Chunk size: {chunk_size}")
    print("="*60)

    # Procesar cada nueva familia
    for family in families:
        print(f"\n🔍 Procesando familia: {family}")

        family_file = os.path.join(new_families_path, family)
        if not os.path.exists(family_file):
            print(f"❌ Archivo no encontrado: {family_file}")
            continue

        # Estadísticas para la familia
        family_stats = {
            'family': family.replace('.gz', ''),
            'total_domains': 0,
            'total_time': 0,
            'avg_time_per_domain': 0,
            'runs_completed': 0,
            'runs_failed': 0,
            'total_accuracy': 0,
            'avg_accuracy': 0
        }

        try:
            # Crear lector de chunks para DGA
            dga_reader = pd.read_csv(family_file, chunksize=chunk_size)

            # Preparar lector de legit y saltar chunks especificados
            legit_reader = pd.read_csv(legit_file_path, chunksize=chunk_size)

            print(f"⏭️ Saltando {skip_legit_chunks} chunks de dominios legítimos...")
            try:
                for i in range(skip_legit_chunks):
                    next(legit_reader)
                    if (i + 1) % 10 == 0:
                        print(f"   Saltados {i + 1}/{skip_legit_chunks} chunks", end='\r')
            except StopIteration:
                print(f"\n⚠️ Solo se pudieron saltar {i} chunks de legit")
                # Reiniciar el lector si se acabaron los chunks
                legit_reader = pd.read_csv(legit_file_path, chunksize=chunk_size)

            print(f"\n✅ Iniciando evaluación de {family}")

            for run in range(runs):
                print(f" ▶️ Run {run+1}/{runs}", end="\r")

                try:
                    # Leer chunk DGA
                    dga_chunk = next(dga_reader)

                    # Leer chunk legit
                    try:
                        legit_chunk = next(legit_reader)
                    except StopIteration:
                        # Si se acabaron los chunks de legit, reiniciar el lector
                        legit_reader = pd.read_csv(legit_file_path, chunksize=chunk_size)
                        legit_chunk = next(legit_reader)

                    # Preparar datos
                    # Asegurar nombres de columnas correctos
                    if 'domain' not in dga_chunk.columns:
                        dga_chunk.columns = ['domain'] + list(dga_chunk.columns[1:])
                    if 'domain' not in legit_chunk.columns:
                        legit_chunk.columns = ['domain'] + list(legit_chunk.columns[1:])

                    # Agregar etiquetas
                    dga_chunk['label'] = 'dga'
                    legit_chunk['label'] = 'notdga'

                    # Combinar chunks
                    df_chunk = pd.concat([dga_chunk, legit_chunk]).reset_index(drop=True)

                    # Medir tiempo total para el batch
                    batch_start_time = time.time()

                    # Obtener predicciones y tiempos
                    results = classify_domains_batch(df_chunk["domain"].values, model, dictionaries)

                    batch_end_time = time.time()
                    batch_total_time = batch_end_time - batch_start_time

                    # Agregar resultados al DataFrame
                    df_chunk["pred"] = results['predictions']
                    df_chunk["prob_dga"] = results['probabilities_dga']
                    df_chunk["prob_notdga"] = results['probabilities_notdga']
                    df_chunk["query_time"] = results['processing_times']
                    df_chunk["batch_time"] = batch_total_time
                    df_chunk["run"] = run
                    df_chunk["family"] = family.replace('.gz', '')

                    # Calcular métricas
                    df_chunk["correct"] = (df_chunk["label"] == df_chunk["pred"]).astype(int)
                    run_accuracy = df_chunk["correct"].mean()

                    # Actualizar estadísticas
                    family_stats['total_domains'] += len(df_chunk)
                    family_stats['total_time'] += batch_total_time
                    family_stats['runs_completed'] += 1
                    family_stats['total_accuracy'] += run_accuracy

                    # Guardar resultados (con nombre actualizado)
                    output_file = os.path.join(
                        results_path,
                        f"results_RandomForest_NEW_{family.replace('.gz', '')}_{run}.csv.gz"
                    )

                    df_chunk.to_csv(
                        output_file,
                        index=False,
                        compression="gzip"
                    )

                except StopIteration:
                    print(f"\n⚠️ No hay más datos DGA disponibles para {family} en run {run+1}")
                    break
                except Exception as e:
                    print(f"\n❌ Error en run {run+1} para {family}: {str(e)}")
                    family_stats['runs_failed'] += 1
                    continue

            # Calcular estadísticas finales para la familia
            if family_stats['runs_completed'] > 0:
                family_stats['avg_time_per_domain'] = family_stats['total_time'] / family_stats['total_domains']
                family_stats['avg_accuracy'] = family_stats['total_accuracy'] / family_stats['runs_completed']

                print(f"\n✅ {family} completado:")
                print(f"   Runs completados: {family_stats['runs_completed']}/{runs}")
                print(f"   Runs fallidos: {family_stats['runs_failed']}")
                print(f"   Total dominios procesados: {family_stats['total_domains']}")
                print(f"   Accuracy promedio: {family_stats['avg_accuracy']:.4f}")
                print(f"   Tiempo total: {family_stats['total_time']:.4f}s")
                print(f"   Tiempo promedio por dominio: {family_stats['avg_time_per_domain']:.6f}s")
                print(f"   Dominios por segundo: {family_stats['total_domains']/family_stats['total_time']:.2f}")

                # Guardar estadísticas de la familia
                stats_df = pd.DataFrame([family_stats])
                stats_file = os.path.join(results_path, f"stats_NEW_{family.replace('.gz', '')}.csv")
                stats_df.to_csv(stats_file, index=False)

            else:
                print(f"\n❌ No se completó ningún run para {family}")

        except Exception as e:
            print(f"\n❌ Error crítico procesando familia {family}: {str(e)}")
            continue

        print(f"\n" + "="*50)

    print(f"\n🎉 Evaluación de nuevas familias completada!")
    print(f"📁 Resultados guardados en: {results_path}")

def classify_domains_batch(domains, model, dictionaries):
    """
    Clasifica múltiples dominios y mide el tiempo de procesamiento
    (Copia de la función del código anterior)
    """
    dga_dict, private_dict, english_dict, noun_dict, verb_dict, adj_dict = dictionaries

    predictions = []
    probabilities_dga = []
    probabilities_notdga = []
    processing_times = []

    for domain in domains:
        start_time = time.time()

        # Extraer características
        features = extract_features(domain, dga_dict, private_dict, english_dict,
                                  noun_dict, verb_dict, adj_dict)
        features_array = np.array([features])

        # Hacer predicción
        prediction = model.predict(features_array)[0]
        probability = model.predict_proba(features_array)[0]

        end_time = time.time()
        query_time = end_time - start_time

        predictions.append('dga' if prediction == 1 else 'notdga')
        probabilities_dga.append(probability[1])
        probabilities_notdga.append(probability[0])
        processing_times.append(query_time)

    return {
        'predictions': predictions,
        'probabilities_dga': probabilities_dga,
        'probabilities_notdga': probabilities_notdga,
        'processing_times': processing_times
    }

def analyze_new_families_results(results_path='/content/drive/My Drive/results/'):
    """
    Analiza específicamente los resultados de las nuevas familias
    """
    print("📊 Analizando resultados de nuevas familias...")

    # Buscar archivos de nuevas familias
    results_files = [f for f in os.listdir(results_path)
                    if f.startswith('results_RandomForest_NEW_')]

    if not results_files:
        print("❌ No se encontraron archivos de resultados de nuevas familias")
        return

    all_results = []

    for file in results_files:
        try:
            df = pd.read_csv(os.path.join(results_path, file))

            # Extraer información del nombre del archivo
            parts = file.replace('.csv.gz', '').split('_')
            family = parts[3]  # NEW_{family}
            run = parts[4]

            # Calcular métricas
            accuracy = df['correct'].mean()
            avg_time = df['query_time'].mean()

            # Métricas por tipo de dominio
            dga_accuracy = df[df['label'] == 'dga']['correct'].mean()
            legit_accuracy = df[df['label'] == 'notdga']['correct'].mean()

            all_results.append({
                'family': family,
                'run': run,
                'accuracy': accuracy,
                'dga_accuracy': dga_accuracy,
                'legit_accuracy': legit_accuracy,
                'avg_query_time': avg_time,
                'total_domains': len(df),
                'dga_domains': len(df[df['label'] == 'dga']),
                'legit_domains': len(df[df['label'] == 'notdga'])
            })

        except Exception as e:
            print(f"⚠️ Error procesando {file}: {str(e)}")

    if all_results:
        summary_df = pd.DataFrame(all_results)

        # Resumen por familia
        family_summary = summary_df.groupby('family').agg({
            'accuracy': ['mean', 'std', 'min', 'max'],
            'dga_accuracy': ['mean', 'std'],
            'legit_accuracy': ['mean', 'std'],
            'avg_query_time': ['mean', 'std'],
            'total_domains': 'sum'
        }).round(4)

        print("\n📈 Resumen de nuevas familias:")
        print(family_summary)

        # Comparación con familias originales (si hay datos)
        print(f"\n🆕 Resultados por familia nueva:")
        for family in summary_df['family'].unique():
            family_data = summary_df[summary_df['family'] == family]
            print(f"\n{family.upper()}:")
            print(f"  📊 Accuracy promedio: {family_data['accuracy'].mean():.4f} ±{family_data['accuracy'].std():.4f}")
            print(f"  🎯 DGA detection: {family_data['dga_accuracy'].mean():.4f}")
            print(f"  ✅ Legit detection: {family_data['legit_accuracy'].mean():.4f}")
            print(f"  ⏱️ Tiempo promedio: {family_data['avg_query_time'].mean():.6f}s")
            print(f"  📝 Runs completados: {len(family_data)}")

        # Guardar resumen
        summary_file = os.path.join(results_path, 'new_families_summary.csv')
        family_summary.to_csv(summary_file)

        detailed_file = os.path.join(results_path, 'new_families_detailed.csv')
        summary_df.to_csv(detailed_file, index=False)

        print(f"\n💾 Resúmenes guardados:")
        print(f"  - {summary_file}")
        print(f"  - {detailed_file}")

        return summary_df

    return None

# === EJEMPLO DE USO ===
if __name__ == "__main__":
    print("🆕 Evaluación de Nuevas Familias DGA")
    print("="*50)

    # Cargar el modelo previamente entrenado
    print("📥 Cargando modelo DGA...")
    try:
        model, dictionaries, metadata = load_dga_model()
        print("✅ Modelo cargado exitosamente")

        # Ejecutar evaluación de nuevas familias
        print("\n🚀 Iniciando evaluación de nuevas familias...")
        evaluate_new_dga_families(
            model=model,
            dictionaries=dictionaries,
            new_families_path='/content/drive/My Drive/New_Families/',
            legit_file_path='/content/drive/My Drive/Familias_Test/legit.gz',
            results_path='/content/drive/My Drive/results/',
            runs=30,
            chunk_size=50,
            skip_legit_chunks=30
        )

        # Analizar resultados
        print("\n📊 Analizando resultados...")
        results_summary = analyze_new_families_results()

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        print("💡 Asegúrate de haber entrenado y guardado el modelo primero")

🆕 Evaluación de Nuevas Familias DGA
📥 Cargando modelo DGA...
📥 Cargando modelo...
✅ Modelo cargado exitosamente
📥 Cargando diccionarios...
✅ Diccionarios cargados exitosamente
📥 Cargando metadatos...
✅ Metadatos cargados exitosamente

📊 Información del modelo:
  Tipo: RandomForestClassifier
  N° de árboles: 100
  N° de características: 16
  Tamaños de diccionarios:
    - english_dict: 234,351 palabras
    - noun_dict: 100 palabras
    - verb_dict: 106 palabras
    - adj_dict: 110 palabras
    - dga_dict: 77,834 palabras
    - private_dict: 74,121 palabras
✅ Modelo cargado exitosamente

🚀 Iniciando evaluación de nuevas familias...
🆕 Evaluando nuevas familias DGA...
📂 Ruta familias: /content/drive/My Drive/New_Families/
📂 Archivo legit: /content/drive/My Drive/Familias_Test/legit.gz
⏭️ Saltando 30 chunks de legit
🎯 Runs por familia: 30
📦 Chunk size: 50

🔍 Procesando familia: bigviktor.gz
⏭️ Saltando 30 chunks de dominios legítimos...
   Saltados 30/30 chunks
✅ Iniciando evaluación de big

In [24]:
import pickle
import joblib
import os
from pathlib import Path
import pandas as pd
import numpy as np

def save_dga_model(model, dictionaries, model_path='/content/drive/My Drive/models/'):
    """
    Guarda el modelo DGA entrenado y todos sus diccionarios

    Args:
        model: Modelo RandomForest entrenado
        dictionaries: Tupla con todos los diccionarios necesarios
        model_path: Ruta donde guardar el modelo
    """
    # Crear directorio si no existe
    Path(model_path).mkdir(parents=True, exist_ok=True)

    # Guardar el modelo
    model_file = os.path.join(model_path, 'dga_random_forest_model.joblib')
    joblib.dump(model, model_file)
    print(f"✅ Modelo guardado en: {model_file}")

    # Guardar los diccionarios
    dictionaries_file = os.path.join(model_path, 'dga_dictionaries.pkl')
    with open(dictionaries_file, 'wb') as f:
        pickle.dump(dictionaries, f)
    print(f"✅ Diccionarios guardados en: {dictionaries_file}")

    # Guardar metadatos del modelo
    metadata = {
        'model_type': 'RandomForestClassifier',
        'n_estimators': model.n_estimators,
        'max_depth': model.max_depth,
        'min_samples_split': model.min_samples_split,
        'min_samples_leaf': model.min_samples_leaf,
        'random_state': model.random_state,
        'n_features': model.n_features_in_,
        'feature_names': [
            'domain_len', 'ascii_sum', 'vowel_count', 'vowel_dist',
            'digit_dash_count', 'digit_dash_dist', 'word_norm', 'word_dga',
            'noun_count', 'verb_count', 'adj_count', 'private_count',
            'ratio_dga_norm', 'max_len_word', 'min_len_word', 'word_char_ratio'
        ],
        'dictionary_sizes': {
            'english_dict': len(dictionaries[2]),
            'noun_dict': len(dictionaries[3]),
            'verb_dict': len(dictionaries[4]),
            'adj_dict': len(dictionaries[5]),
            'dga_dict': len(dictionaries[0]),
            'private_dict': len(dictionaries[1])
        }
    }

    metadata_file = os.path.join(model_path, 'model_metadata.pkl')
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)
    print(f"✅ Metadatos guardados en: {metadata_file}")

    print(f"\n📦 Modelo completo guardado en: {model_path}")
    print("Archivos creados:")
    print(f"  - {os.path.basename(model_file)}")
    print(f"  - {os.path.basename(dictionaries_file)}")
    print(f"  - {os.path.basename(metadata_file)}")

def load_dga_model(model_path='/content/drive/My Drive/models/'):
    """
    Carga el modelo DGA y sus diccionarios

    Args:
        model_path: Ruta donde está guardado el modelo

    Returns:
        tuple: (model, dictionaries, metadata)
    """
    # Verificar que los archivos existen
    model_file = os.path.join(model_path, 'dga_random_forest_model.joblib')
    dictionaries_file = os.path.join(model_path, 'dga_dictionaries.pkl')
    metadata_file = os.path.join(model_path, 'model_metadata.pkl')

    if not os.path.exists(model_file):
        raise FileNotFoundError(f"Modelo no encontrado: {model_file}")
    if not os.path.exists(dictionaries_file):
        raise FileNotFoundError(f"Diccionarios no encontrados: {dictionaries_file}")
    if not os.path.exists(metadata_file):
        raise FileNotFoundError(f"Metadatos no encontrados: {metadata_file}")

    # Cargar el modelo
    print("📥 Cargando modelo...")
    model = joblib.load(model_file)
    print("✅ Modelo cargado exitosamente")

    # Cargar los diccionarios
    print("📥 Cargando diccionarios...")
    with open(dictionaries_file, 'rb') as f:
        dictionaries = pickle.load(f)
    print("✅ Diccionarios cargados exitosamente")

    # Cargar metadatos
    print("📥 Cargando metadatos...")
    with open(metadata_file, 'rb') as f:
        metadata = pickle.load(f)
    print("✅ Metadatos cargados exitosamente")

    # Mostrar información del modelo
    print(f"\n📊 Información del modelo:")
    print(f"  Tipo: {metadata['model_type']}")
    print(f"  N° de árboles: {metadata['n_estimators']}")
    print(f"  N° de características: {metadata['n_features']}")
    print(f"  Tamaños de diccionarios:")
    for dict_name, size in metadata['dictionary_sizes'].items():
        print(f"    - {dict_name}: {size:,} palabras")

    return model, dictionaries, metadata

def test_loaded_model(model, dictionaries, test_domains=None):
    """
    Prueba el modelo cargado con algunos dominios de ejemplo

    Args:
        model: Modelo cargado
        dictionaries: Diccionarios cargados
        test_domains: Lista de dominios para probar (opcional)
    """
    if test_domains is None:
        test_domains = [
            'google.com',
            'facebook.com',
            'xkvbpqr.com',
            'mnbvcxz.net',
            'microsoft.com',
            'randomstring123.org'
        ]

    print("\n🧪 Probando modelo con dominios de ejemplo:")
    print("-" * 60)

    for domain in test_domains:
        result = classify_domain(domain, model, dictionaries)
        print(f"{domain:20} -> {result['prediction']:8} (prob: {result['dga_probability']:.3f})")

# === EJEMPLO DE USO COMPLETO ===

def complete_training_and_saving_example():
    """
    Ejemplo completo de entrenamiento, guardado y carga del modelo
    """
    print("🚀 Ejemplo completo: Entrenar, Guardar y Cargar modelo DGA")
    print("=" * 60)

    # Paso 1: Crear datos de ejemplo (reemplaza con tus datos reales)
    sample_data = {
        'domain': [
            # Dominios legítimos
            'google.com', 'facebook.com', 'microsoft.com', 'amazon.com',
            'youtube.com', 'wikipedia.org', 'twitter.com', 'instagram.com',
            'linkedin.com', 'github.com', 'stackoverflow.com', 'reddit.com',

            # Dominios DGA simulados
            'xkvbpqr.com', 'mnbvcxz.net', 'qwertyuiop.org', 'asdfghjkl.info',
            'randomstring123.com', 'anotherfakedom.net', 'abcdef.xyz',
            'ksdjfhskjh.com', 'pqowieuryt.net', 'zxcvbnm.org', 'hjklqwer.com'
        ],
        'label': (
            ['notdga'] * 12 +  # 12 legítimos
            ['dga'] * 11       # 11 DGA
        )
    }

    df_example = pd.DataFrame(sample_data)
    print(f"📊 Dataset de ejemplo: {len(df_example)} dominios")
    print(f"  - Legítimos: {(df_example['label'] == 'notdga').sum()}")
    print(f"  - DGA: {(df_example['label'] == 'dga').sum()}")

    # Paso 2: Entrenar el modelo
    print("\n🏋️ Entrenando modelo...")
    model, dictionaries = train_dga_model(df_example)

    # Paso 3: Guardar el modelo
    print("\n💾 Guardando modelo...")
    save_dga_model(model, dictionaries)

    # Paso 4: Simular reinicio del programa (cargar modelo)
    print("\n🔄 Simulando carga del modelo...")
    loaded_model, loaded_dictionaries, metadata = load_dga_model()

    # Paso 5: Probar el modelo cargado
    test_loaded_model(loaded_model, loaded_dictionaries)

    print("\n✅ Ejemplo completado exitosamente!")
    return loaded_model, loaded_dictionaries

# === FUNCIONES AUXILIARES PARA LA CARGA ===

def quick_load_and_test():
    """
    Función rápida para cargar y probar el modelo guardado
    """
    try:
        print("⚡ Carga rápida del modelo DGA...")
        model, dictionaries, metadata = load_dga_model()

        # Probar con algunos dominios
        test_domains = [
            'suspicious-domain.com',
            'google.com',
            'qwerty123.net',
            'microsoft.com'
        ]

        test_loaded_model(model, dictionaries, test_domains)

        return model, dictionaries

    except Exception as e:
        print(f"❌ Error cargando el modelo: {str(e)}")
        return None, None

def verify_model_files(model_path='/content/drive/My Drive/models/'):
    """
    Verifica que todos los archivos del modelo existen
    """
    required_files = [
        'dga_random_forest_model.joblib',
        'dga_dictionaries.pkl',
        'model_metadata.pkl'
    ]

    print(f"🔍 Verificando archivos del modelo en: {model_path}")

    all_present = True
    for file in required_files:
        file_path = os.path.join(model_path, file)
        if os.path.exists(file_path):
            size = os.path.getsize(file_path)
            print(f"  ✅ {file} ({size:,} bytes)")
        else:
            print(f"  ❌ {file} - NO ENCONTRADO")
            all_present = False

    return all_present

if __name__ == "__main__":
    # Verificar si el modelo ya existe
    save_dga_model(model, dictionaries, model_path='/content/drive/My Drive/models/')
    if verify_model_files():
        print("📂 Modelo encontrado. Cargando...")
        model, dictionaries = quick_load_and_test()
    else:
        print("📂 Modelo no encontrado. Ejecutar entrenamiento completo...")
        # complete_training_and_saving_example()

✅ Modelo guardado en: /content/drive/My Drive/models/dga_random_forest_model.joblib
✅ Diccionarios guardados en: /content/drive/My Drive/models/dga_dictionaries.pkl
✅ Metadatos guardados en: /content/drive/My Drive/models/model_metadata.pkl

📦 Modelo completo guardado en: /content/drive/My Drive/models/
Archivos creados:
  - dga_random_forest_model.joblib
  - dga_dictionaries.pkl
  - model_metadata.pkl
🔍 Verificando archivos del modelo en: /content/drive/My Drive/models/
  ✅ dga_random_forest_model.joblib (4,793,609 bytes)
  ✅ dga_dictionaries.pkl (4,560,688 bytes)
  ✅ model_metadata.pkl (531 bytes)
📂 Modelo encontrado. Cargando...
⚡ Carga rápida del modelo DGA...
📥 Cargando modelo...
✅ Modelo cargado exitosamente
📥 Cargando diccionarios...
✅ Diccionarios cargados exitosamente
📥 Cargando metadatos...
✅ Metadatos cargados exitosamente

📊 Información del modelo:
  Tipo: RandomForestClassifier
  N° de árboles: 100
  N° de características: 16
  Tamaños de diccionarios:
    - english_dict: 

In [31]:
runs=30
families = [
    'matsnu.gz',
    'suppobox.gz',
    'charbot.gz',
    'gozi.gz',
    'manuelita.gz',
    'rovnix.gz',
    'deception.gz',
    'nymaim.gz',
    'bigviktor.gz',
    'pizd.gz',
    'ngioweb.gz'
]

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import numpy as np
import pandas as pd

def fpr_tpr(y, ypred):
    tn, fp, fn, tp = confusion_matrix(y, ypred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    return fpr, tpr

# Listas para métricas globales
all_acc, all_pre, all_rec, all_f1 = [], [], [], []
all_fpr, all_tpr, all_qt, all_qts = [], [], [], []
total_unknowns_global = 0


for family in families:
    acc = []
    pre = []
    rec = []
    f1 = []
    fpr = []
    tpr = []
    qt = []
    qts = []
    total_unknowns = 0
    for run in range(runs):
        path = f'/content/drive/My Drive/results/results_RandomForest_{family}_{run}.csv.gz'
        df = pd.read_csv(path)
        #print(df)
        y_true = (df["label"] == 'dga').astype(int)
        y_pred = (df["pred"] == 'dga').astype(int)
        #y_pred = df["pred"]

                # Métricas
        acc.append(accuracy_score(y_true, y_pred))
        pre.append(precision_score(y_true, y_pred, zero_division=0))
        rec.append(recall_score(y_true, y_pred, zero_division=0))
        f1.append(f1_score(y_true, y_pred, zero_division=0))
        fpr_val, tpr_val = fpr_tpr(y_true, y_pred)
        fpr.append(fpr_val)
        tpr.append(tpr_val)

        if 'query_time' in df.columns:
            qt.append(df['query_time'].mean())
            qts.append(df['query_time'].std())

    # Promedios por familia
    if acc:  # solo si hubo archivos válidos
        print(f'{family.split(".")[0]:15}: '
              f'acc:{np.mean(acc):.2f}±{np.std(acc):.3f} '
              f'f1:{np.mean(f1):.2f}±{np.std(f1):.3f} '
              f'pre:{np.mean(pre):.2f}±{np.std(pre):.3f} '
              f'rec:{np.mean(rec):.2f}±{np.std(rec):.3f} '
              f'FPR:{np.mean(fpr):.2f}±{np.std(fpr):.3f} '
              f'TPR:{np.mean(tpr):.2f}±{np.std(tpr):.3f} '
              f'QT:{np.mean(qt):.5f}±{np.std(qt):.5f} '
              f'Unknowns: {total_unknowns}')

        all_acc.append(np.mean(acc))
        all_pre.append(np.mean(pre))
        all_rec.append(np.mean(rec))
        all_f1.append(np.mean(f1))
        all_fpr.append(np.mean(fpr))
        all_tpr.append(np.mean(tpr))
        all_qt.append(np.mean(qt))
        all_qts.append(np.mean(qts))
        total_unknowns_global += total_unknowns

# 🔍 Métricas globales
print("\n### 📊 Métricas globales ###")
print(f'Accuracy   : {np.mean(all_acc):.2f}')
print(f'F1-Score   : {np.mean(all_f1):.2f}')
print(f'Precision  : {np.mean(all_pre):.2f}')
print(f'Recall     : {np.mean(all_rec):.2f}')
print(f'FPR        : {np.mean(all_fpr):.2f}')
print(f'TPR        : {np.mean(all_tpr):.2f}')
print(f'Query time : {np.mean(all_qt):.5f} ± {np.mean(all_qts):.5f}')
print(f'Total unknown classifications: {total_unknowns_global}')


matsnu         : acc:0.76±0.032 f1:0.70±0.050 pre:0.91±0.009 rec:0.58±0.064 FPR:0.06±0.000 TPR:0.58±0.064 QT:0.04592±0.00043 Unknowns: 0
suppobox       : acc:0.52±0.027 f1:0.16±0.086 pre:0.54±0.202 rec:0.09±0.055 FPR:0.06±0.000 TPR:0.09±0.055 QT:0.04615±0.00155 Unknowns: 0
charbot        : acc:0.47±0.002 f1:0.00±0.009 pre:0.02±0.062 rec:0.00±0.005 FPR:0.06±0.000 TPR:0.00±0.005 QT:0.04602±0.00059 Unknowns: 0
gozi           : acc:0.47±0.000 f1:0.00±0.000 pre:0.00±0.000 rec:0.00±0.000 FPR:0.06±0.000 TPR:0.00±0.000 QT:0.04623±0.00100 Unknowns: 0
manuelita      : acc:0.48±0.007 f1:0.04±0.026 pre:0.26±0.136 rec:0.02±0.015 FPR:0.06±0.000 TPR:0.02±0.015 QT:0.04633±0.00190 Unknowns: 0
rovnix         : acc:0.47±0.000 f1:0.00±0.000 pre:0.00±0.000 rec:0.00±0.000 FPR:0.06±0.000 TPR:0.00±0.000 QT:0.04659±0.00258 Unknowns: 0
deception      : acc:0.92±0.020 f1:0.92±0.022 pre:0.94±0.003 rec:0.90±0.039 FPR:0.06±0.000 TPR:0.90±0.039 QT:0.04746±0.00439 Unknowns: 0
nymaim         : acc:0.55±0.025 f1:0.26±0

In [30]:
import os
import glob

def rename_family_files():
    # Mapeo de nombres actuales a nombres nuevos
    family_mapping = {
        'matsnu': 'matsnu.gz',
        'suppobox': 'suppobox.gz',
        'charbot': 'charbot.gz',
        'gozi': 'gozi.gz',
        'manuelita': 'manuelita.gz',
        'rovnix': 'rovnix.gz',
        'deception': 'deception.gz',
        'nymaim': 'nymaim.gz',
        'NEW_bigviktor': 'bigviktor.gz',
        'NEW_pizd': 'pizd.gz',
        'NEW_ngioweb': 'ngioweb.gz'
    }

    # Cambiar al directorio results
    results_dir = '/content/drive/My Drive/results'

    if not os.path.exists(results_dir):
        print(f"Error: La carpeta '{results_dir}' no existe")
        return

    os.chdir(results_dir)

    # Contadores para estadísticas
    renamed_count = 0
    not_found_count = 0

    print("Iniciando renombrado de archivos...")
    print("-" * 50)

    # Procesar cada familia
    for old_family, new_family in family_mapping.items():
        # Buscar archivos que contengan el nombre de la familia actual
        pattern = f"*{old_family}*.csv.gz"
        matching_files = glob.glob(pattern)

        print(f"\nProcesando familia: {old_family} -> {new_family}")
        print(f"Archivos encontrados: {len(matching_files)}")

        if not matching_files:
            print(f"  ⚠️  No se encontraron archivos para la familia '{old_family}'")
            not_found_count += 1
            continue

        # Renombrar cada archivo encontrado
        for old_filename in matching_files:
            # Reemplazar el nombre de la familia en el nombre del archivo
            new_filename = old_filename.replace(old_family, new_family)

            try:
                os.rename(old_filename, new_filename)
                print(f"  ✅ {old_filename} -> {new_filename}")
                renamed_count += 1
            except Exception as e:
                print(f"  ❌ Error renombrando {old_filename}: {e}")

    # Mostrar estadísticas finales
    print("\n" + "=" * 50)
    print("RESUMEN FINAL:")
    print(f"Archivos renombrados exitosamente: {renamed_count}")
    print(f"Familias sin archivos encontrados: {not_found_count}")
    print("Proceso completado.")

    # Volver al directorio original
    os.chdir('..')

# Función alternativa que también muestra una vista previa antes de renombrar
def preview_rename_family_files():
    """Versión que muestra qué cambios se harán antes de ejecutarlos"""

    family_mapping = {
        'matsnu': 'matsnu.gz',
        'suppobox': 'suppobox.gz',
        'charbot': 'charbot.gz',
        'gozi': 'gozi.gz',
        'manuelita': 'manuelita.gz',
        'rovnix': 'rovnix.gz',
        'deception': 'deception.gz',
        'nymaim': 'nymaim.gz',
        'bigviktor': 'bigviktor.gz',
        'pizd': 'pizd.gz',
        'ngioweb': 'ngioweb.gz'
    }

    results_dir = 'results'

    if not os.path.exists(results_dir):
        print(f"Error: La carpeta '{results_dir}' no existe")
        return

    os.chdir(results_dir)

    print("VISTA PREVIA DE CAMBIOS:")
    print("=" * 60)

    changes_to_make = []

    # Mostrar vista previa
    for old_family, new_family in family_mapping.items():
        pattern = f"*{old_family}*.csv.gz"
        matching_files = glob.glob(pattern)

        if matching_files:
            print(f"\nFamilia: {old_family} -> {new_family} ({len(matching_files)} archivos)")
            for old_filename in matching_files:
                new_filename = old_filename.replace(old_family, new_family)
                print(f"  {old_filename} -> {new_filename}")
                changes_to_make.append((old_filename, new_filename))

    if not changes_to_make:
        print("No se encontraron archivos para renombrar.")
        os.chdir('..')
        return

    # Confirmar cambios
    print(f"\n¿Proceder con el renombrado de {len(changes_to_make)} archivos? (s/n): ", end="")
    confirm = input().lower().strip()

    if confirm in ['s', 'si', 'sí', 'y', 'yes']:
        print("\nEjecutando cambios...")
        renamed_count = 0

        for old_filename, new_filename in changes_to_make:
            try:
                os.rename(old_filename, new_filename)
                print(f"  ✅ {old_filename} -> {new_filename}")
                renamed_count += 1
            except Exception as e:
                print(f"  ❌ Error: {e}")

        print(f"\nCompletado: {renamed_count} archivos renombrados.")
    else:
        print("Operación cancelada.")

    os.chdir('..')

if __name__ == "__main__":
    # Descomenta la función que quieras usar:

    # Opción 1: Renombrar directamente
    rename_family_files()

    # Opción 2: Mostrar vista previa y confirmar (recomendado)
    # preview_rename_family_files()

Iniciando renombrado de archivos...
--------------------------------------------------

Procesando familia: matsnu -> matsnu.gz
Archivos encontrados: 30
  ✅ results_RandomForest_matsnu_0.csv.gz -> results_RandomForest_matsnu.gz_0.csv.gz
  ✅ results_RandomForest_matsnu_1.csv.gz -> results_RandomForest_matsnu.gz_1.csv.gz
  ✅ results_RandomForest_matsnu_2.csv.gz -> results_RandomForest_matsnu.gz_2.csv.gz
  ✅ results_RandomForest_matsnu_3.csv.gz -> results_RandomForest_matsnu.gz_3.csv.gz
  ✅ results_RandomForest_matsnu_4.csv.gz -> results_RandomForest_matsnu.gz_4.csv.gz
  ✅ results_RandomForest_matsnu_5.csv.gz -> results_RandomForest_matsnu.gz_5.csv.gz
  ✅ results_RandomForest_matsnu_6.csv.gz -> results_RandomForest_matsnu.gz_6.csv.gz
  ✅ results_RandomForest_matsnu_7.csv.gz -> results_RandomForest_matsnu.gz_7.csv.gz
  ✅ results_RandomForest_matsnu_8.csv.gz -> results_RandomForest_matsnu.gz_8.csv.gz
  ✅ results_RandomForest_matsnu_9.csv.gz -> results_RandomForest_matsnu.gz_9.csv.gz
  ✅ res