In [None]:
from google.colab import drive  # Importing the library to mount Google Drive
drive.mount('/content/drive')  # Mounting Google Drive in Colab environment

Mounted at /content/drive


In [None]:
%%capture
!pip install keras_self_attention

In [None]:
import pandas as pd

# File paths
train_df_file = "/content/drive/My Drive/MOE_DGA/train_wl.csv"

train_df = pd.read_csv(train_df_file)

#train_df = train_df.rename(columns={"label": "Label"})


print(train_df)

                       domain    family   label
0         nailconsiderable.ru  suppobox     dga
1            stilldelight.net  suppobox     dga
2       kimberleekatheryn.net  suppobox     dga
3                soilbeen.net  suppobox     dga
4               visitform.net  suppobox     dga
...                       ...       ...     ...
159995             dhuhaa.com     legit  notdga
159996        sdmetalcrew.org     legit  notdga
159997  melbcampcontuligol.ga     legit  notdga
159998      pl-enthusiast.net     legit  notdga
159999            rd-forum.ru     legit  notdga

[160000 rows x 3 columns]


In [None]:
import datetime
import numpy as np
import pandas as pd

from keras.callbacks import ModelCheckpoint, History
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Dropout, Embedding
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

## Charset and encoding/decoding functions
def encode(domain):
    # Convertir a minúsculas y filtrar caracteres no válidos
    domain = domain.lower()
    encoded = []
    for d in domain:
        if d in stoi:
            encoded.append(stoi[d])
        else:
            # Si el carácter no está en el charset, usar '*' como carácter desconocido
            encoded.append(stoi['*'])
    return encoded

def pad(l, amount=0, where='right', value=0):
    llen = len(l)
    if where == 'left':
        padded = [value]*(amount - llen) + l[:amount]
    if where == 'right':
        padded = l[:amount] + [value]*(amount - llen)
    return padded

# Charset expandido: incluye números, letras minúsculas, y caracteres comunes en dominios
charset = ['*'] + [chr(x) for x in range(0x30, 0x30+10)] + [chr(x) for x in range(0x61, 0x61+26)] + ['-', '_' ,'.']
stoi = {k:charset.index(k) for k in charset}
itos = {charset.index(k):k for k in charset}

print(f"Charset disponible: {''.join(charset)}")
print(f"Tamaño del vocabulario: {len(charset)}")

## Main parameters of the model
vocab_size = len(charset)
batch_size = 64
max_len = 64  # Maximum length for the domain names
embd_size = 128
lstm_size = 128
dense_size = 64
dropout = 0.5

## Data preparation function
def prepare_data(train_df):
    """
    Prepara los datos del dataframe para el entrenamiento
    train_df debe tener columnas 'domain' y 'label' (con valores 'dga' y 'notdga')
    """
    # Crear etiquetas binarias (1 para dga, 0 para notdga)
    df = train_df.copy()
    df['y'] = (df.label == 'dga').astype(int)

    # Codificar dominios
    df['encoded'] = df.domain.apply(encode)
    df['padded'] = df.encoded.apply(lambda x: pad(x, max_len, 'left'))

    # Convertir a arrays numpy
    X = np.array(list(df.padded.values))
    y = df['y'].values

    return X, y

## Callbacks para guardar el modelo y su historial de entrenamiento
def build_callbacks(save_path, monitor):
    checkpoint = ModelCheckpoint(filepath=save_path, monitor=monitor, verbose=1, save_best_only=True)
    history = History()
    callbacks = [checkpoint, history]
    return callbacks

# Crear callbacks
timestamp = str(datetime.datetime.now()).split(".")[0].replace(" ", "_")
labin_callbacks = build_callbacks(f'LABin_best_model_{timestamp}.keras', 'val_loss')

## LABin model definition - Binary classifier
LABin = Sequential()
LABin.add(Embedding(input_dim=vocab_size, output_dim=embd_size, input_length=max_len))
LABin.add(Bidirectional(LSTM(lstm_size, return_sequences=True), name="bilstm1"))
LABin.add(SeqSelfAttention(name="seqselfatt"))
LABin.add(Dropout(rate=dropout, name="drop1"))
LABin.add(Bidirectional(LSTM(lstm_size, return_sequences=True), name="bilstm2"))
LABin.add(SeqWeightedAttention(name="seqweigatt"))
LABin.add(Dropout(rate=dropout, name="drop2"))
LABin.add(Dense(dense_size, activation='relu', name="linear"))
LABin.add(Dropout(rate=dropout, name="drop3"))
LABin.add(Dense(1, activation='sigmoid', name="sigmoid"))
LABin.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

# Mostrar resumen del modelo
LABin.summary()

## Función de entrenamiento
def train_labin(train_df, epochs=50, validation_split=0.2):
    """
    Entrena el modelo LABin con el dataframe proporcionado
    """
    print("Preparando datos...")
    X, y = prepare_data(train_df)

    print(f"Datos preparados: {X.shape[0]} muestras")
    print(f"Distribución de clases: DGA={np.sum(y)}, NotDGA={len(y)-np.sum(y)}")

    print("Iniciando entrenamiento...")
    history = LABin.fit(
        X, y,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=labin_callbacks,
        validation_split=validation_split,
        verbose=1
    )

    return history

# Ejemplo de uso:
# Asumiendo que tienes tu dataframe 'train_df' con columnas 'domain' y 'label'
# history = train_labin(train_df, epochs=50)

## Función para visualizar resultados (opcional)
def plot_training_history(history):
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('LABin Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('LABin Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.savefig(f'LABin_training_history_{timestamp}.png')
    plt.show()

# Para usar después del entrenamiento:
# plot_training_history(history)

Charset disponible: *0123456789abcdefghijklmnopqrstuvwxyz-_.
Tamaño del vocabulario: 40




In [None]:
# Ejemplo de uso:
# Asumiendo que tienes tu dataframe 'train_df' con columnas 'domain' y 'label'
history = train_labin(train_df, epochs=50)


Preparando datos...
Datos preparados: 160000 muestras
Distribución de clases: DGA=80000, NotDGA=80000
Iniciando entrenamiento...
Epoch 1/50
[1m1998/2000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.7264 - loss: 0.5377
Epoch 1: val_loss improved from inf to 0.59806, saving model to LABin_best_model_2025-05-30_15:26:47.keras
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 25ms/step - accuracy: 0.7264 - loss: 0.5376 - val_accuracy: 0.7864 - val_loss: 0.5981
Epoch 2/50
[1m1999/2000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.7948 - loss: 0.4331
Epoch 2: val_loss improved from 0.59806 to 0.53677, saving model to LABin_best_model_2025-05-30_15:26:47.keras
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 24ms/step - accuracy: 0.7948 - loss: 0.4331 - val_accuracy: 0.7870 - val_loss: 0.5368
Epoch 3/50
[1m1998/2000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accura

In [None]:
## FUNCIONES PARA CARGAR EL MODELO Y HACER PREDICCIONES

def load_trained_model(model_path):
    """
    Carga el modelo entrenado desde un archivo
    """
    from keras.models import load_model
    from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

    # Cargar el modelo con las capas personalizadas
    custom_objects = {
        'SeqSelfAttention': SeqSelfAttention,
        'SeqWeightedAttention': SeqWeightedAttention
    }

    model = load_model(model_path, custom_objects=custom_objects)
    print(f"Modelo cargado desde: {model_path}")
    return model

def predict_single_domain(model, domain):
    """
    Predice si un dominio individual es DGA o no
    """
    # Preparar el dominio
    encoded = encode(domain)
    padded = pad(encoded, max_len, 'left')
    X = np.array([padded])  # Agregar dimensión batch

    # Hacer predicción
    prediction = model.predict(X, verbose=0)[0][0]

    # Interpretar resultado
    is_dga = prediction > 0.5
    confidence = prediction if is_dga else (1 - prediction)

    result = {
        'domain': domain,
        'prediction': 'DGA' if is_dga else 'LEGIT',
        'confidence': confidence,
        'raw_score': prediction
    }

    return result

def predict_domains_batch(model, domains_list):
    """
    Predice múltiples dominios a la vez
    """
    results = []

    # Preparar todos los dominios
    encoded_domains = [pad(encode(domain), max_len, 'left') for domain in domains_list]
    X = np.array(encoded_domains)

    # Hacer predicciones en lote
    predictions = model.predict(X, verbose=0)

    # Procesar resultados
    for i, domain in enumerate(domains_list):
        pred_score = predictions[i][0]
        is_dga = pred_score > 0.5
        confidence = pred_score if is_dga else (1 - pred_score)

        result = {
            'domain': domain,
            'prediction': 'DGA' if is_dga else 'LEGIT',
            'confidence': confidence,
            'raw_score': pred_score
        }
        results.append(result)

    return results

def evaluate_model_on_test(model, test_df):
    """
    Evalúa el modelo en un conjunto de test
    test_df debe tener columnas 'domain' y 'label'
    """
    print("Evaluando modelo en datos de test...")

    # Preparar datos de test
    X_test, y_test = prepare_data(test_df)

    # Hacer predicciones
    predictions = model.predict(X_test, verbose=0)
    y_pred = (predictions > 0.5).astype(int).flatten()

    # Calcular métricas
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }

## EJEMPLOS DE USO:

"""
# 1. ENTRENAR EL MODELO
history = train_labin(train_df, epochs=50)

# 2. CARGAR UN MODELO YA ENTRENADO
# Cambia 'ruta_del_modelo.keras' por la ruta real donde guardaste tu modelo
loaded_model = load_trained_model('LABin_best_model_2025-05-30_15:22:09.keras')

# 3. PROBAR UN DOMINIO INDIVIDUAL
result = predict_single_domain(loaded_model, 'google.com')
print(f"Dominio: {result['domain']}")
print(f"Predicción: {result['prediction']}")
print(f"Confianza: {result['confidence']:.4f}")

# 4. PROBAR MÚLTIPLES DOMINIOS
test_domains = [
    'google.com',
    'facebook.com',
    'xkjhsdkjfhlksdjf.com',
    'qwerty123456.net',
    'amazon.com'
]

results = predict_domains_batch(loaded_model, test_domains)
for result in results:
    print(f"{result['domain']:<30} -> {result['prediction']:<5} (confianza: {result['confidence']:.4f})")

# 5. EVALUAR EN CONJUNTO DE TEST (si tienes un test_df)
# metrics = evaluate_model_on_test(loaded_model, test_df)
"""

'\n# 1. ENTRENAR EL MODELO\nhistory = train_labin(train_df, epochs=50)\n\n# 2. CARGAR UN MODELO YA ENTRENADO\n# Cambia \'ruta_del_modelo.keras\' por la ruta real donde guardaste tu modelo\nloaded_model = load_trained_model(\'LABin_best_model_2025-05-30_15:22:09.keras\')\n\n# 3. PROBAR UN DOMINIO INDIVIDUAL\nresult = predict_single_domain(loaded_model, \'google.com\')\nprint(f"Dominio: {result[\'domain\']}")\nprint(f"Predicción: {result[\'prediction\']}")\nprint(f"Confianza: {result[\'confidence\']:.4f}")\n\n# 4. PROBAR MÚLTIPLES DOMINIOS\ntest_domains = [\n    \'google.com\',\n    \'facebook.com\', \n    \'xkjhsdkjfhlksdjf.com\',\n    \'qwerty123456.net\',\n    \'amazon.com\'\n]\n\nresults = predict_domains_batch(loaded_model, test_domains)\nfor result in results:\n    print(f"{result[\'domain\']:<30} -> {result[\'prediction\']:<5} (confianza: {result[\'confidence\']:.4f})")\n\n# 5. EVALUAR EN CONJUNTO DE TEST (si tienes un test_df)\n# metrics = evaluate_model_on_test(loaded_model, tes

In [17]:
# 2. CARGAR UN MODELO YA ENTRENADO
# Cambia 'ruta_del_modelo.keras' por la ruta real donde guardaste tu modelo
loaded_model = load_trained_model('/content/LABin_best_model_2025-05-30_15:26:47.keras')

# 3. PROBAR UN DOMINIO INDIVIDUAL
result = predict_single_domain(loaded_model, 'sadfdfdsfasds.com')
print(f"Dominio: {result['domain']}")
print(f"Predicción: {result['prediction']}")
print(f"Confianza: {result['confidence']:.4f}")

Modelo cargado desde: /content/LABin_best_model_2025-05-30_15:26:47.keras
Dominio: sadfdfdsfasds.com
Predicción: DGA
Confianza: 0.9111


In [18]:
import requests
import pandas as pd
import numpy as np
import time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import sys
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
import re

families = [
    'matsnu.gz',
    'suppobox.gz',
    'charbot.gz',
    'gozi.gz',
    'manuelita.gz',
    'rovnix.gz',
    'deception.gz',
    'nymaim.gz'
]

runs = 30

for family in families:
    print(family)
    dga = pd.read_csv(f'/content/drive/My Drive/Familias_Test/{family}', chunksize=50)
    legit = pd.read_csv('/content/drive/My Drive/Familias_Test/legit.gz', chunksize=50)
    dfs = []
    for run in range(runs):
        print(f'{run:2}/{runs}', end='\r')
        dfw = pd.concat([dga.get_chunk(), legit.get_chunk()])
        pred = []
        prob = []
        query_time = []
        results = []

        for domain_to_check in dfw.domain.values:
            st = time.time()

            result = predict_single_domain(loaded_model, domain_to_check)
            if result['prediction'] == "DGA":
                label_value = 1
            else:
                label_value = 0

            pred.append(label_value)
            query_time.append(time.time() - st)

        dfw['pred'] = pred
        # dfw['prob'] = prob  # Si tienes probabilidades, descomenta esta línea
        dfw['query_time'] = query_time
        dfw.to_csv(f'/content/drive/My Drive/results/results_Labin_{family}_{run}.csv.gz', index=False)


matsnu.gz
suppobox.gz
charbot.gz
gozi.gz
manuelita.gz
rovnix.gz
deception.gz
nymaim.gz


In [19]:
import requests
import pandas as pd
import numpy as np
import time
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import sys
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
import re

families = ['bigviktor.gz',
            'pizd.gz',
            'ngioweb.gz'

           ]

runs = 30

for family in families:
    print(family)
    dga = pd.read_csv(f'/content/drive/My Drive/New_Families/{family}', chunksize=50)
    legit = pd.read_csv('/content/drive/My Drive/Familias_Test/legit.gz', chunksize=50)
    dfs = []

    # Saltar los primeros 30 chunks de legit
    for _ in range(30):
        legit.get_chunk()



    for run in range(runs):
        print(f'{run:2}/{runs}', end='\r')
        dfw = pd.concat([dga.get_chunk(), legit.get_chunk()])
        pred = []
        prob = []
        query_time = []
        results = []

        for domain_to_check in dfw.domain.values:
            st = time.time()
            result = predict_single_domain(loaded_model, domain_to_check)
            if result['prediction'] == "DGA":
                label_value = 1
            else:
                label_value = 0

            pred.append(label_value)
            query_time.append(time.time() - st)

        dfw['pred'] = pred
        # dfw['prob'] = prob  # Si tienes probabilidades, descomenta esta línea
        dfw['query_time'] = query_time
        dfw.to_csv(f'/content/drive/My Drive/results/results_Labin_{family}_{run}.csv.gz', index=False)


bigviktor.gz
pizd.gz
ngioweb.gz


In [20]:
#"""
families = [
    'matsnu.gz',
    'suppobox.gz',
    'charbot.gz',
    'gozi.gz',
    'manuelita.gz',
    'rovnix.gz',
    'deception.gz',
    'nymaim.gz',
    'bigviktor.gz',
    'pizd.gz',
    'ngioweb.gz'
]
#"""
def fpr_tpr(y, ypred):
    tn, fp, fn, tp = confusion_matrix(y, ypred).ravel()
    fpr = fp / (fp + tn)  # False Positive Rate
    tpr = tp / (tp + fn)  # True Positive Rate (Recall)
    return fpr, tpr

for family in families:
    acc = []
    pre = []
    rec = []
    f1 = []
    fpr = []
    tpr = []
    qt = []
    qts = []
    for run in range(runs):
        df = pd.read_csv(f'/content/drive/My Drive/results/results_Labin_{family}_{run}.csv.gz')
        y = (df.label == 'dga').astype(int)
        ypred = df.pred
        acc.append(accuracy_score(y, ypred))
        pre.append(precision_score(y, ypred))
        rec.append(recall_score(y, ypred))
        f1.append(f1_score(y, ypred))
        fpr_value, tpr_value = fpr_tpr(y, ypred)
        fpr.append(fpr_value)
        tpr.append(tpr_value)
        qt.append(df.query_time.mean())
        qts.append(df.query_time.std())
#    print(f'Query time: {np.mean(qt):0.5f}+/-{np.mean(qts)}:0.5f')
    print(f'{family.split(".")[0]:15}: acc:{np.mean(acc):0.2f}±{np.std(acc):.3f} f1:{np.mean(f1):0.2f}±{np.std(f1):.3f} pre:{np.mean(pre):0.2f}±{np.std(pre):.3f} rec:{np.mean(rec):0.2f}±{np.std(rec):.3f}  FPR:{np.mean(fpr):0.2f}±{np.std(fpr):.3f} TPR:{np.mean(tpr):0.2f}±{np.std(tpr):.3f} Query time: {np.mean(qt):0.5f}±{np.mean(qts):0.5f}')


matsnu         : acc:0.93±0.032 f1:0.93±0.028 pre:0.89±0.046 rec:0.97±0.018  FPR:0.12±0.059 TPR:0.97±0.018 Query time: 0.08699±0.03077
suppobox       : acc:0.94±0.031 f1:0.94±0.027 pre:0.89±0.045 rec:1.00±0.012  FPR:0.12±0.059 TPR:1.00±0.012 Query time: 0.07804±0.02411
charbot        : acc:0.84±0.044 f1:0.83±0.046 pre:0.87±0.055 rec:0.79±0.051  FPR:0.12±0.059 TPR:0.79±0.051 Query time: 0.07832±0.02187
gozi           : acc:0.85±0.054 f1:0.84±0.056 pre:0.87±0.054 rec:0.81±0.080  FPR:0.12±0.059 TPR:0.81±0.080 Query time: 0.07945±0.02210
manuelita      : acc:0.52±0.036 f1:0.24±0.064 pre:0.57±0.131 rec:0.15±0.047  FPR:0.12±0.059 TPR:0.15±0.047 Query time: 0.07936±0.02168
rovnix         : acc:0.93±0.029 f1:0.94±0.025 pre:0.89±0.045 rec:0.98±0.017  FPR:0.12±0.059 TPR:0.98±0.017 Query time: 0.07933±0.02181
deception      : acc:0.94±0.030 f1:0.94±0.026 pre:0.90±0.045 rec:1.00±0.000  FPR:0.12±0.059 TPR:1.00±0.000 Query time: 0.08005±0.02218
nymaim         : acc:0.88±0.036 f1:0.88±0.034 pre:0.88±