In [1]:
from google.colab import drive  # Colab-only helper used to mount Google Drive
drive.mount('/content/drive')  # Make Drive contents available under /content/drive

Mounted at /content/drive


In [2]:
import pandas as pd

# File paths
train_df_file = "/content/drive/My Drive/MOE_DGA/train_wl.csv"

# Read the training CSV and rename the "label" column to "Label" in one chain.
train_df = pd.read_csv(train_df_file).rename(columns={"label": "Label"})

print(train_df)

                       domain    family   Label
0         nailconsiderable.ru  suppobox     dga
1            stilldelight.net  suppobox     dga
2       kimberleekatheryn.net  suppobox     dga
3                soilbeen.net  suppobox     dga
4               visitform.net  suppobox     dga
...                       ...       ...     ...
159995             dhuhaa.com     legit  notdga
159996        sdmetalcrew.org     legit  notdga
159997  melbcampcontuligol.ga     legit  notdga
159998      pl-enthusiast.net     legit  notdga
159999            rd-forum.ru     legit  notdga

[160000 rows x 3 columns]


In [3]:
# 📦 Instalar PyTorch si hace falta (Colab ya lo trae normalmente)
# !pip install torch torchvision scikit-learn pandas

# 📚 1. Importar librerías
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import string

# 📐 2. Definir preprocesamiento de texto
# Alphabet known to the encoder: lowercase letters, digits and "-", ".", "_".
CHARS = string.ascii_lowercase + string.digits + "-._"
# Character -> integer id, numbered from 1 because id 0 is reserved for padding.
CHAR2IDX = dict(zip(CHARS, range(1, len(CHARS) + 1)))
MAXLEN = 75  # Maximum number of domain characters kept by the encoder


def encode_domain(domain):
    """Encode a domain string as a list of exactly MAXLEN integer ids.

    The domain is lowercased and cut to its first MAXLEN characters. Each
    character is looked up in CHAR2IDX; characters that are not in the table
    map to 0. Domains shorter than MAXLEN are right-padded with 0.
    """
    clipped = domain.lower()[:MAXLEN]
    ids = [CHAR2IDX.get(ch, 0) for ch in clipped]
    ids.extend([0] * (MAXLEN - len(clipped)))
    return ids

# 🧹 3. Dataset personalizado
class DGADataset(Dataset):
    """Map-style dataset pairing encoded domains with binary labels.

    Reads the "domain" and "Label" entries of `df`. Every domain is passed
    through `encode_domain`; a label equal to the string "dga" becomes 1 and
    any other value becomes 0.
    """

    def __init__(self, df):
        self.domains = list(map(encode_domain, df["domain"]))
        self.labels = []
        for raw_label in df["Label"]:
            self.labels.append(1 if raw_label == "dga" else 0)

    def __len__(self):
        return len(self.domains)

    def __getitem__(self, idx):
        # Both tensors are int64: ids for the embedding lookup, class index for the loss.
        features = torch.tensor(self.domains[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# 🧠 4. Modelo CNN
class DGACNN(nn.Module):
    """Character-level 1-D CNN producing `num_classes` logits per domain.

    Layer order: embedding -> Conv1d (kernel 3, 64 output channels) -> ReLU
    -> MaxPool1d(2) -> flatten -> Dropout(0.3) -> Linear.
    The linear layer is sized from MAXLEN, so inputs are expected to hold
    exactly MAXLEN token ids per row.
    """

    def __init__(self, vocab_size, embedding_dim=32, num_classes=2):
        super().__init__()
        conv_channels = 64
        pooled_len = MAXLEN // 2  # sequence length left after MaxPool1d(2)
        # Id 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(embedding_dim, conv_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(conv_channels * pooled_len, num_classes)

    def forward(self, x):
        # assumes x is a (batch, MAXLEN) tensor of integer ids -- TODO confirm with callers
        embedded = self.embedding(x)
        # Conv1d wants the channel axis before the sequence axis.
        channels_first = embedded.transpose(1, 2)
        features = self.conv1(channels_first)
        features = self.relu(features)
        features = self.pool(features)
        flat = features.view(features.size(0), -1)
        return self.fc(self.dropout(flat))

# 🏋️‍♂️ 5. Función de entrenamiento
def train_model(model, dataloader, epochs=3, lr=1e-3):
    """Train `model` in place with Adam and cross-entropy loss.

    Args:
        model: ``nn.Module`` returning one row of class logits per sample.
            It must own at least one parameter (used to find its device).
        dataloader: iterable of ``(inputs, targets)`` batches whose
            ``dataset`` attribute supports ``len()``; that length is the
            denominator of the reported accuracy.
        epochs: number of full passes over ``dataloader``.
        lr: learning rate handed to Adam.

    Prints one line per epoch. The reported loss is the SUM of the batch
    losses (not a mean), and the accuracy is computed from the predictions
    made during the training forward passes of that epoch.
    """
    # Follow the device the model already lives on instead of relying on a
    # notebook-level `device` global being defined before this is called.
    model_device = next(model.parameters()).device
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()

    for epoch in range(epochs):
        total_loss, correct = 0.0, 0
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(model_device), y_batch.to(model_device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (outputs.argmax(1) == y_batch).sum().item()

        acc = correct / len(dataloader.dataset)
        print(f"📈 Epoch {epoch+1}: Loss={total_loss:.4f}, Accuracy={acc:.4f}")

# 🧪 6. Evaluación
def evaluate_model(model, dataloader):
    """Print a classification report for `model` over `dataloader`.

    Args:
        model: ``nn.Module`` returning one row of class logits per sample.
            It must own at least one parameter (used to find its device).
        dataloader: iterable of ``(inputs, targets)`` batches; targets are
            class indices where 0 is reported as "notdga" and 1 as "dga".

    The model is switched to eval mode and left in that mode on return.
    """
    # Follow the device the model already lives on instead of relying on a
    # notebook-level `device` global.
    model_device = next(model.parameters()).device
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for x, y in dataloader:
            preds = model(x.to(model_device)).argmax(dim=1)
            y_true.extend(y.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())

    print("\n📊 Classification Report:\n")
    # labels=[0, 1] pins both classes so the two target names always line up,
    # even when the evaluated data (or the predictions) contain a single class.
    print(classification_report(y_true, y_pred, labels=[0, 1], target_names=["notdga", "dga"]))

# ⚙️ 7. Preparar datos (cargar tu DataFrame aquí)
# 👇 Reemplaza esto con tu método real para cargar train_df
# train_df = pd.read_csv("tu_archivo.csv")
# o si ya está en memoria, asegúrate de que se llame 'train_df'

train_df = train_df.rename(columns={"Labels": "Label"})  # Normalise the column name (no effect if there is no "Labels" column)

# Seed PyTorch before anything stochastic runs (weight init, dropout masks,
# DataLoader shuffling) so a fresh Restart & Run All repeats the same training
# run as far as the backend allows. The same value drives the data split.
SEED = 42
torch.manual_seed(SEED)

# Stratified split on "Label": 2% of the rows are held out for evaluation.
train_data, test_data = train_test_split(train_df, test_size=0.02, stratify=train_df["Label"], random_state=SEED)
train_loader = DataLoader(DGADataset(train_data), batch_size=64, shuffle=True)
test_loader = DataLoader(DGADataset(test_data), batch_size=64)

# 🚀 8. Train and evaluate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# +1 because id 0 is reserved for padding and is not a key of CHAR2IDX.
model = DGACNN(vocab_size=len(CHAR2IDX)+1).to(device)

train_model(model, train_loader, epochs=50)
evaluate_model(model, test_loader)

# 💾 9. Save the trained weights (optional)
torch.save(model.state_dict(), "dga_cnn_model_wl.pth")



📈 Epoch 1: Loss=1227.4861, Accuracy=0.7598
📈 Epoch 2: Loss=1111.8431, Accuracy=0.7938
📈 Epoch 3: Loss=1085.7899, Accuracy=0.8011
📈 Epoch 4: Loss=1065.8432, Accuracy=0.8062
📈 Epoch 5: Loss=1053.0915, Accuracy=0.8091
📈 Epoch 6: Loss=1043.4482, Accuracy=0.8117
📈 Epoch 7: Loss=1037.5315, Accuracy=0.8147
📈 Epoch 8: Loss=1029.9196, Accuracy=0.8156
📈 Epoch 9: Loss=1029.8009, Accuracy=0.8168
📈 Epoch 10: Loss=1021.3644, Accuracy=0.8175
📈 Epoch 11: Loss=1018.4461, Accuracy=0.8186
📈 Epoch 12: Loss=1015.0625, Accuracy=0.8189
📈 Epoch 13: Loss=1011.4315, Accuracy=0.8195
📈 Epoch 14: Loss=1007.4491, Accuracy=0.8208
📈 Epoch 15: Loss=1008.6686, Accuracy=0.8221
📈 Epoch 16: Loss=1002.7266, Accuracy=0.8218
📈 Epoch 17: Loss=1003.0112, Accuracy=0.8230
📈 Epoch 18: Loss=1001.5669, Accuracy=0.8233
📈 Epoch 19: Loss=998.4341, Accuracy=0.8238
📈 Epoch 20: Loss=999.5484, Accuracy=0.8229
📈 Epoch 21: Loss=996.5214, Accuracy=0.8238
📈 Epoch 22: Loss=995.0207, Accuracy=0.8244
📈 Epoch 23: Loss=990.1331, Accuracy=0.8265
📈 

In [4]:
def predict_domain(model, domain_name):
    """Classify a single domain name and return "dga" or "notdga".

    Args:
        model: ``nn.Module`` returning class logits; index 1 is read as "dga".
            It must own at least one parameter (used to find its device).
        domain_name: domain string, encoded with `encode_domain`.

    The model is switched to eval mode and left in that mode on return.
    """
    model.eval()
    # Send the input to wherever the model lives rather than to a
    # notebook-level `device` global that may not match it.
    model_device = next(model.parameters()).device
    encoded = encode_domain(domain_name)
    # Wrap in a list to build a batch holding this one domain.
    input_tensor = torch.tensor([encoded], dtype=torch.long).to(model_device)

    with torch.no_grad():
        output = model(input_tensor)
        prediction = torch.argmax(output, dim=1).item()

    return "dga" if prediction == 1 else "notdga"


In [5]:
# Try the classifier on a single domain
test_domain = "marca.com"
result = predict_domain(model, test_domain)
print(f"🔍 El dominio '{test_domain}' fue clasificado como: {result.upper()}")


🔍 El dominio 'marca.com' fue clasificado como: NOTDGA


In [6]:
import time
from torch.utils.data import DataLoader
import gzip

def predict_batch_with_timing(model, domains):
    """Classify domains one at a time, timing every individual query.

    Args:
        model: ``nn.Module`` returning class logits for a one-row batch.
            It must own at least one parameter (used to find its device).
        domains: iterable of domain strings.

    Returns:
        ``(predictions, timings)``: two lists aligned with `domains`. Each
        prediction is the argmax class index; each timing is the elapsed
        seconds for encoding, the forward pass and copying the result back
        to the host.
    """
    model.eval()
    # Send inputs to wherever the model lives rather than to a notebook-level
    # `device` global that may not match it.
    model_device = next(model.parameters()).device
    predictions = []
    timings = []

    for domain in domains:
        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring short intervals; time.time() is a wall clock that can be
        # coarse or jump.
        start_time = time.perf_counter()

        # Encode and classify this single domain
        encoded = encode_domain(domain)
        inputs = torch.tensor([encoded], dtype=torch.long).to(model_device)

        with torch.no_grad():
            outputs = model(inputs)
            # The host copy in .cpu() is inside the timed region.
            pred = outputs.argmax(dim=1).cpu().numpy()[0]

        end_time = time.perf_counter()

        predictions.append(pred)
        timings.append(end_time - start_time)

    return predictions, timings

# Main evaluation loop: for every family, classify `runs` mixed chunks
# (50 DGA rows + 50 legit rows each) and store one result file per run.
families = [
    'matsnu.gz',
    'suppobox.gz',
    'charbot.gz',
    'gozi.gz',
    'manuelita.gz',
    'rovnix.gz',
    'deception.gz',
    'nymaim.gz'
]

runs = 30
for family in families:
    print(f"🔍 Procesando familia: {family}")
    # Chunked readers keep their files open; the with-block closes both once
    # the family is finished (or if a run fails) instead of leaking handles.
    # The legit reader is reopened per family, so every family is paired with
    # the same leading legit chunks.
    with pd.read_csv(f'/content/drive/My Drive/Familias_Test/{family}', chunksize=50) as dga_reader, \
            pd.read_csv('/content/drive/My Drive/Familias_Test/legit.gz', chunksize=50) as legit_reader:

        for run in range(runs):
            print(f" ▶️ Run {run+1}/{runs}", end="\r")
            dga_chunk = dga_reader.get_chunk()
            legit_chunk = legit_reader.get_chunk()
            df_chunk = pd.concat([dga_chunk, legit_chunk]).reset_index(drop=True)

            # Predictions and per-domain query times
            preds, times = predict_batch_with_timing(model, df_chunk["domain"].values)

            df_chunk["pred"] = preds
            df_chunk["query_time"] = times  # elapsed seconds per domain

            # The family name (".gz" included) is part of the output file name.
            df_chunk.to_csv(
                f"/content/drive/My Drive/results/results_CNN_PyTorch_{family}_{run}.csv.gz",
                index=False,
                compression="gzip"
            )


🔍 Procesando familia: matsnu.gz
🔍 Procesando familia: suppobox.gz
🔍 Procesando familia: charbot.gz
🔍 Procesando familia: gozi.gz
🔍 Procesando familia: manuelita.gz
🔍 Procesando familia: rovnix.gz
🔍 Procesando familia: deception.gz
🔍 Procesando familia: nymaim.gz


In [7]:
from torch.utils.data import DataLoader
import gzip

def predict_batch_with_timing(model, domains):
    """Classify domains one at a time, timing every individual query.

    This cell re-defines the helper with the same contract as the earlier
    definition in the notebook.

    Args:
        model: ``nn.Module`` returning class logits for a one-row batch.
            It must own at least one parameter (used to find its device).
        domains: iterable of domain strings.

    Returns:
        ``(predictions, timings)``: two lists aligned with `domains`. Each
        prediction is the argmax class index; each timing is the elapsed
        seconds for encoding, the forward pass and copying the result back
        to the host.
    """
    # This cell does not import `time` itself; importing here keeps the
    # function usable without depending on an earlier cell's import.
    import time

    model.eval()
    # Send inputs to wherever the model lives rather than to a notebook-level
    # `device` global that may not match it.
    model_device = next(model.parameters()).device
    predictions = []
    timings = []

    for domain in domains:
        # perf_counter is the monotonic, high-resolution clock meant for
        # measuring short intervals; time.time() is a wall clock that can be
        # coarse or jump.
        start_time = time.perf_counter()

        # Encode and classify this single domain
        encoded = encode_domain(domain)
        inputs = torch.tensor([encoded], dtype=torch.long).to(model_device)

        with torch.no_grad():
            outputs = model(inputs)
            # The host copy in .cpu() is inside the timed region.
            pred = outputs.argmax(dim=1).cpu().numpy()[0]

        end_time = time.perf_counter()

        predictions.append(pred)
        timings.append(end_time - start_time)

    return predictions, timings



families = [
    'bigviktor.gz',
    'pizd.gz',
    'ngioweb.gz',
]

runs = 30
# Leading 50-row legit chunks to discard before pairing with these families.
# NOTE(review): presumably these are the chunks already consumed by the
# evaluation of the first family set -- confirm.
legit_chunks_to_skip = 30
for family in families:
    print(f"🔍 Procesando familia: {family}")

    # Chunked readers keep their files open; the with-block closes both once
    # the family is finished (or if a run fails) instead of leaking handles.
    with pd.read_csv(f'/content/drive/My Drive/New_Families/{family}', chunksize=50) as dga_reader, \
            pd.read_csv('/content/drive/My Drive/Familias_Test/legit.gz', chunksize=50) as legit_reader:

        # Skip the first legit chunks
        for _ in range(legit_chunks_to_skip):
            legit_reader.get_chunk()

        for run in range(runs):
            print(f" ▶️ Run {run+1}/{runs}", end="\r")
            dga_chunk = dga_reader.get_chunk()
            legit_chunk = legit_reader.get_chunk()
            df_chunk = pd.concat([dga_chunk, legit_chunk]).reset_index(drop=True)

            # Predictions and per-domain query times
            preds, times = predict_batch_with_timing(model, df_chunk["domain"].values)

            df_chunk["pred"] = preds
            df_chunk["query_time"] = times  # elapsed seconds per domain

            # The family name (".gz" included) is part of the output file name.
            df_chunk.to_csv(
                f"/content/drive/My Drive/results/results_CNN_PyTorch_{family}_{run}.csv.gz",
                index=False,
                compression="gzip"
            )






🔍 Procesando familia: bigviktor.gz
🔍 Procesando familia: pizd.gz
🔍 Procesando familia: ngioweb.gz


In [8]:
# Families whose per-run result files are aggregated below. Each entry keeps
# its ".gz" suffix because the full string is interpolated into the result
# file name when the files are read back.
families = [
    'matsnu.gz',
    'suppobox.gz',
    'charbot.gz',
    'gozi.gz',
    'manuelita.gz',
    'rovnix.gz',
    'deception.gz',
    'nymaim.gz',
    'bigviktor.gz',
    'pizd.gz',
    'ngioweb.gz'
]

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import numpy as np
import pandas as pd

def fpr_tpr(y, ypred):
    """Return ``(false_positive_rate, true_positive_rate)`` for 0/1 labels.

    Args:
        y: true labels, with 1 as the positive class.
        ypred: predicted labels, same length as `y`.

    Returns:
        ``(fpr, tpr)`` where ``fpr = fp / (fp + tn)`` and
        ``tpr = tp / (tp + fn)``. A rate whose denominator is zero (the
        corresponding class is absent from `y`) is reported as 0.
    """
    # labels=[0, 1] forces a 2x2 matrix. Without it, inputs containing a
    # single class yield a 1x1 matrix and the 4-way unpacking below raises
    # before the zero-denominator guards can apply.
    tn, fp, fn, tp = confusion_matrix(y, ypred, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    return fpr, tpr

# Lists for the global metrics: each receives one value per family (the mean
# of that family's per-run values).
all_acc, all_pre, all_rec, all_f1 = [], [], [], []
all_fpr, all_tpr, all_qt, all_qts = [], [], [], []
total_unknowns_global = 0


# NOTE(review): `runs` is not defined in this cell; it must already exist in
# the kernel from an earlier cell.
for family in families:
    # Per-run values for the current family.
    acc = []
    pre = []
    rec = []
    f1 = []
    fpr = []
    tpr = []
    qt = []
    qts = []
    # NOTE(review): never incremented in this loop, so every "Unknowns" figure
    # printed below is 0.
    total_unknowns = 0
    for run in range(runs):
        path = f'/content/drive/My Drive/results/results_CNN_PyTorch_{family}_{run}.csv.gz'
        df = pd.read_csv(path)
        # Ground truth: 1 where the "label" column equals 'dga', otherwise 0.
        y_true = (df["label"] == 'dga').astype(int)
        y_pred = df["pred"]

        # Metrics for this run
        acc.append(accuracy_score(y_true, y_pred))
        pre.append(precision_score(y_true, y_pred, zero_division=0))
        rec.append(recall_score(y_true, y_pred, zero_division=0))
        f1.append(f1_score(y_true, y_pred, zero_division=0))
        fpr_val, tpr_val = fpr_tpr(y_true, y_pred)
        fpr.append(fpr_val)
        tpr.append(tpr_val)

        # Query-time statistics are collected only when the column is present.
        if 'query_time' in df.columns:
            qt.append(df['query_time'].mean())
            qts.append(df['query_time'].std())

    # Per-family averages (the ± value is the standard deviation across runs)
    if acc:  # only when at least one run was read
        print(f'{family.split(".")[0]:15}: '
              f'acc:{np.mean(acc):.2f}±{np.std(acc):.3f} '
              f'f1:{np.mean(f1):.2f}±{np.std(f1):.3f} '
              f'pre:{np.mean(pre):.2f}±{np.std(pre):.3f} '
              f'rec:{np.mean(rec):.2f}±{np.std(rec):.3f} '
              f'FPR:{np.mean(fpr):.2f}±{np.std(fpr):.3f} '
              f'TPR:{np.mean(tpr):.2f}±{np.std(tpr):.3f} '
              f'QT:{np.mean(qt):.5f}±{np.std(qt):.5f} '
              f'Unknowns: {total_unknowns}')

        all_acc.append(np.mean(acc))
        all_pre.append(np.mean(pre))
        all_rec.append(np.mean(rec))
        all_f1.append(np.mean(f1))
        all_fpr.append(np.mean(fpr))
        all_tpr.append(np.mean(tpr))
        all_qt.append(np.mean(qt))
        all_qts.append(np.mean(qts))
        total_unknowns_global += total_unknowns

# 🔍 Global metrics: unweighted mean of the per-family means. The query-time
# "±" is the mean of the per-family average within-run standard deviations.
print("\n### 📊 Métricas globales ###")
print(f'Accuracy   : {np.mean(all_acc):.2f}')
print(f'F1-Score   : {np.mean(all_f1):.2f}')
print(f'Precision  : {np.mean(all_pre):.2f}')
print(f'Recall     : {np.mean(all_rec):.2f}')
print(f'FPR        : {np.mean(all_fpr):.2f}')
print(f'TPR        : {np.mean(all_tpr):.2f}')
print(f'Query time : {np.mean(all_qt):.5f} ± {np.mean(all_qts):.5f}')
print(f'Total unknown classifications: {total_unknowns_global}')


matsnu         : acc:0.90±0.029 f1:0.90±0.026 pre:0.86±0.042 rec:0.95±0.033 FPR:0.15±0.055 TPR:0.95±0.033 QT:0.00043±0.00006 Unknowns: 0
suppobox       : acc:0.92±0.027 f1:0.93±0.024 pre:0.87±0.041 rec:1.00±0.004 FPR:0.15±0.055 TPR:1.00±0.004 QT:0.00043±0.00004 Unknowns: 0
charbot        : acc:0.80±0.037 f1:0.79±0.039 pre:0.83±0.051 rec:0.76±0.051 FPR:0.15±0.055 TPR:0.76±0.051 QT:0.00043±0.00003 Unknowns: 0
gozi           : acc:0.81±0.060 f1:0.80±0.071 pre:0.83±0.055 rec:0.77±0.110 FPR:0.15±0.055 TPR:0.77±0.110 QT:0.00052±0.00008 Unknowns: 0
manuelita      : acc:0.50±0.038 f1:0.23±0.060 pre:0.50±0.134 rec:0.15±0.041 FPR:0.15±0.055 TPR:0.15±0.041 QT:0.00049±0.00011 Unknowns: 0
rovnix         : acc:0.92±0.030 f1:0.92±0.026 pre:0.87±0.042 rec:0.99±0.014 FPR:0.15±0.055 TPR:0.99±0.014 QT:0.00044±0.00006 Unknowns: 0
deception      : acc:0.92±0.028 f1:0.92±0.024 pre:0.87±0.042 rec:0.99±0.012 FPR:0.15±0.055 TPR:0.99±0.012 QT:0.00042±0.00003 Unknowns: 0
nymaim         : acc:0.82±0.043 f1:0.82±0