<a href="https://colab.research.google.com/github/rohitpan/datasciencecoursera/blob/master/DGN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# DGN using ngram tfidf feature vector then reduced to 100 dim and using RandomForest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import TruncatedSVD

# Step 1: Load the dataset
df = pd.read_csv('./dga_data.csv')  # Replace with the correct path to your CSV file

# Step 2: Preprocess the data
df = df.dropna(subset=['domain'])  # Remove rows where 'domain' is NaN
df['isDGA'] = df['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)  # Convert 'dga' to 1 and 'legit' to 0

# Step 3: Generate TF-IDF features
tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), max_features=5000)  # Limit to 5000 features
X = tfidf_vectorizer.fit_transform(df['domain'])
y = df['isDGA']

# Optional: Dimensionality Reduction
svd = TruncatedSVD(n_components=100, random_state=42)
X = svd.fit_transform(X)

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train a Machine Learning Model
model = RandomForestClassifier(n_estimators=50, max_depth=10, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate the Model
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.81603125
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81     16123
           1       0.80      0.84      0.82     15877

    accuracy                           0.82     32000
   macro avg       0.82      0.82      0.82     32000
weighted avg       0.82      0.82      0.82     32000



In [8]:

# NN with feature enginnering
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
df = pd.read_csv('./dga_data.csv')  # Replace with the correct path to your CSV file

# Step 2: Preprocess the data
data = df.dropna(subset=['domain'])  # Remove rows where 'domain' is NaN
data['isDGA'] = data['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)  # Convert 'dga' to 1 and 'legit' to 0



# Feature Engineering Functions
def calculate_entropy(domain):
    prob = [float(domain.count(c)) / len(domain) for c in dict.fromkeys(list(domain))]
    entropy = -sum([p * np.log2(p) for p in prob])
    return entropy

# Feature Engineering
data['domain_length'] = data['domain'].apply(len)
data['vowel_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c in 'aeiouAEIOU') / len(x))
data['consonant_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c.isalpha() and c not in 'aeiouAEIOU') / len(x))
data['digit_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c.isdigit()) / len(x))
data['hexadecimal_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c in '0123456789abcdefABCDEF') / len(x))
data['entropy'] = data['domain'].apply(calculate_entropy)
#'isDGA', 'domain', 'host', 'subclass'
# Ensure we're only using the numeric feature-engineered columns for training
X = data.drop(columns=['domain', 'isDGA','host','subclass']).values
y = data['isDGA'].values

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print out to debug what's inside X_train before conversion to tensor
print("column names",data.columns)
print("X_train sample:", X_train[:5])

# Convert data to PyTorch tensors for wide path
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Define Dataset for PyTorch
class DGADataset(Dataset):
    def __init__(self, X_wide, y):
        self.X_wide = X_wide
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_wide[idx], self.y[idx]

train_dataset = DGADataset(X_train_tensor, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define the PyTorch model (Wide Part Only)
class WideModel(nn.Module):
    def __init__(self, wide_features_dim):
        super(WideModel, self).__init__()

        # Dense layer for the wide path
        self.wide_layer = nn.Linear(wide_features_dim, 64)

        # Dense output layer
        self.output_layer = nn.Linear(64, 1)

    def forward(self, X_wide):
        # Wide path
        X_wide = self.wide_layer(X_wide)
        X_wide = F.relu(X_wide)  # Apply ReLU activation

        # Output layer
        output = self.output_layer(X_wide)
        return torch.sigmoid(output)

# Hyperparameters
wide_features_dim = X_train_tensor.shape[1]

# Initialize model, loss, and optimizer
model = WideModel(wide_features_dim)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_wide_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(X_wide_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_dataloader)}')

print("Training complete!")

# Step 6: Evaluate the Model
model.eval()
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor).squeeze()
    y_pred = (y_pred_tensor > 0.5).float().numpy()  # Convert to binary 0/1 predictions

# Calculate accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))








A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['isDGA'] = data['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)  # Convert 'dga' to 1 and 'legit' to 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['domain_length'] = data['domain'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['vowel_ratio'] = data['domain'].apply(lam

column names Index(['isDGA', 'domain', 'host', 'subclass', 'domain_length', 'vowel_ratio',
       'consonant_ratio', 'digit_ratio', 'hexadecimal_ratio', 'entropy'],
      dtype='object')
X_train sample: [[15.          0.33333333  0.66666667  0.          0.4         3.32323143]
 [ 5.          0.2         0.8         0.          0.4         2.32192809]
 [15.          0.4         0.6         0.          0.33333333  3.1068906 ]
 [18.          0.16666667  0.83333333  0.          0.11111111  3.68354236]
 [ 9.          0.          1.          0.          0.11111111  2.72548056]]
Epoch 1/5, Loss: 0.33524478199561014
Epoch 2/5, Loss: 0.30180840441679607
Epoch 3/5, Loss: 0.2960024307337315
Epoch 4/5, Loss: 0.2936482599378509
Epoch 5/5, Loss: 0.2924516286178885
Training complete!
Accuracy: 0.87065625
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.97      0.88     16123
           1       0.96      0.77      0.86     15877

    accuracy

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
df = pd.read_csv('./dga_data.csv')  # Replace with the correct path to your CSV file

# Step 2: Preprocess the data
data = df.dropna(subset=['domain'])  # Remove rows where 'domain' is NaN
data['isDGA'] = data['isDGA'].apply(lambda x: 1 if x == 'dga' else 0)  # Convert 'dga' to 1 and 'legit' to 0

def calculate_entropy(domain):
    prob = [float(domain.count(c)) / len(domain) for c in dict.fromkeys(list(domain))]
    entropy = -sum([p * np.log2(p) for p in prob])
    return entropy

# Step 3: Feature Engineering
data['domain_length'] = data['domain'].apply(len)
data['vowel_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c in 'aeiouAEIOU') / len(x))
data['consonant_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c.isalpha() and c not in 'aeiouAEIOU') / len(x))
data['digit_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c.isdigit()) / len(x))
data['hexadecimal_ratio'] = data['domain'].apply(lambda x: sum(1 for c in x if c in '0123456789abcdefABCDEF') / len(x))
data['entropy'] = data['domain'].apply(calculate_entropy)

# Step 4: Prepare data for PyTorch
X_wide = data.drop(columns=['domain', 'isDGA', 'host', 'subclass']).values
X_deep = data['domain'].values  # Handle domain strings for the deep part
y = data['isDGA'].values

# Split data into training and test sets
X_wide_train, X_wide_test, X_deep_train, X_deep_test, y_train, y_test = train_test_split(
    X_wide, X_deep, y, test_size=0.2, random_state=42)

# Convert X_wide and y into PyTorch tensors
X_wide_train_tensor = torch.tensor(X_wide_train, dtype=torch.float32)
X_wide_test_tensor = torch.tensor(X_wide_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Step 5: Define Dataset class for PyTorch
class DGADataset(Dataset):
    def __init__(self, X_wide, X_deep, y):
        self.X_wide = X_wide
        self.X_deep = X_deep
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Convert X_deep (domain names) into tensors of ordinal values
        X_deep_tensor = torch.tensor([ord(c) for c in self.X_deep[idx]], dtype=torch.long)
        return self.X_wide[idx], X_deep_tensor, self.y[idx]

# Step 6: Custom collate_fn to pad sequences in X_deep_batch
def collate_fn(batch):
    X_wide_batch, X_deep_batch, y_batch = zip(*batch)

    # Convert X_wide_batch to tensor using torch.stack
    X_wide_batch = torch.stack([torch.tensor(x_wide, dtype=torch.float32) for x_wide in X_wide_batch])

    # Convert y_batch to tensor
    y_batch = torch.tensor(y_batch, dtype=torch.float32)

    # Pad X_deep_batch sequences to the maximum length in this batch
    X_deep_batch_padded = nn.utils.rnn.pad_sequence(X_deep_batch, batch_first=True, padding_value=0)

    return X_wide_batch, X_deep_batch_padded, y_batch

# Create train dataset and dataloader with custom collate_fn
train_dataset = DGADataset(X_wide_train_tensor, X_deep_train, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Define the PyTorch model
class WideDeepModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, wide_features_dim):
        super(WideDeepModel, self).__init__()

        # Embedding layer for the deep path
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # LSTM layer for the deep path
        self.lstm = nn.LSTM(embed_dim, 64, batch_first=True)

        # Dense layer for the wide path
        self.wide_layer = nn.Linear(wide_features_dim, 64)

        # Dense output layer
        self.output_layer = nn.Linear(64 + 64, 1)

    def forward(self, X_wide, X_deep):
        # Deep path
        X_deep = self.embedding(X_deep)
        X_deep, _ = self.lstm(X_deep)
        X_deep = X_deep[:, -1, :]  # Use last output of LSTM

        # Wide path
        X_wide = self.wide_layer(X_wide)

        # Combine wide and deep paths
        X_combined = torch.cat((X_wide, X_deep), dim=1)

        # Output layer
        output = self.output_layer(X_combined)
        return torch.sigmoid(output)

# Hyperparameters
vocab_size = 256  # Assuming ASCII characters
embed_dim = 128
wide_features_dim = X_wide_train_tensor.shape[1]

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = WideDeepModel(vocab_size, embed_dim, wide_features_dim).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Step 7: Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for X_wide_batch, X_deep_batch, y_batch in train_dataloader:
        X_wide_batch, X_deep_batch, y_batch = X_wide_batch.to(device), X_deep_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_wide_batch, X_deep_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_dataloader)}')

print("Training complete!")

# Step 8: Evaluate the Model
test_dataset = DGADataset(X_wide_test_tensor, X_deep_test, y_test_tensor)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

model.eval()
with torch.no_grad():
    y_pred_list = []
    y_test_list = []
    for X_wide_batch, X_deep_batch, y_batch in test_dataloader:
        X_wide_batch, X_deep_batch = X_wide_batch.to(device), X_deep_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model(X_wide_batch, X_deep_batch).squeeze()
        y_pred = (outputs > 0.5).float()
        y_pred_list.append(y_pred.cpu().numpy())
        y_test_list.append(y_batch.cpu().numpy())

# Convert prediction lists to numpy arrays for accuracy computation
y_pred_final = np.concatenate(y_pred_list)
y_test_final = np.concatenate(y_test_list)

# Step 9: Calculate accuracy and classification report
print("Accuracy:", accuracy_score(y_test_final, y_pred_final))
print("Classification Report:")
print(classification_report(y_test_final, y_pred_final))


  X_wide_batch = torch.stack([torch.tensor(x_wide, dtype=torch.float32) for x_wide in X_wide_batch])


Epoch 1/20, Loss: 0.004032328282957473
Epoch 2/20, Loss: 3.7828380246698065e-05
Epoch 3/20, Loss: 1.3945251012834773e-05
Epoch 4/20, Loss: 6.679219688119981e-06
Epoch 5/20, Loss: 3.4957971595376855e-06
Epoch 6/20, Loss: 1.9066103509576313e-06
Epoch 7/20, Loss: 1.1010871380561713e-06
Epoch 8/20, Loss: 6.500708318230793e-07
Epoch 9/20, Loss: 3.905886675099711e-07
Epoch 10/20, Loss: 2.3148660296913648e-07
Epoch 11/20, Loss: 1.35422065459644e-07
Epoch 12/20, Loss: 8.012673984834656e-08
Epoch 13/20, Loss: 4.3150762523747905e-08
Epoch 14/20, Loss: 2.701238567352239e-08
Epoch 15/20, Loss: 1.79582306846466e-08
Epoch 16/20, Loss: 1.061529274160045e-08
Epoch 17/20, Loss: 5.824371664015516e-09
Epoch 18/20, Loss: 2.2420378756163807e-09
Epoch 19/20, Loss: 1.0418514403451722e-09
Epoch 20/20, Loss: 8.573168442790183e-10
Training complete!
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00      7751

    accuracy     