## DNN classifier trained on the features extracted by Module 1 (ViT-B/16 + ResNet-50 + calibration)

In [None]:
'''
def get_req_set(path):
    df = pd.read_csv(path)
    features_df = df['features'].str.strip('[]').str.split(',', expand=True)
    features_df = features_df.astype(float)
    features_df.columns = [f'feature_{i}' for i in range(features_df.shape[1])]
    df_expanded = pd.concat([features_df, df['label']], axis=1)
    X = df_expanded.drop(columns=['label'])
    y = df_expanded['label']
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.long)
    dataset = TensorDataset(X_tensor, y_tensor)
    temp_loader = DataLoader(dataset, batch_size=32, shuffle=True)
    return temp_loader

'''

# This part loads the Module 1 features saved in CSV files, then trains and evaluates the DNN on them

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [None]:
# Load dataset and prepare DataLoader
def get_req_set(path, batch_size=32, shuffle=True):
    """Build a DataLoader from a CSV of serialized feature vectors.

    The CSV must provide a ``features`` column in which every vector is
    stored as a bracketed, comma-separated string (e.g. ``"[0.1, 0.2]"``)
    and a ``label`` column with the class id of each row.

    Args:
        path: Location of the CSV file to read.
        batch_size: Number of samples per batch. Defaults to 32, the value
            that used to be hard-coded.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to True (the previous fixed behaviour); pass False for
            evaluation sets when a stable sample order is wanted.

    Returns:
        A DataLoader over a TensorDataset of ``(features, labels)``, with
        features as float32 and labels as int64 (``torch.long``).
    """
    df = pd.read_csv(path)

    # "[a, b, c]" -> one float column per vector component.
    features = (
        df['features']
        .str.strip('[]')
        .str.split(',', expand=True)
        .astype(float)
    )

    X_tensor = torch.tensor(features.to_numpy(), dtype=torch.float32)
    y_tensor = torch.tensor(df['label'].to_numpy(), dtype=torch.long)
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Training and validation splits of the saved feature CSVs
train_loader = get_req_set('/scratch/user/nkolloju/GENAI/WildRF_Features/train_features.csv')
val_loader = get_req_set('/scratch/user/nkolloju/GENAI/WildRF_Features/val_features.csv')

# One loader per test set, keyed by the name used when reporting results
test_loaders = {
    platform: get_req_set(csv_path)
    for platform, csv_path in (
        ('Facebook', '/scratch/user/nkolloju/GENAI/WildRF_Features/facebook_features.csv'),
        ('Reddit', '/scratch/user/nkolloju/GENAI/WildRF_Features/reddit_features.csv'),
        ('Twitter', '/scratch/user/nkolloju/GENAI/WildRF_Features/twitter_features.csv'),
    )
}

# Define the DNN model
class DNN(nn.Module):
    """Fully connected classifier with two hidden layers.

    Data flows through ``fc1 -> ReLU -> dropout1 -> fc2 -> ReLU -> dropout2
    -> fc3``; the output of ``fc3`` is returned as-is (raw scores, no
    softmax).

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim_1: Width of the first hidden layer.
        hidden_dim_2: Width of the second hidden layer.
        output_dim: Number of output scores per sample.
        dropout_prob: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, output_dim, dropout_prob=0.2):
        super().__init__()
        # The attribute names below are also the state_dict key prefixes,
        # so they must stay stable for saved checkpoints to load.
        self.fc1 = nn.Linear(input_dim, hidden_dim_1)
        self.relu = nn.ReLU()  # stateless, shared by both hidden layers
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.dropout2 = nn.Dropout(p=dropout_prob)
        self.fc3 = nn.Linear(hidden_dim_2, output_dim)

    def forward(self, x):
        """Return the raw output scores for a batch of feature vectors."""
        hidden = self.dropout1(self.relu(self.fc1(x)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Training function
def train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and report training metrics.

    Predictions used for the metrics come from the same forward pass that
    is used for the gradient step, i.e. they are produced with the model in
    training mode and before that batch's weight update.

    Args:
        model: Network to train; called as ``model(batch_X)``.
        loader: Iterable of ``(batch_X, batch_y)`` pairs that supports
            ``len()`` (number of batches).
        criterion: Loss callable taking ``(outputs, batch_y)``.
        optimizer: Optimizer stepping the model's parameters.

    Returns:
        Tuple ``(loss, accuracy, precision, recall, f1)`` where ``loss`` is
        the mean of the per-batch losses, ``accuracy`` is a percentage, and
        the remaining three are binary-averaged sklearn scores.
    """
    model.train()
    running_loss = 0.0
    y_true, y_pred = [], []

    for batch_X, batch_y in loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)

        # tolist() yields plain Python ints and, unlike .numpy(), does not
        # require the tensors to live on the CPU.
        y_true.extend(batch_y.tolist())
        y_pred.extend(predicted.tolist())

    loss = running_loss / len(loader)
    accuracy = accuracy_score(y_true, y_pred) * 100
    # zero_division=0 keeps the score at 0.0 when a class is never predicted
    # or never present, without emitting an UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
    return loss, accuracy, precision, recall, f1

# Validation function
def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating its weights.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass.

    Args:
        model: Network to evaluate; called as ``model(batch_X)``.
        loader: Iterable of ``(batch_X, batch_y)`` pairs that supports
            ``len()`` (number of batches).
        criterion: Loss callable taking ``(outputs, batch_y)``.

    Returns:
        Tuple ``(loss, accuracy, precision, recall, f1)`` where ``loss`` is
        the mean of the per-batch losses, ``accuracy`` is a percentage, and
        the remaining three are binary-averaged sklearn scores.
    """
    model.eval()
    running_loss = 0.0
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch_X, batch_y in loader:
            outputs = model(batch_X)
            running_loss += criterion(outputs, batch_y).item()
            predicted = outputs.argmax(dim=1)

            # tolist() yields plain Python ints and, unlike .numpy(), does
            # not require the tensors to live on the CPU.
            y_true.extend(batch_y.tolist())
            y_pred.extend(predicted.tolist())

    loss = running_loss / len(loader)
    accuracy = accuracy_score(y_true, y_pred) * 100
    # zero_division=0 keeps the score at 0.0 when a class is never predicted
    # or never present, without emitting an UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
    return loss, accuracy, precision, recall, f1

# Initialize model, criterion, and optimizer
# input_dim is the size of the first Linear layer's input, so it has to match
# the length of the feature vectors yielded by the loaders
# (768 is presumably the Module 1 embedding size — confirm against the extractor).
input_dim = 768
hidden_dim_1 = 128
hidden_dim_2 = 256
output_dim = 2  # one output score per class
model = DNN(input_dim, hidden_dim_1, hidden_dim_2, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Lists to store metrics (one entry per completed epoch; plotted afterwards)
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

# Training loop
num_epochs = 30  # upper bound; early stopping may end training sooner
patience = 3  # consecutive epochs without a new best validation loss before stopping
best_val_loss = float('inf')  # guarantees the first epoch counts as an improvement
epochs_no_improve = 0

for epoch in range(num_epochs):
    # One pass of weight updates, then an evaluation pass on the validation set.
    train_loss, train_acc, train_prec, train_rec, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc, val_prec, val_rec, val_f1 = validate(model, val_loader, criterion)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch + 1}/{num_epochs}: "
          f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%, "
          f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.2f}%")

    # Early stopping
    if val_loss < best_val_loss:
        # New best validation loss: reset the counter and overwrite the checkpoint,
        # so the file on disk always holds the best weights seen so far.
        best_val_loss = val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), "best_model_module1.pth")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

            
# Plot loss and accuracy curves
# Single source of truth for the output file, so the confirmation message
# below always names the file that was actually written.
curves_path = "training_validation_curves_module1_WildRf.png"

plt.figure(figsize=(12, 6))

# Left panel: per-epoch training vs. validation loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Curves')
plt.legend()

# Right panel: per-epoch training vs. validation accuracy (in percent)
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy Curves')
plt.legend()

plt.tight_layout()
plt.savefig(curves_path)
print(f"Training and validation curves saved as '{curves_path}'.")
# Test model on different test sets

# Restore the checkpoint written by the training loop (saved each time the
# validation loss reached a new minimum) before evaluating.
# weights_only=True limits unpickling to tensors and plain containers, which
# is all a state_dict needs, and avoids the torch.load FutureWarning about
# the unsafe pickle default.
model.load_state_dict(torch.load("best_model_module1.pth", weights_only=True))
print("\nTesting on individual test sets:")
for name, loader in test_loaders.items():
    test_loss, test_acc, test_prec, test_rec, test_f1 = validate(model, loader, criterion)
    print(f"{name} - Loss: {test_loss:.4f}, Accuracy: {test_acc:.2f}%, "
          f"Precision: {test_prec:.2f}, Recall: {test_rec:.2f}, F1 Score: {test_f1:.2f}")

Epoch 1/30: Train Loss: 0.6937, Train Accuracy: 51.95%, Val Loss: 0.6859, Val Accuracy: 50.00%
Epoch 2/30: Train Loss: 0.6128, Train Accuracy: 69.47%, Val Loss: 0.4327, Val Accuracy: 81.66%
Epoch 3/30: Train Loss: 0.3712, Train Accuracy: 84.11%, Val Loss: 0.3344, Val Accuracy: 86.93%
Epoch 4/30: Train Loss: 0.3193, Train Accuracy: 86.36%, Val Loss: 0.2890, Val Accuracy: 86.43%
Epoch 5/30: Train Loss: 0.2760, Train Accuracy: 88.90%, Val Loss: 0.2992, Val Accuracy: 87.69%
Epoch 6/30: Train Loss: 0.2777, Train Accuracy: 88.72%, Val Loss: 0.3000, Val Accuracy: 87.44%
Epoch 7/30: Train Loss: 0.2638, Train Accuracy: 89.75%, Val Loss: 0.2510, Val Accuracy: 90.20%
Epoch 8/30: Train Loss: 0.2217, Train Accuracy: 91.41%, Val Loss: 0.2758, Val Accuracy: 89.95%
Epoch 9/30: Train Loss: 0.2045, Train Accuracy: 92.85%, Val Loss: 0.2536, Val Accuracy: 89.95%
Epoch 10/30: Train Loss: 0.1924, Train Accuracy: 92.92%, Val Loss: 0.2656, Val Accuracy: 88.19%
Early stopping triggered.
Training and validation

  model.load_state_dict(torch.load("best_model_module2.pth"))


Reddit - Loss: 0.2400, Accuracy: 91.20%, Precision: 0.92, Recall: 0.90, F1 Score: 0.91
Twitter - Loss: 0.3862, Accuracy: 82.90%, Precision: 0.94, Recall: 0.79, F1 Score: 0.86
