In [1]:
# Environment check: report the attached GPU and the free disk space.
!nvidia-smi  # Check GPU availability
!df -h      # Check disk space (Colab has ~100GB)

Wed Oct 15 11:17:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
import gdown
import pandas as pd
import json
from pathlib import Path
import subprocess

In [16]:
def setup_environment(base_dir='/content/drive/MyDrive/mythesis/vicky/darpa_tc',
                      mount_point='/content/drive'):
    """Mount Google Drive and create the pipeline's working directories.

    Args:
        base_dir: Root folder under which the ``raw``, ``processed``,
            ``features`` and ``splits`` sub-directories are created. The
            default is the project folder on the mounted Drive.
        mount_point: Path handed to ``google.colab.drive.mount``.

    Returns:
        List of the four directory paths as strings, in the order
        raw, processed, features, splits.
    """
    # Imported lazily so that merely defining this function does not require
    # the google.colab package.
    from google.colab import drive
    drive.mount(mount_point)

    # Build every working directory from the single configurable root.
    directories = [
        str(Path(base_dir) / name)
        for name in ('raw', 'processed', 'features', 'splits')
    ]

    for directory in directories:
        # parents/exist_ok make the call safe to repeat on every run.
        Path(directory).mkdir(parents=True, exist_ok=True)

    print("✓ Environment setup complete")
    return directories

In [17]:
def download_darpa_engagement5(drive_folder_id, output_dir='/content/drive/MyDrive/mythesis/vicky/darpa_tc/raw'):
    """
    Download a Google Drive folder (the DARPA TC Engagement 5 data) with gdown.

    Args:
        drive_folder_id: Folder ID taken from the Google Drive URL.
        output_dir: Directory the files are saved to.

    Returns:
        The list of downloaded file paths reported by gdown, or None when the
        download failed. Failures are printed rather than raised so a notebook
        run can continue with the fallback instructions.
    """

    print("Downloading DARPA TC Engagement 5...")

    fallback_hint = "Try downloading individual files or use rclone method below"

    try:
        downloaded = gdown.download_folder(
            id=drive_folder_id,
            output=output_dir,
            quiet=False,
            use_cookies=False
        )
    except Exception as e:
        # Deliberate best-effort: report the problem and point to the fallback.
        print(f"Folder download failed: {e}")
        print(fallback_hint)
        return None

    # gdown signals some failures by returning None instead of raising, so the
    # success message must not be printed unconditionally.
    if downloaded is None:
        print("Folder download failed: gdown did not return any downloaded files")
        print(fallback_hint)
        return None

    print(f"✓ Dataset downloaded to {output_dir}")
    return downloaded

In [19]:
def setup_rclone_transfer(gdrive_folder_url):
    """
    Install rclone if it is missing and print the manual configuration steps.

    Intended as an alternative to gdown for very large transfers. The
    installer is launched through ``subprocess`` rather than an IPython ``!``
    shell escape, so the function is plain Python and can be defined outside
    a notebook as well.

    Args:
        gdrive_folder_url: URL of the Google Drive folder to copy from. It is
            only echoed in the printed instructions; rclone still has to be
            configured interactively by the user.

    Returns:
        None. Progress and instructions are printed.
    """
    import shutil  # only needed here, so imported locally

    if shutil.which("rclone") is None:
        # NOTE(security): this pipes a remotely hosted script straight into a
        # root shell. Review the script or install rclone from a package
        # manager if that is not acceptable.
        # shell=True is needed for the pipe; the command is a fixed literal,
        # so no caller-supplied text reaches the shell.
        result = subprocess.run(
            "curl https://rclone.org/install.sh | sudo bash",
            shell=True,
            capture_output=True,
            text=True,
        )
        # Relay the installer output explicitly so it appears with the other
        # messages printed by this function.
        if result.stdout:
            print(result.stdout)
        if result.returncode != 0:
            print(f"rclone installation failed (exit code {result.returncode})")
            if result.stderr:
                print(result.stderr)
    else:
        # Skipping keeps repeated runs of the notebook fast and side-effect free.
        print("rclone is already installed; skipping installation")

    # Configure rclone for Google Drive
    print("Configure rclone with your Google account:")
    print("1. Run: rclone config")
    print("2. Select 'n' for new remote")
    print("3. Name it 'gdrive'")
    print("4. Select Google Drive")
    print("5. Follow OAuth flow")
    print("\nThen use: rclone copy gdrive:path/to/dataset /content/drive/MyDrive/mythesis/vicky/darpa_tc/raw -P")
    print(f"Source folder: {gdrive_folder_url}")


In [20]:
def process_darpa_logs_streaming(input_file, chunk_size=100000):
    """
    Stream a newline-delimited JSON log and save its features in chunks.

    Each line is parsed as one JSON object, converted with
    ``extract_features`` and buffered. Every ``chunk_size`` records the buffer
    is turned into a DataFrame and handed to ``save_processed_chunk`` with a
    running chunk number (0, 1, 2, ...); any remainder is saved under the
    label ``'final'``.

    NOTE(review): ``save_processed_chunk`` is not defined in the visible part
    of this notebook — confirm it exists and how it names its output files.

    Args:
        input_file: Path of the log file, one JSON record per line.
        chunk_size: Number of records per saved chunk; must be positive.

    Returns:
        Number of records that were processed.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    print(f"Processing {input_file} in chunks of {chunk_size}...")

    chunk = []
    chunk_index = 0      # counts saved chunks, so numbering has no gaps
    processed_count = 0
    skipped_count = 0

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; nothing to report.
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                skipped_count += 1
                continue

            # extract_features looks fields up by key, so only JSON objects
            # are usable; arrays/scalars are counted as malformed.
            if not isinstance(record, dict):
                skipped_count += 1
                continue

            # Extract relevant features for APT detection
            chunk.append(extract_features(record))
            processed_count += 1

            # Flush in batches to keep memory bounded
            if len(chunk) >= chunk_size:
                save_processed_chunk(pd.DataFrame(chunk), chunk_index)
                chunk_index += 1
                chunk = []

            if processed_count % 1000000 == 0:
                print(f"Processed {processed_count:,} records...")

    # Save whatever is left after the last full chunk
    if chunk:
        save_processed_chunk(pd.DataFrame(chunk), 'final')

    if skipped_count:
        print(f"Skipped {skipped_count:,} malformed lines")

    print("✓ Streaming processing complete")
    return processed_count


In [21]:
def extract_features(record):
    """
    Flatten one provenance record into a dict of security-relevant features.

    Lookups are null-safe: if any level of a nested path is missing, is JSON
    ``null`` or is not an object, the field's default is used instead of
    raising.

    NOTE(review): the key names read here (``timestamp_nanos``,
    ``subject.uuid``, ``predicateObject.properties.*`` ...) are assumed to
    match the input records — confirm against the actual dataset export.

    Args:
        record: Parsed JSON object (dict) for a single log entry.

    Returns:
        Dict with the keys ``timestamp`` (seconds, float), ``process_id``,
        ``process_name``, ``src_ip``, ``dst_ip``, ``dst_port``, ``file_path``,
        ``operation``, ``is_privileged`` and ``sequence_id``.
    """

    def _lookup(*keys, default):
        """Follow ``keys`` through nested dicts, falling back to ``default``."""
        node = record
        for key in keys:
            if not isinstance(node, dict):
                return default
            node = node.get(key)
        # A present-but-null value is treated like a missing one.
        return default if node is None else node

    features = {
        # Temporal features (nanoseconds -> seconds)
        'timestamp': _lookup('timestamp_nanos', default=0) / 1e9,

        # Process features
        'process_id': _lookup('subject', 'uuid', default=''),
        'process_name': _lookup('subject', 'properties', 'name', default=''),

        # Network features (for container security)
        'src_ip': _lookup('predicateObject', 'properties', 'src_address', default=''),
        'dst_ip': _lookup('predicateObject', 'properties', 'dst_address', default=''),
        'dst_port': _lookup('predicateObject', 'properties', 'dst_port', default=0),

        # File operations
        'file_path': _lookup('predicateObject2', 'properties', 'path', default=''),
        'operation': _lookup('type', default=''),

        # Security-relevant flags
        'is_privileged': _lookup('subject', 'properties', 'privileged', default=False),
        'sequence_id': _lookup('sequence', default=0)
    }

    return features

In [22]:
def create_temporal_splits(processed_dir, train_ratio=0.7, val_ratio=0.15,
                           splits_dir='/content/drive/MyDrive/mythesis/vicky/darpa_tc/splits'):
    """
    Split the processed records chronologically into train/val/test sets.

    All ``*.parquet`` files in ``processed_dir`` are concatenated, sorted by
    the ``timestamp`` column and cut by position, so every training record is
    no later than any validation record, and likewise for validation vs.
    test. The three parts are written to ``train.parquet``, ``val.parquet``
    and ``test.parquet`` inside ``splits_dir``.

    Note that all chunks are loaded into memory at once.

    Args:
        processed_dir: Directory containing the processed parquet chunks.
        train_ratio: Fraction of records (earliest) used for training.
        val_ratio: Fraction of records following the training part used for
            validation; the remainder becomes the test set.
        splits_dir: Directory the split files are written to; created if
            missing.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.

    Raises:
        ValueError: If the ratios are negative or sum to more than 1, or if
            ``processed_dir`` contains no parquet files.
    """
    # Small tolerance so ratio pairs that sum to 1 up to float rounding pass.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got {train_ratio} and {val_ratio})"
        )

    # Load all processed chunks
    all_files = sorted(Path(processed_dir).glob('*.parquet'))
    if not all_files:
        raise ValueError(f"No .parquet files found in {processed_dir}")
    df = pd.concat((pd.read_parquet(f) for f in all_files), ignore_index=True)

    # Stable sort: records with equal timestamps keep their load order, so
    # repeated runs put the same rows into the same split.
    df = df.sort_values('timestamp', kind='mergesort')

    # Split temporally by position
    n = len(df)
    train_end = int(n * train_ratio)
    val_end = min(int(n * (train_ratio + val_ratio)), n)

    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]

    # Save splits
    out_dir = Path(splits_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    train_df.to_parquet(out_dir / 'train.parquet')
    val_df.to_parquet(out_dir / 'val.parquet')
    test_df.to_parquet(out_dir / 'test.parquet')

    print("✓ Splits created:")
    print(f"  Train: {len(train_df):,} records")
    print(f"  Val:   {len(val_df):,} records")
    print(f"  Test:  {len(test_df):,} records")

    return train_df, val_df, test_df

In [23]:
def create_data_loader(split_file, batch_size=256, shuffle=True):
    """
    Build a PyTorch DataLoader over the numeric features of one split file.

    Only the columns actually used (``dst_port``, ``is_privileged``,
    ``sequence_id``) are read from the parquet file, and they are converted
    to a single float32 tensor once, so fetching an item is a plain tensor
    index instead of a per-row pandas lookup.

    Args:
        split_file: Path of the parquet split to load.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles samples every epoch. Defaults
            to True (the previous fixed behavior); pass False for
            validation/test splits or when chronological order matters.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding float32 tensors of shape
        ``(batch, 3)`` in the column order dst_port, is_privileged,
        sequence_id.
    """
    import torch
    from torch.utils.data import Dataset, DataLoader

    # Add more numeric features here
    feature_columns = ['dst_port', 'is_privileged', 'sequence_id']

    class DARPADataset(Dataset):
        """In-memory dataset exposing one feature vector per record."""

        def __init__(self, parquet_file):
            frame = pd.read_parquet(parquet_file, columns=feature_columns)
            # Re-select to pin the column order, then convert once up front.
            values = frame[feature_columns].to_numpy(dtype='float32')
            self.features = torch.tensor(values)

        def __len__(self):
            return self.features.shape[0]

        def __getitem__(self, idx):
            return self.features[idx]

    dataset = DARPADataset(split_file)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return loader

In [24]:
def main():
    """Run the environment setup and print the remaining manual pipeline steps."""

    banner = "=== DARPA TC Dataset Preparation Pipeline ===\n"
    print(banner)

    # Step 1: mount Drive and make sure the working directories exist
    setup_environment()

    # Step 2: dataset download (left disabled until explicitly enabled below)
    # The folder ID is the last path component of the Google Drive URL
    # https://drive.google.com/drive/folders/1okt4AYElyBohW4XiOBqmsvjwXsnUjLVf
    folder_id = "1okt4AYElyBohW4XiOBqmsvjwXsnUjLVf"

    size_warning = (
        "\n⚠️  DARPA Engagement 5 is very large (100GB+)",
        "Recommended approach:",
        "1. Download a subset first (e.g., one day's data)",
        "2. Test your pipeline",
        "3. Scale to full dataset once validated\n",
    )
    print(*size_warning, sep="\n")

    # Uncomment when ready to download
    # download_darpa_engagement5(folder_id)

    next_steps = (
        "Pipeline setup complete!",
        "Next steps:",
        "1. Download subset of data",
        "2. Run: process_darpa_logs_streaming('path/to/log.json')",
        "3. Run: create_temporal_splits('/content/drive/MyDrive/mythesis/vicky/darpa_tc/processed')",
        "4. Run: create_data_loader('/content/drive/MyDrive/mythesis/vicky/darpa_tc/splits/train.parquet')",
    )
    for message in next_steps:
        print(message)


if __name__ == "__main__":
    main()

=== DARPA TC Dataset Preparation Pipeline ===

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Environment setup complete

⚠️  DARPA Engagement 5 is very large (100GB+)
Recommended approach:
1. Download a subset first (e.g., one day's data)
2. Test your pipeline
3. Scale to full dataset once validated

Pipeline setup complete!
Next steps:
1. Download subset of data
2. Run: process_darpa_logs_streaming('path/to/log.json')
3. Run: create_temporal_splits('/content/drive/MyDrive/mythesis/vicky/darpa_tc/processed')
4. Run: create_data_loader('/content/drive/MyDrive/mythesis/vicky/darpa_tc/splits/train.parquet')
