In [245]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import warnings

# Suppress every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas deprecation/future warnings - consider narrowing it.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility.
# The generators in this notebook draw from the stdlib `random` module; NumPy's global seed is set as well.
# NOTE(review): generate_training_data anchors timestamps to datetime.now(), so seeding alone
# does not make the generated dataset identical between runs.
np.random.seed(42)
random.seed(42)

In [246]:
# Note: we could generate around 1 lakh (100,000) rows, maybe.

## Configuration

In [247]:
# Log generation configuration.
# These constants are the default arguments of generate_training_data().
NUM_DAYS = 9  # Generate data over 9 days.
# Nominal baseline count per day. generate_training_data() skips about half of the logs that fall
# in hours 2-6 and adds an extra log ~20% of the time in hours 9-17, so the realized count differs.
LOGS_PER_DAY = 15000  # Average logs per day.
TOTAL_NORMAL_LOGS = NUM_DAYS * LOGS_PER_DAY  # Total normal logs.

# Anomaly configuration - spread over 3-4 day periods randomly.
# Each spike / cascade / security incident expands into many log rows; each new pattern is one row.
TOTAL_SPIKES = 8  # Total spikes across 9 days (not evenly distributed).
TOTAL_CASCADES = 3  # Total cascade failures.
TOTAL_SECURITY_INCIDENTS = 2  # Total security incidents.
TOTAL_NEW_PATTERNS = 40  # Total new error patterns (not per day).

# Services in the system.
SERVICES = [
    'auth-service',
    'payment-service',
    'order-service',
    'inventory-service',
    'user-service',
    'notification-service',
    'api-gateway'
]

# Log levels.
LOG_LEVELS = ['INFO', 'DEBUG', 'WARN', 'ERROR', 'FATAL']

# Normal log level distribution (includes some expected errors).
# Positionally aligned with LOG_LEVELS (INFO, DEBUG, WARN, ERROR, FATAL); the weights sum to 1.0.
NORMAL_LEVEL_WEIGHTS = [0.65, 0.20, 0.10, 0.045, 0.005]

# Anomaly types.
# NOTE(review): these names do not match the `anomaly_type` labels the generators actually write
# ('spike_error_spike', 'cascade_spike', 'resource_spike', 'cascade_failure', 'new_pattern',
# 'security_breach'), and this list is not referenced elsewhere in the visible notebook.
ANOMALY_TYPES = ['spike', 'cascade', 'new_pattern', 'resource_exhaustion', 'security_breach']

## Normal Log Templates

In [248]:
# Normal log message templates (including EXPECTED errors that are NOT anomalies).
# Keyed by log level; generate_normal_log() picks one template for the drawn level and fills the
# {placeholders} with str.format, so every placeholder used here must be supplied by that function.
NORMAL_TEMPLATES = {
    # Routine successful operations.
    'INFO': [
        'Request processed successfully - user_id: {user_id}, duration: {duration}ms',
        'Authentication successful for user {user_id}',
        'Payment processed - transaction_id: {transaction_id}, amount: ${amount}',
        'Order created successfully - order_id: {order_id}',
        'Cache hit for key: {cache_key}',
        'Database query completed - rows: {rows}, duration: {duration}ms',
        'API request received - endpoint: {endpoint}, method: {method}',
        'User session created - session_id: {session_id}',
        'Email notification sent to {email}',
        'Inventory updated - item_id: {item_id}, quantity: {quantity}',
    ],
    # Developer-level diagnostics.
    'DEBUG': [
        'Entering function: {function_name}',
        'Cache miss for key: {cache_key}',
        'Query plan: {query_plan}',
        'Response payload size: {size} bytes',
        'Connection pool stats: active={active}, idle={idle}',
    ],
    # Warnings that are still part of the baseline.
    'WARN': [
        'High memory usage detected: {memory}%',
        'Slow query detected - duration: {duration}ms',
        'Rate limit approaching for user {user_id}',
        'Connection retry attempt {attempt} of 3',
        'Cache eviction due to size limit',
        'Service latency above threshold: {duration}ms',
    ],
    # EXPECTED ERRORS (business logic, user errors - NOT anomalies).
    'ERROR': [
        'Authentication failed - invalid password for user {user_id}',
        'Validation error - missing required field: {field_name}',
        'Order failed - insufficient inventory for item {item_id}',
        'Payment declined - insufficient funds for user {user_id}',
        'Invalid request - malformed JSON in request body',
        'Resource not found - user_id {user_id} does not exist',
        'Session expired for user {user_id}',
        'Rate limit exceeded for user {user_id} - retry after {retry_after}s',
    ],
    # RARE but expected critical issues (NOT anomalies when isolated).
    'FATAL': [
        'Unhandled exception in request handler - {exception_type}',
    ]
}

## Anomalous Log Templates

In [249]:
# Anomalous log message templates (TRUE system anomalies).
# These represent UNUSUAL patterns, not just error-level logs.
# Keyed by anomaly category; the {placeholders} are filled with str.format by the generator
# that reads each category.
ANOMALY_TEMPLATES = {
    # Error spike patterns.
    # Read by generate_anomaly_spike().
    'spike_error': [
        'Database connection pool exhausted - max connections: {max_conn}',
        'Service unavailable - {service} not responding after {attempts} attempts',
        'Connection timeout to {host} - {error_msg}',
        'Failed to acquire database lock - deadlock detected',
    ],
    # Cascade failure patterns.
    # Read by generate_cascade_failure() and also by generate_anomaly_spike().
    'cascade': [
        'Circuit breaker opened for {service} - failure threshold exceeded',
        'Upstream service {service} unavailable - cascading failure',
        'Database cluster unreachable - all nodes down',
        'Message queue full - dropping messages',
    ],
    # New/unusual error patterns.
    # Read by generate_new_pattern_anomaly().
    'new_pattern': [
        'NullPointerException in {function_name} at line {line}',
        'OutOfMemoryError - heap space exceeded',
        'StackOverflowError in {function_name}',
        'Data corruption detected in table {table_name}',
        'Unexpected error code {error_code} from external API',
    ],
    # Resource exhaustion.
    # Read by generate_anomaly_spike().
    'resource': [
        'Disk space critically low: {disk_space}% remaining',
        'Memory usage critical: {memory}% used',
        'CPU usage sustained above 95% for {duration} seconds',
        'File descriptor limit reached',
        'Thread pool exhausted - queue size: {queue_size}',
    ],
    # Security anomalies.
    # Read by generate_security_anomaly().
    'security': [
        'Suspicious activity detected from IP: {ip_address}',
        'Brute force attack detected - {attempts} failed login attempts',
        'Potential SQL injection attempt in query',
        'Unauthorized access attempt to admin endpoint',
        'Token validation failed - possible token forgery',
    ]
}

## Helper Functions

In [250]:
def generate_timestamp_from_datetime(dt: datetime) -> str:
    """
    Convert datetime to a fixed-width ISO 8601 timestamp string ending in 'Z'.

    The fractional part is always written with six digits. Plain ``isoformat()``
    omits it when ``dt.microsecond == 0``, which yields a column of mixed-format
    strings; parsers that infer a single format from the first value (for example
    recent pandas ``to_datetime``) can reject such a column.

    Timezone-aware datetimes are converted to UTC first, so the result never
    carries both a numeric offset and the 'Z' suffix. Naive datetimes are
    formatted unchanged, i.e. they are treated as already being in UTC.

    param dt: Datetime object (naive or timezone-aware).
    return: Timestamp such as '2025-01-01T12:00:00.000000Z'.
    """
    # Local import: the notebook's import cell only brings in datetime and timedelta.
    from datetime import timezone

    if dt.tzinfo is not None and dt.utcoffset() is not None:
        # Normalize to UTC and drop tzinfo so isoformat() does not append '+00:00'.
        dt = dt.astimezone(timezone.utc).replace(tzinfo=None)

    return dt.isoformat(timespec='microseconds') + 'Z'


def generate_normal_log(timestamp: datetime, service: str) -> dict:
    """
    Build one baseline log record (label 0), which may be an expected business error.

    param timestamp: Log timestamp as datetime.
    param service: Service name.
    return: Dict with keys timestamp, level, service, message, is_anomaly, anomaly_type.
    """
    # Draw the level from the weighted baseline distribution, then a template for that level.
    (level,) = random.choices(LOG_LEVELS, weights=NORMAL_LEVEL_WEIGHTS)
    chosen_template = random.choice(NORMAL_TEMPLATES[level])

    # Every placeholder value is drawn up front, in this fixed order, whichever template was
    # chosen; str.format ignores the values the template does not mention.
    placeholders = {
        'user_id': random.randint(1000, 9999),
        'duration': random.randint(10, 500),
        'transaction_id': f'txn_{random.randint(100000, 999999)}',
        'amount': round(random.uniform(10, 1000), 2),
        'order_id': f'ord_{random.randint(100000, 999999)}',
        'cache_key': f'cache_{random.randint(1, 100)}',
        'rows': random.randint(1, 1000),
        'endpoint': random.choice(['/api/users', '/api/orders', '/api/payments']),
        'method': random.choice(['GET', 'POST', 'PUT']),
        'session_id': f'sess_{random.randint(100000, 999999)}',
        'email': f'user{random.randint(1, 1000)}@example.com',
        'item_id': f'item_{random.randint(1, 500)}',
        'quantity': random.randint(1, 100),
        'function_name': random.choice(['processPayment', 'validateUser', 'updateInventory']),
        'query_plan': 'index_scan',
        'size': random.randint(100, 50000),
        'active': random.randint(5, 20),
        'idle': random.randint(10, 50),
        'memory': random.randint(40, 75),
        'attempt': random.randint(1, 3),
        'field_name': random.choice(['email', 'password', 'amount', 'item_id']),
        'retry_after': random.randint(30, 300),
        'exception_type': random.choice(['ValueError', 'TypeError', 'KeyError']),
    }
    text = chosen_template.format(**placeholders)

    record = dict(
        timestamp=generate_timestamp_from_datetime(timestamp),
        level=level,
        service=service,
        message=text,
        is_anomaly=0,
        anomaly_type=None,
    )
    return record


def generate_anomaly_spike(start_time: datetime, service: str, spike_size: int = 50) -> list[dict]:
    """
    Produce a burst of anomalous log records that all share one message template.

    param start_time: Starting timestamp.
    param service: Service experiencing the spike.
    param spike_size: Number of errors in the spike.
    return: List of log dicts labelled is_anomaly=1 with anomaly_type '<category>_spike'.
    """
    # One category and one template are chosen for the whole burst.
    anomaly_category = random.choice(['spike_error', 'cascade', 'resource'])
    burst_template = random.choice(ANOMALY_TEMPLATES[anomaly_category])
    burst_label = f'{anomaly_category}_spike'

    def build_entry(position: int) -> dict:
        # The offset is position times a fresh random factor, so timestamps are not
        # guaranteed to increase from one entry to the next.
        offset_seconds = position * random.uniform(0.1, 2)
        moment = start_time + timedelta(seconds=offset_seconds)

        # All placeholder values are drawn in a fixed order; unused ones are ignored by format().
        values = {
            'max_conn': random.choice([50, 100, 200]),
            'service': random.choice(SERVICES),
            'attempts': random.randint(3, 10),
            'host': f'db-{random.randint(1, 5)}.example.com',
            'error_msg': random.choice(['Connection refused', 'Timeout', 'Network unreachable']),
            'function_name': random.choice(['processPayment', 'validateUser', 'updateInventory']),
            'line': random.randint(100, 500),
            'table_name': random.choice(['users', 'orders', 'payments']),
            'error_code': f'E{random.randint(1000, 9999)}',
            'disk_space': random.randint(1, 5),
            'memory': random.randint(90, 99),
            'duration': random.randint(300, 600),
            'queue_size': random.randint(10000, 50000),
            'ip_address': '.'.join(str(random.randint(1, 255)) for _ in range(4)),
        }
        text = burst_template.format(**values)

        return {
            'timestamp': generate_timestamp_from_datetime(moment),
            'level': random.choice(['ERROR', 'FATAL', 'WARN']),
            'service': service,
            'message': text,
            'is_anomaly': 1,
            'anomaly_type': burst_label
        }

    return [build_entry(position) for position in range(spike_size)]


def generate_cascade_failure(start_time: datetime, affected_services: list[str], cascade_size: int = 30) -> list[dict]:
    """
    Produce anomalous log records for a failure that spreads over several services.

    param start_time: Starting timestamp.
    param affected_services: Services affected by cascade.
    param cascade_size: Number of logs in cascade.
    return: List of log dicts labelled is_anomaly=1 with anomaly_type 'cascade_failure'.
    """
    cascade_templates = ANOMALY_TEMPLATES['cascade']
    events = []

    for step in range(cascade_size):
        delay_seconds = step * random.uniform(1, 5)
        # The emitting service comes from the affected set; the service named inside the
        # message is drawn independently from the full SERVICES list.
        emitting_service = random.choice(affected_services)
        text = random.choice(cascade_templates).format(service=random.choice(SERVICES))
        severity = random.choice(['ERROR', 'FATAL'])

        events.append(dict(
            timestamp=generate_timestamp_from_datetime(start_time + timedelta(seconds=delay_seconds)),
            level=severity,
            service=emitting_service,
            message=text,
            is_anomaly=1,
            anomaly_type='cascade_failure',
        ))

    return events


def generate_new_pattern_anomaly(timestamp: datetime, service: str) -> dict:
    """
    Build a single anomalous record carrying a previously unseen error message.

    param timestamp: Log timestamp.
    param service: Service name.
    return: Log dict labelled is_anomaly=1 with anomaly_type 'new_pattern'.
    """
    chosen_template = random.choice(ANOMALY_TEMPLATES['new_pattern'])

    # Values are drawn for every placeholder; format() ignores the ones the template lacks.
    details = {
        'function_name': random.choice(['processPayment', 'validateUser', 'updateInventory']),
        'line': random.randint(50, 500),
        'table_name': random.choice(['users', 'orders', 'payments']),
        'error_code': f'E{random.randint(1000, 9999)}',
    }
    text = chosen_template.format(**details)
    severity = random.choice(['ERROR', 'FATAL'])

    return dict(
        timestamp=generate_timestamp_from_datetime(timestamp),
        level=severity,
        service=service,
        message=text,
        is_anomaly=1,
        anomaly_type='new_pattern',
    )


def generate_security_anomaly(start_time: datetime, num_events: int = 20) -> list[dict]:
    """
    Produce a run of suspicious auth-service events that share one source IP.

    param start_time: Starting timestamp.
    param num_events: Number of security events.
    return: List of log dicts labelled is_anomaly=1 with anomaly_type 'security_breach'.
    """
    # A single random IPv4-looking address is reused for the whole incident.
    source_ip = '.'.join(str(random.randint(1, 255)) for _ in range(4))
    # The last five events are logged as ERROR, everything before them as WARN.
    first_error_index = num_events - 5

    events = []
    for event_no in range(num_events):
        offset_seconds = event_no * random.uniform(0.5, 3)
        text = random.choice(ANOMALY_TEMPLATES['security']).format(
            ip_address=source_ip,
            attempts=event_no + 1
        )
        severity = 'ERROR' if event_no >= first_error_index else 'WARN'

        events.append(dict(
            timestamp=generate_timestamp_from_datetime(start_time + timedelta(seconds=offset_seconds)),
            level=severity,
            service='auth-service',
            message=text,
            is_anomaly=1,
            anomaly_type='security_breach',
        ))

    return events

## Generate Training Data

In [251]:
def _random_event_time(start_time: datetime, day_offset: int) -> datetime:
    """Return start_time shifted by day_offset days plus a random hour (0-23) and minute (0-59)."""
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    return start_time + timedelta(days=day_offset, hours=hour, minutes=minute)


def _inject_spike(logs: list, start_time: datetime, spike_day: int) -> tuple:
    """Append one error spike on spike_day to logs and return (spike_time, service, spike_size)."""
    spike_time = _random_event_time(start_time, spike_day)
    service = random.choice(SERVICES)
    spike_size = random.randint(30, 100)
    logs.extend(generate_anomaly_spike(spike_time, service, spike_size))
    return spike_time, service, spike_size


def generate_training_data(
    num_days: int = NUM_DAYS,
    logs_per_day: int = LOGS_PER_DAY,
    total_spikes: int = TOTAL_SPIKES,
    total_cascades: int = TOTAL_CASCADES,
    total_new_patterns: int = TOTAL_NEW_PATTERNS,
    total_security_incidents: int = TOTAL_SECURITY_INCIDENTS,
    start_time: "datetime | None" = None
) -> pd.DataFrame:
    """
    Generate realistic training data over multiple days with randomly distributed anomalies.

    Baseline logs are laid out at a roughly even interval across the whole period, thinned
    during hours 2-6 and boosted during hours 9-17. Anomalies (spikes, cascades, new patterns,
    security incidents) are then injected at random times, and everything is sorted by timestamp.
    Progress and summary statistics are printed along the way.

    param num_days: Number of days to generate data for (must be at least 1).
    param logs_per_day: Average number of normal logs per day.
    param total_spikes: Total error spikes (randomly distributed).
    param total_cascades: Total cascade failures (randomly distributed).
    param total_new_patterns: Total new error patterns (randomly distributed).
    param total_security_incidents: Total security incidents (randomly distributed).
    param start_time: First instant of the generated period. Defaults to num_days before
        datetime.now(); pass a fixed value (together with a fixed random seed) to make the
        generated dataset reproducible between runs.
    return: DataFrame with columns timestamp, level, service, message, is_anomaly, anomaly_type.
    raises ValueError: If num_days is smaller than 1.
    """
    if num_days < 1:
        raise ValueError(f"num_days must be at least 1, got {num_days}")

    total_duration_minutes = num_days * 24 * 60
    num_normal = num_days * logs_per_day

    print("="*60)
    print("REALISTIC LOG DATA GENERATION - MULTI-DAY")
    print("="*60)
    print(f"\nGenerating logs over {num_days} days...")
    print(f"Normal logs (baseline): {num_normal:,}")
    print(f"Anomaly patterns (randomly distributed):")
    print(f"  - Error spikes: {total_spikes} total")
    print(f"  - Cascade failures: {total_cascades} total")
    print(f"  - New error patterns: {total_new_patterns} total")
    print(f"  - Security incidents: {total_security_incidents} total")
    print(f"\nNote: Anomalies spread randomly over 3-4 day periods (if sufficient days).")
    print()

    # Start time is num_days ago unless the caller pinned it.
    if start_time is None:
        start_time = datetime.now() - timedelta(days=num_days)
    logs = []

    # Generate normal baseline logs with realistic daily patterns.
    print("Generating baseline traffic with daily patterns...")
    # Average spacing between baseline logs; loop-invariant, so computed once.
    avg_interval = (total_duration_minutes * 60) / num_normal if num_normal > 0 else 0.0
    for i in range(num_normal):
        # Evenly spaced position plus up to +/-30% jitter.
        offset = i * avg_interval + random.uniform(-avg_interval * 0.3, avg_interval * 0.3)
        timestamp = start_time + timedelta(seconds=offset)

        # Add daily traffic patterns (lower traffic at night).
        hour = timestamp.hour
        if 2 <= hour <= 6:  # Low traffic at night.
            if random.random() < 0.5:  # Skip 50% of logs during night.
                continue
        elif 9 <= hour <= 17:  # Peak traffic during business hours.
            # Add extra logs during peak hours (same timestamp as the regular log below).
            if random.random() < 0.2:  # 20% chance of extra log.
                service = random.choice(SERVICES)
                logs.append(generate_normal_log(timestamp, service))

        service = random.choice(SERVICES)
        logs.append(generate_normal_log(timestamp, service))

        if (i + 1) % 30000 == 0:
            print(f"  Generated {i + 1:,} baseline logs...")

    print(f"✓ Baseline complete: {len(logs):,} logs (with daily patterns)\n")

    # Inject error spikes - cluster them in certain periods if enough days.
    print(f"Injecting {total_spikes} error spikes...")

    if num_days >= 4:
        # Cluster spikes in 2-3 active periods.
        print(f"(Clustered in 2-3 periods)")
        num_active_periods = random.randint(2, 3)
        # Split the spikes as evenly as possible; the first periods absorb the remainder.
        base_count, remainder = divmod(total_spikes, num_active_periods)
        spikes_per_period = [
            base_count + 1 if period < remainder else base_count
            for period in range(num_active_periods)
        ]

        spike_count = 0
        for period_idx, num_spikes_in_period in enumerate(spikes_per_period):
            # Each period spans 3-4 days.
            period_start_day = random.randint(0, max(0, num_days - 4))
            period_duration_days = min(random.randint(3, 4), num_days - period_start_day)

            print(f"\n  Active Period {period_idx + 1}: Days {period_start_day + 1}-{period_start_day + period_duration_days} ({num_spikes_in_period} spikes)")

            for _ in range(num_spikes_in_period):
                spike_day = period_start_day + random.randint(0, max(0, period_duration_days - 1))
                spike_time, service, spike_size = _inject_spike(logs, start_time, spike_day)
                spike_count += 1
                print(f"    Spike {spike_count}: {spike_size} errors on Day {spike_day + 1} at {spike_time.strftime('%Y-%m-%d %H:%M')} from {service}")
    else:
        # For short durations, distribute randomly.
        print(f"(Randomly distributed)")
        for i in range(total_spikes):
            spike_day = random.randint(0, num_days - 1)
            spike_time, service, spike_size = _inject_spike(logs, start_time, spike_day)
            print(f"  Spike {i+1}: {spike_size} errors on Day {spike_day + 1} at {spike_time.strftime('%Y-%m-%d %H:%M')} from {service}")

    print(f"\n✓ Spikes injected\n")

    # Inject cascade failures - spread randomly but may cluster.
    print(f"Injecting {total_cascades} cascade failures...")
    for i in range(total_cascades):
        cascade_day = random.randint(0, num_days - 1)
        cascade_time = _random_event_time(start_time, cascade_day)

        affected = random.sample(SERVICES, k=random.randint(3, 5))
        cascade_size = random.randint(40, 80)
        logs.extend(generate_cascade_failure(cascade_time, affected, cascade_size))
        print(f"  Cascade {i+1}: {cascade_size} errors on Day {cascade_day+1} at {cascade_time.strftime('%Y-%m-%d %H:%M')} across {len(affected)} services")

    print(f"✓ Cascades injected\n")

    # Inject new/unusual patterns - spread randomly.
    print(f"Injecting {total_new_patterns} new error patterns (spread randomly)...")
    for _ in range(total_new_patterns):
        pattern_day = random.randint(0, num_days - 1)
        pattern_time = _random_event_time(start_time, pattern_day)
        service = random.choice(SERVICES)
        logs.append(generate_new_pattern_anomaly(pattern_time, service))

    print(f"✓ New patterns injected\n")

    # Inject security incidents - usually clustered.
    print(f"Injecting {total_security_incidents} security incidents...")
    for i in range(total_security_incidents):
        incident_day = random.randint(0, num_days - 1)
        incident_time = _random_event_time(start_time, incident_day)
        logs.extend(generate_security_anomaly(incident_time, num_events=random.randint(15, 30)))
        print(f"  Security incident {i+1} on Day {incident_day+1} at {incident_time.strftime('%Y-%m-%d %H:%M')}")

    print(f"✓ Security incidents injected\n")

    # Nothing to sort or summarize: return an empty frame with the usual schema instead of
    # failing on the missing 'timestamp' column.
    if not logs:
        print("No logs were generated - returning an empty dataset.")
        return pd.DataFrame(
            columns=['timestamp', 'level', 'service', 'message', 'is_anomaly', 'anomaly_type']
        )

    # Create DataFrame and sort by timestamp.
    print("Creating dataset...")
    df = pd.DataFrame(logs)
    # Parse the timestamp strings once; the parsed column drives both the sort and the
    # per-day grouping. A stable sort keeps same-timestamp rows in generation order.
    df['timestamp_dt'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp_dt', kind='mergesort').reset_index(drop=True)
    log_dates = df['timestamp_dt'].dt.date.rename('date')
    df = df.drop('timestamp_dt', axis=1)

    # Calculate statistics.
    total_logs = len(df)
    anomalous_logs = int(df['is_anomaly'].sum())
    anomaly_rate = (anomalous_logs / total_logs) * 100

    # Show per-day statistics.
    daily_stats = df.groupby(log_dates)['is_anomaly'].agg(['count', 'sum']).reset_index()
    daily_stats.columns = ['Date', 'Total Logs', 'Anomalies']
    daily_stats['Anomaly %'] = (daily_stats['Anomalies'] / daily_stats['Total Logs'] * 100).round(2)

    print(f"\n{'='*60}")
    print("DATASET GENERATED SUCCESSFULLY")
    print(f"{'='*60}")
    print(f"Total logs: {total_logs:,}")
    print(f"Date range: {num_days} days")
    print(f"Normal logs: {total_logs - anomalous_logs:,} ({100 - anomaly_rate:.2f}%)")
    print(f"Anomalous logs: {anomalous_logs:,} ({anomaly_rate:.2f}%)")
    print(f"\nPer-Day Breakdown:")
    print(daily_stats.to_string(index=False))
    print(f"\nKey insight: Not all ERROR/FATAL logs are anomalies!")
    print(f"  - Baseline contains expected errors")
    print(f"  - Anomalies are identified by PATTERNS (spikes, cascades, new errors)")
    if num_days >= 4:
        print(f"  - Some days have more anomalies, some have fewer (realistic)")
    print()

    return df

## Generate and Inspect Data

In [252]:
# Generate the training data with realistic anomaly patterns over 9 days.
df = generate_training_data()

print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nDate Range: {df['timestamp'].min()} to {df['timestamp'].max()}")

REALISTIC LOG DATA GENERATION - MULTI-DAY

Generating logs over 9 days...
Normal logs (baseline): 135,000
Anomaly patterns (randomly distributed):
  - Error spikes: 8 total
  - Cascade failures: 3 total
  - New error patterns: 40 total
  - Security incidents: 2 total

Note: Anomalies spread randomly over 3-4 day periods (if sufficient days).

Generating baseline traffic with daily patterns...
  Generated 30,000 baseline logs...
  Generated 60,000 baseline logs...
  Generated 90,000 baseline logs...
  Generated 120,000 baseline logs...
✓ Baseline complete: 131,164 logs (with daily patterns)

Injecting 8 error spikes...
(Clustered in 2-3 periods)

  Active Period 1: Days 5-7 (3 spikes)
    Spike 1: 94 errors on Day 6 at 2025-12-23 13:29 from api-gateway
    Spike 2: 59 errors on Day 6 at 2025-12-23 06:29 from inventory-service
    Spike 3: 69 errors on Day 5 at 2025-12-21 23:52 from inventory-service

  Active Period 2: Days 1-3 (3 spikes)
    Spike 4: 63 errors on Day 2 at 2025-12-19 02

In [253]:
# Display first few rows.
print("\n" + "="*60)
print("Sample Normal Logs:")
print("="*60)
df[df['is_anomaly'] == 0].head(10)


Sample Normal Logs:


Unnamed: 0,timestamp,level,service,message,is_anomaly,anomaly_type
0,2025-12-17T23:44:21.030517Z,DEBUG,auth-service,Cache miss for key: cache_70,0,
1,2025-12-17T23:44:26.617160Z,WARN,api-gateway,Slow query detected - duration: 184ms,0,
2,2025-12-17T23:44:31.353882Z,INFO,notification-service,"Database query completed - rows: 297, duration...",0,
3,2025-12-17T23:44:38.295143Z,DEBUG,user-service,Cache miss for key: cache_89,0,
4,2025-12-17T23:44:44.889385Z,INFO,order-service,User session created - session_id: sess_665158,0,
5,2025-12-17T23:44:50.596630Z,INFO,payment-service,API request received - endpoint: /api/payments...,0,
6,2025-12-17T23:44:53.391874Z,WARN,notification-service,Rate limit approaching for user 9201,0,
7,2025-12-17T23:45:02.177521Z,DEBUG,api-gateway,Cache miss for key: cache_94,0,
8,2025-12-17T23:45:07.510855Z,INFO,notification-service,Cache hit for key: cache_58,0,
9,2025-12-17T23:45:13.789790Z,INFO,api-gateway,Email notification sent to user249@example.com,0,


In [254]:
# Display anomalous logs.
print("\n" + "="*60)
print("Sample Anomalous Logs:")
print("="*60)
df[df['is_anomaly'] == 1].head(10)


Sample Anomalous Logs:


Unnamed: 0,timestamp,level,service,message,is_anomaly,anomaly_type
1777,2025-12-18T03:06:20.548658Z,FATAL,notification-service,StackOverflowError in updateInventory,1,new_pattern
2172,2025-12-18T04:21:20.548658Z,ERROR,payment-service,StackOverflowError in validateUser,1,new_pattern
4696,2025-12-18T09:35:20.548658Z,WARN,api-gateway,Circuit breaker opened for api-gateway - failu...,1,cascade_spike
4697,2025-12-18T09:35:21.202086Z,WARN,api-gateway,Circuit breaker opened for notification-servic...,1,cascade_spike
4698,2025-12-18T09:35:21.974697Z,ERROR,api-gateway,Circuit breaker opened for user-service - fail...,1,cascade_spike
4699,2025-12-18T09:35:22.411360Z,ERROR,api-gateway,Circuit breaker opened for inventory-service -...,1,cascade_spike
4700,2025-12-18T09:35:23.290139Z,WARN,api-gateway,Circuit breaker opened for payment-service - f...,1,cascade_spike
4701,2025-12-18T09:35:24.252878Z,ERROR,api-gateway,Circuit breaker opened for order-service - fai...,1,cascade_spike
4704,2025-12-18T09:35:25.225362Z,FATAL,api-gateway,Circuit breaker opened for inventory-service -...,1,cascade_spike
4705,2025-12-18T09:35:25.358028Z,ERROR,api-gateway,Circuit breaker opened for notification-servic...,1,cascade_spike


## Data Statistics

In [255]:
# Detailed statistics.
print("\n" + "="*60)
print("DETAILED DATASET STATISTICS")
print("="*60 + "\n")

print("Class Distribution:")
print(df['is_anomaly'].value_counts())
print(f"\nAnomaly Rate: {(df['is_anomaly'].sum() / len(df)) * 100:.2f}%\n")

print("\nLog Level Distribution:")
print(df['level'].value_counts())
print()

print("\nLog Level Distribution by Anomaly Status:")
level_anomaly = pd.crosstab(df['level'], df['is_anomaly'], margins=True)
# Rename by label rather than assigning a fixed three-item list: if the dataset contains only
# one class the crosstab has two columns and a positional assignment would raise.
level_anomaly = level_anomaly.rename(columns={0: 'Normal', 1: 'Anomaly', 'All': 'Total'})
level_anomaly.columns.name = None  # Drop the 'is_anomaly' header label from the printed table.
print(level_anomaly)

print("\n" + "-"*60)
print("KEY INSIGHT: Expected Errors in Normal Logs")
print("-"*60)
# Baseline rows at ERROR/FATAL level: severe-looking, but not labelled as anomalies.
normal_errors = df[(df['is_anomaly'] == 0) & (df['level'].isin(['ERROR', 'FATAL']))]
print(f"Expected ERROR/FATAL logs (NOT anomalies): {len(normal_errors):,}")
print(f"These are business errors like:")
print("  - Invalid passwords")
print("  - Insufficient funds")
print("  - Missing fields")
print("  - Session expired")
print()

print("\nAnomaly Type Distribution:")
if 'anomaly_type' in df.columns:
    anomaly_dist = df[df['is_anomaly'] == 1]['anomaly_type'].value_counts()
    print(anomaly_dist)
    print()

print("\nService Distribution:")
print(df['service'].value_counts())


DETAILED DATASET STATISTICS

Class Distribution:
is_anomaly
0    131164
1       771
Name: count, dtype: int64

Anomaly Rate: 0.58%


Log Level Distribution:
level
INFO     85358
DEBUG    25933
WARN     13378
ERROR     6295
FATAL      971
Name: count, dtype: int64


Log Level Distribution by Anomaly Status:
       Normal  Anomaly   Total
level                         
DEBUG   25933        0   25933
ERROR    6002      293    6295
FATAL     696      275     971
INFO    85358        0   85358
WARN    13175      203   13378
All    131164      771  131935

------------------------------------------------------------
KEY INSIGHT: Expected Errors in Normal Logs
------------------------------------------------------------
Expected ERROR/FATAL logs (NOT anomalies): 6,698
These are business errors like:
  - Invalid passwords
  - Insufficient funds
  - Missing fields
  - Session expired


Anomaly Type Distribution:
anomaly_type
cascade_spike        195
cascade_failure      179
spike_error_spike  

## Save Training Data

In [256]:
# Create data directory if it doesn't exist.
import os

data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

# Save as Parquet (smaller and typed). If that fails for any reason - most commonly because the
# fastparquet engine is not installed - report the real error and fall back to CSV, so the
# training data is never silently left unsaved.
output_parquet = os.path.join(data_dir, 'training_logs.parquet')
try:
    df.to_parquet(output_parquet, index=False, engine='fastparquet')
except Exception as e:
    print(f"\n⚠ Parquet save failed ({type(e).__name__}: {e})")
    output_file = os.path.join(data_dir, 'training_logs.csv')
    df.to_csv(output_file, index=False)
    print(f"✓ Training data saved to: {output_file} (CSV fallback)")
    print(f"  File size: {os.path.getsize(output_file) / (1024*1024):.2f} MB")
else:
    print(f"\n✓ Training data saved to: {output_parquet}")
    print(f"  File size: {os.path.getsize(output_parquet) / (1024*1024):.2f} MB")


✓ Training data saved to: ../data/training_logs.parquet
  File size: 3.23 MB


## Generate Streaming Test Data (Smaller Sample)

Generate a smaller dataset for testing the streaming pipeline.

In [257]:
# Build a small 2-day dataset for validating the streaming pipeline and write it as CSV.
print("", "=" * 60, "GENERATING TEST DATASET (2 DAYS)", "=" * 60, "", sep="\n")

test_df = generate_training_data(
    num_days=2,
    logs_per_day=5000,
    total_spikes=4,
    total_cascades=1,
    total_new_patterns=10,
    total_security_incidents=1
)

# Persist the test set next to the training data and report its size and anomaly share.
test_output = os.path.join(data_dir, 'test_logs.csv')
test_df.to_csv(test_output, index=False)
print()
print(f"✓ Test data saved to: {test_output}")
print(f"  Total logs: {len(test_df):,}")
print(f"  Anomaly rate: {(test_df['is_anomaly'].sum() / len(test_df)) * 100:.2f}%")


GENERATING TEST DATASET (2 DAYS)

REALISTIC LOG DATA GENERATION - MULTI-DAY

Generating logs over 2 days...
Normal logs (baseline): 10,000
Anomaly patterns (randomly distributed):
  - Error spikes: 4 total
  - Cascade failures: 1 total
  - New error patterns: 10 total
  - Security incidents: 1 total

Note: Anomalies spread randomly over 3-4 day periods (if sufficient days).

Generating baseline traffic with daily patterns...
✓ Baseline complete: 9,705 logs (with daily patterns)

Injecting 4 error spikes...
(Randomly distributed)
  Spike 1: 32 errors on Day 1 at 2025-12-25 19:46 from api-gateway
  Spike 2: 67 errors on Day 2 at 2025-12-26 12:37 from order-service
  Spike 3: 80 errors on Day 2 at 2025-12-26 00:04 from notification-service
  Spike 4: 68 errors on Day 2 at 2025-12-26 14:37 from api-gateway

✓ Spikes injected

Injecting 1 cascade failures...
  Cascade 1: 55 errors on Day 1 at 2025-12-25 12:52 across 5 services
✓ Cascades injected

Injecting 10 new error patterns (spread ra