In [14]:
import pandas as pd
import numpy as np
from google.colab import files
from datetime import datetime, timedelta

# --- 1. Define Simulation Parameters ---
# The simulated window starts at midnight on `start_date` and spans `n_days` days.
start_date = datetime(year=2025, month=6, day=1)
n_days = 15

# Entities that appear in the generated tables.
routers = [f'Router_{suffix}' for suffix in 'ABC']
applications = [
    'Email',
    'Video Streaming',
    'File Transfer',
    'Web Browsing',
    'VoIP',
]
users = [f'User{number}' for number in range(1, 5)]

# --- 2. Generate Base Hourly Router Logs ---
print("Generating new realistic router logs...")
# One timestamp per hour across the whole simulation window.
time_index = pd.to_datetime(
    [start_date + timedelta(hours=offset) for offset in range(n_days * 24)]
)


def _simulate_router_reading(device, stamp):
    """Draw one synthetic hourly log row for `device` at `stamp`.

    The random draws are made in a fixed order (base load, time-of-day
    multiplier, traffic noise, latency noise, peak latency penalty,
    bandwidth factor), so a seeded run produces the same sequence.
    """
    # Weekday office hours get the heaviest load; evenings a lighter bump.
    weekday_peak = 9 <= stamp.hour < 17 and stamp.dayofweek < 5
    evening = 18 <= stamp.hour < 23

    volume = np.random.uniform(10, 30)
    if weekday_peak:
        volume *= np.random.uniform(2.5, 4.0)
    elif evening:
        volume *= np.random.uniform(1.5, 2.5)
    volume = volume + np.random.normal(0, 5)

    # Latency grows with traffic, plus noise and an extra penalty at peak.
    delay = 15 + (volume / 10) + np.random.normal(0, 5)
    if weekday_peak:
        delay += np.random.uniform(5, 15)

    used = volume * np.random.uniform(0.95, 1.1)

    # Floors keep the noisy draws from producing implausibly small readings.
    return {
        'Timestamp': stamp,
        'Device Name': device,
        'Traffic Volume (MB/s)': round(max(5, volume), 2),
        'Latency (ms)': round(max(10, delay), 2),
        'Bandwidth Allocated (MB/s)': 100,
        'Bandwidth Used (MB/s)': round(max(5, used), 2),
    }


# Router-major order: every hour for the first router, then the next router.
new_log_data = [
    _simulate_router_reading(router_name, hour_stamp)
    for router_name in routers
    for hour_stamp in time_index
]

new_routers_df = pd.DataFrame(new_log_data)
# Midnight-normalised day of each reading; used as the key for the daily joins.
new_routers_df['Date'] = pd.to_datetime(new_routers_df['Timestamp'].dt.date)

# --- 3. Generate Daily Summary Data (Corrected) ---
print("Generating daily application, user, and event data...")
# One midnight timestamp per simulated day.
date_range = pd.to_datetime(
    [start_date + timedelta(days=offset) for offset in range(n_days)]
)

# App Usage (Corrected)
app_data = []
for day in date_range:
    for app_name in applications:
        mean_mb = np.random.uniform(20, 150)
        # Peak is drawn as a 1.5x-4x multiple of the same row's average.
        peak_mb = mean_mb * np.random.uniform(1.5, 4.0)
        app_record = {
            'Date': day,
            'Application': app_name,
            'Average_Traffic_MB': round(mean_mb, 2),
            'Peak_Traffic_MB': round(peak_mb, 2),
        }
        app_data.append(app_record)

# Collapse the per-application rows into one totals row per day.
apps_by_day = pd.DataFrame(app_data).groupby('Date')
daily_app_summary = apps_by_day.agg(
    total_avg_app_traffic=('Average_Traffic_MB', 'sum'),
    total_peak_app_traffic=('Peak_Traffic_MB', 'sum'),
)
daily_app_summary = daily_app_summary.reset_index()

# User Activity (Corrected)
user_data = []
for day in date_range:
    for user_name in users:
        logins = np.random.randint(0, 6)
        # A user who never logged in that day has no usage; the usage draw
        # is skipped entirely in that case.
        if logins > 0:
            peak_mb = np.random.uniform(5, 50)
        else:
            peak_mb = 0
        user_record = {
            'Date': day,
            'User': user_name,
            'Login_Count': logins,
            'Peak_Usage_MB': round(peak_mb, 2),
        }
        user_data.append(user_record)

# Collapse the per-user rows into one totals row per day.
users_by_day = pd.DataFrame(user_data).groupby('Date')
daily_user_summary = users_by_day.agg(
    total_logins=('Login_Count', 'sum'),
    total_peak_user_usage=('Peak_Usage_MB', 'sum'),
)
daily_user_summary = daily_user_summary.reset_index()

# Config History & External Factors
# Each entry: (days after start_date, router that had one config change).
config_data = [
    {
        'Date': start_date + timedelta(days=day_offset),
        'Device Name': device,
        'Num_Config_Changes': 1,
    }
    for day_offset, device in [(3, 'Router_B'), (8, 'Router_A')]
]
daily_config_summary = pd.DataFrame(config_data)
daily_config_summary['Date'] = pd.to_datetime(daily_config_summary['Date'])

# Each entry: (days after start_date, event label, impact level).
external_data = [
    {
        'Date': start_date + timedelta(days=day_offset),
        'Event': event_name,
        'Impact': impact_level,
    }
    for day_offset, event_name, impact_level in [
        (5, 'External Outage', 'High'),
        (10, 'Scheduled Maintenance', 'Medium'),
    ]
]
external_factors_df = pd.DataFrame(external_data)
external_factors_df['Date'] = pd.to_datetime(external_factors_df['Date'])

# --- 4. Merge All Data and Create Target Flag ---
print("Merging all datasets...")
# Each right-hand table is expected to hold at most one row per join key.
# validate='many_to_one' makes pandas raise MergeError if that ever stops
# being true, instead of silently duplicating hourly router rows.
merged_df = pd.merge(new_routers_df, daily_app_summary, on='Date', how='left',
                     validate='many_to_one')
merged_df = pd.merge(merged_df, daily_user_summary, on='Date', how='left',
                     validate='many_to_one')
merged_df = pd.merge(merged_df, external_factors_df, on='Date', how='left',
                     validate='many_to_one')
merged_df = pd.merge(merged_df, daily_config_summary, on=['Date', 'Device Name'], how='left',
                     validate='many_to_one')

# Days/routers with no event or config change come out of the left joins as
# NaN; replace them with explicit sentinels.
# NOTE(review): recent pandas versions appear to treat the literal string
# 'None' as missing by default in read_csv - confirm how consumers of the
# saved CSV load it.
merged_df = merged_df.fillna({'Event': 'None', 'Impact': 'None', 'Num_Config_Changes': 0})

# Apply event effects. The scaled values are re-rounded so these rows keep the
# same 2-decimal precision as every other generated metric.
outage_rows = merged_df['Event'] == 'External Outage'
merged_df.loc[outage_rows, 'Latency (ms)'] = (
    merged_df.loc[outage_rows, 'Latency (ms)'] * 2.5
).round(2)

# NOTE(review): only traffic volume is reduced during maintenance;
# 'Bandwidth Used (MB/s)' is left untouched - confirm that is intended.
maintenance_rows = merged_df['Event'] == 'Scheduled Maintenance'
merged_df.loc[maintenance_rows, 'Traffic Volume (MB/s)'] = (
    merged_df.loc[maintenance_rows, 'Traffic Volume (MB/s)'] * 0.1
).round(2)

def create_new_flag(row):
    """Return 1 if a log row should be flagged, otherwise 0.

    A row is flagged when any of the following holds:
      * bandwidth utilization (used / allocated) is above 0.85;
      * utilization is above 0.7 and latency is above 60 ms;
      * the row's event is 'External Outage'.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or a dict) providing
            'Bandwidth Used (MB/s)', 'Bandwidth Allocated (MB/s)',
            'Latency (ms)' and 'Event'.

    Returns:
        int: 1 when flagged, 0 otherwise.
    """
    used = row['Bandwidth Used (MB/s)']
    allocated = row['Bandwidth Allocated (MB/s)']

    if allocated == 0:
        # Avoid dividing by zero: with no allocation, any usage counts as
        # unbounded utilization, and no usage counts as none.
        utilization = float('inf') if used > 0 else 0.0
    else:
        utilization = used / allocated

    if utilization > 0.85 or (utilization > 0.7 and row['Latency (ms)'] > 60) or row['Event'] == 'External Outage':
        return 1
    return 0
# Label every row with the target flag computed by create_new_flag.
merged_df['New_Flag'] = merged_df.apply(create_new_flag, axis=1)

# --- 5. Final Sorting and Saving ---
file_name = 'new_realistic_dataset_v2.csv'
# Chronological order, with routers in name order within each hour.
sort_keys = ['Timestamp', 'Device Name']
final_df = merged_df.sort_values(by=sort_keys)
final_df = final_df.reset_index(drop=True)
final_df.to_csv(file_name, index=False)

print(f"\nSuccessfully generated the corrected dataset '{file_name}'!")
print("Here's a preview including the previously missed columns:")
preview_columns = [
    'Timestamp',
    'Device Name',
    'total_avg_app_traffic',
    'total_peak_user_usage',
    'New_Flag',
]
print(final_df[preview_columns].head(10))

# Colab-only step: hands the saved CSV to the browser as a download.
print("\nStarting download...")
files.download(file_name)


Generating new realistic router logs...
Generating daily application, user, and event data...
Merging all datasets...

Successfully generated the corrected dataset 'new_realistic_dataset_v2.csv'!
Here's a preview including the previously missed columns:
            Timestamp Device Name  total_avg_app_traffic  \
0 2025-06-01 00:00:00    Router_A                 556.42   
1 2025-06-01 00:00:00    Router_B                 556.42   
2 2025-06-01 00:00:00    Router_C                 556.42   
3 2025-06-01 01:00:00    Router_A                 556.42   
4 2025-06-01 01:00:00    Router_B                 556.42   
5 2025-06-01 01:00:00    Router_C                 556.42   
6 2025-06-01 02:00:00    Router_A                 556.42   
7 2025-06-01 02:00:00    Router_B                 556.42   
8 2025-06-01 02:00:00    Router_C                 556.42   
9 2025-06-01 03:00:00    Router_A                 556.42   

   total_peak_user_usage  New_Flag  
0                 138.86         0  
1          

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>