In [1]:
import pandas as pd
import numpy as np
import joblib
import logging
from collections import deque

# We need the feature extractor function from the project-local module.
try:
    from window_feature import extract_window_features
except ImportError as exc:
    # Abort via SystemExit instead of the bare `exit()` helper: `exit` is
    # injected by the `site` module (absent under `python -S` or in frozen
    # builds) and called without arguments it reports success (status 0).
    # Passing a message prints it to stderr and yields exit status 1.
    # The underlying error is included because ImportError is also raised
    # when window_feature.py exists but one of its own imports fails.
    raise SystemExit(
        "FATAL: 'window_feature.py' not found. Please make sure it is in the same folder."
        f" (import error: {exc})"
    ) from exc

# Logging: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# Input CSV: the relabeled master dataset from the previous relabeling step.
CLEAN_DATA_CSV = "my_master_dataset_RELABELED.csv"
# Output CSV: the windowed feature table written by this script.
OUTPUT_WINDOWED_CSV = "windowed_features_relabeled.csv"

# These must match your 'train_final_window_model.py' script.
# WINDOW_SIZE is the number of rows per window; WINDOW_STRIDE is the step
# between consecutive window starts (a stride > 1 keeps the output file
# smaller and faster to build).
WINDOW_SIZE = 50
WINDOW_STRIDE = 5
FEATURE_COLS = [
    'att_roll',
    'att_pitch',
    'att_yaw',
    'pos_lat',
    'pos_lon',
    'pos_alt_rel',
    'pos_vx',
    'pos_vy',
    'pos_vz',
    'nav_roll',
    'nav_pitch',
    'nav_alt_error',
    'sys_voltage_battery',
    'sys_load',
    'vib_x',
    'vib_y',
    'vib_z',
]
# --- End Configuration ---

def create_windows(df):
    """Slice every flight into fixed-size windows and extract features per window.

    Rows are grouped by ``flight_id``. Inside each group, windows of
    ``WINDOW_SIZE`` consecutive rows are taken every ``WINDOW_STRIDE`` rows,
    in the row order of ``df``; a trailing remainder shorter than a full
    window is not emitted. Flights with fewer than ``WINDOW_SIZE`` rows
    produce no windows and are reported in a single warning.

    Args:
        df: DataFrame with a ``flight_id`` column, a ``label`` column and
            every column listed in ``FEATURE_COLS``.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is
        ``pd.DataFrame(results)`` where ``results`` holds one
        ``extract_window_features`` result per window; ``labels`` is an
        integer array of the same length, 1 when any row of the window has
        ``label == 1`` and 0 otherwise.
    """
    windows = []
    labels = []
    skipped_flights = []

    logger.info("Starting window creation...")
    for fid, g in df.groupby("flight_id"):
        n = len(g)
        if n < WINDOW_SIZE:
            skipped_flights.append(fid)
            continue

        # Replace +/-inf with NaN once per flight rather than once per
        # window: the replacement is element-wise, so doing it up front
        # yields the same window contents without the repeated work.
        arr = g[FEATURE_COLS].replace([np.inf, -np.inf], np.nan).values
        is_attack = g["label"].values == 1

        for start in range(0, n - WINDOW_SIZE + 1, WINDOW_STRIDE):
            stop = start + WINDOW_SIZE
            # copy=True hands the extractor its own buffer, so an in-place
            # modification there cannot leak into overlapping windows.
            wdf = pd.DataFrame(arr[start:stop], columns=FEATURE_COLS, copy=True)

            # Use the robust, NaN-safe feature extractor
            windows.append(extract_window_features(wdf))

            # Window label: if ANY attack in window, label=1
            labels.append(int(is_attack[start:stop].any()))

    if skipped_flights:
        logger.warning(
            "Skipped %d flight(s) with fewer than %d rows: %s",
            len(skipped_flights), WINDOW_SIZE, skipped_flights,
        )

    logger.info("Window creation finished.")
    # dtype=int keeps the label array integer-typed even when it is empty.
    return pd.DataFrame(windows), np.array(labels, dtype=int)

def main():
    """Build the windowed feature dataset and save it to ``OUTPUT_WINDOWED_CSV``.

    Loads ``CLEAN_DATA_CSV``, turns it into per-window features and labels
    with ``create_windows``, appends the labels as a ``label`` column and
    writes the result as CSV without the index. Returns ``None``.

    Logs an error and returns early, without writing anything, when the
    input file is missing or when no windows could be created.
    """
    logger.info(f"Loading CLEAN, relabeled dataset from: {CLEAN_DATA_CSV}")
    try:
        df = pd.read_csv(CLEAN_DATA_CSV)
    except FileNotFoundError:
        logger.error(f"FATAL: File not found: {CLEAN_DATA_CSV}")
        logger.error("Please run 'relabel_data.py' first.")
        return

    # Create the windowed features and corresponding labels
    X_windowed, y_windowed = create_windows(df)

    logger.info(f"Created {len(X_windowed)} total windows.")

    # Without this guard an empty result would be saved as a header-only CSV
    # and reported as a success, pushing the failure into the next step.
    if len(y_windowed) == 0:
        logger.error(f"FATAL: No windows were created; '{OUTPUT_WINDOWED_CSV}' was not written.")
        logger.error(f"Each flight needs at least {WINDOW_SIZE} rows to produce a window.")
        return

    # Combine features and labels into one DataFrame
    X_windowed['label'] = y_windowed

    # Save to our new master file
    X_windowed.to_csv(OUTPUT_WINDOWED_CSV, index=False)

    logger.info(f"✅ Success! Saved windowed features and labels to '{OUTPUT_WINDOWED_CSV}'")

# Run the windowing pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-11-16 21:33:44,722 - INFO - Loading CLEAN, relabeled dataset from: my_master_dataset_RELABELED.csv
2025-11-16 21:33:44,837 - INFO - Starting window creation...
2025-11-16 21:34:12,694 - INFO - Window creation finished.
2025-11-16 21:34:12,955 - INFO - Created 7282 total windows.
2025-11-16 21:34:13,957 - INFO - ✅ Success! Saved windowed features and labels to 'windowed_features_relabeled.csv'


In [None]:
import pandas as pd
import logging
import joblib
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# Logging: timestamped, INFO-level messages.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# Input: the windowed feature file you just created in Step 1.
WINDOWED_DATA_CSV = "windowed_features_relabeled.csv"
# Output: where the generated attack rows are saved.
SYNTHETIC_DATA_OUTPUT_CSV = "synthetic_attack_data.csv"
# Number of new attack rows to create.
NUM_SYNTHETIC_ROWS = 10_000
# --- End Configuration ---

def generate_data():
    """Train a CTGAN on the real attack windows and save synthetic attack rows.

    Reads ``WINDOWED_DATA_CSV``, keeps the rows whose ``label`` equals 1,
    fits a ``CTGANSynthesizer`` on those rows with the ``label`` column
    removed, samples ``NUM_SYNTHETIC_ROWS`` rows, sets their ``label`` to 1
    and writes them to ``SYNTHETIC_DATA_OUTPUT_CSV``. Returns ``None``.

    Logs an error and returns early when the input file does not exist or
    contains no rows with ``label == 1``.
    """
    logger.info(f"Loading WINDOWED, relabeled dataset from: {WINDOWED_DATA_CSV}")
    try:
        source_df = pd.read_csv(WINDOWED_DATA_CSV)
    except FileNotFoundError:
        logger.error(f"FATAL: File not found: {WINDOWED_DATA_CSV}")
        logger.error("Please run 'create_windowed_dataset.py' first.")
        return

    # Step 1: keep only the *real* attack windows. The label is constant for
    # these rows, so it is removed before the GAN sees the data.
    attack_mask = source_df['label'] == 1
    attack_features = source_df.loc[attack_mask].drop(columns=['label'])

    attack_count = len(attack_features)
    logger.info(f"Isolated {attack_count} real attack windows for training the GAN.")
    if attack_count == 0:
        logger.error("FATAL: No attack data (label=1) found in the windowed file.")
        logger.error("Please check your 'relabel_data.py' script and run it again.")
        return

    # Step 2: describe the table to SDV, then fit the synthesizer on it.
    logger.info("Detecting metadata for the GAN...")
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(data=attack_features)
    gan = CTGANSynthesizer(table_metadata, verbose=True, epochs=300)

    logger.info("--- Starting GAN Training (This may take several minutes) ---")
    gan.fit(attack_features)
    logger.info("--- GAN Training Complete ---")

    # Step 3: draw the requested number of synthetic rows.
    logger.info(f"Generating {NUM_SYNTHETIC_ROWS} new synthetic attack rows...")
    generated = gan.sample(num_rows=NUM_SYNTHETIC_ROWS)

    # Step 4: every generated row is an attack, so restore the label column
    # as a constant 1 and write the result to disk.
    generated = generated.assign(label=1)
    generated.to_csv(SYNTHETIC_DATA_OUTPUT_CSV, index=False)
    logger.info(f"✅ Success! Saved {NUM_SYNTHETIC_ROWS} synthetic attacks to '{SYNTHETIC_DATA_OUTPUT_CSV}'")

    logger.info("\n--- Next Step ---")
    logger.info("Run 'train_with_gan_data.py' to train your final, powerful model!")


# Entry point: start the generation only if the 'sdv' package can be imported;
# otherwise log an installation hint and do nothing.
if __name__ == "__main__":
    # This check is crucial for a real project
    # NOTE(review): the unguarded `from sdv.single_table import ...` and
    # `from sdv.metadata import ...` statements earlier in this cell would
    # raise ImportError before execution reaches this point when sdv is
    # missing, so the except branch below looks unreachable in that case.
    # TODO confirm and guard the top-level imports instead.
    try:
        import sdv
    except ImportError:
        logger.error("FATAL: 'sdv' library not found. Please run 'pip install sdv'")
    else:
        generate_data()