# Training Anomaly Detection Model for PetroStream
This notebook downloads the 3W dataset via KaggleHub, trains an Isolation Forest model on a sample of selected sensor readings, and saves the trained model to disk.

In [None]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.ensemble import IsolationForest
import joblib
import os
import glob

In [None]:
# Sensor columns used as model inputs: a handful of key pressure (P-*)
# and temperature (T-*) readings. Kept as a list because it is used
# directly as a pandas column selector.
FEATURES = [
    "P-PDG",
    "P-TPT",
    "T-TPT",
    "P-MON-CKP",
    "T-JUS-CKP",
]

In [None]:
def load_data(sample_frac: float = 0.1, max_files: int = 5) -> pd.DataFrame:
    """
    Download the 3W dataset with kagglehub and build a sampled training frame.

    Simulated parquet files are located recursively under the download
    directory, reduced to the ``FEATURES`` columns, stripped of rows that
    contain NaN in any of those columns, concatenated, and randomly sampled.

    Args:
        sample_frac: Fraction of the combined rows to keep (passed to
            ``DataFrame.sample`` with a fixed ``random_state`` of 42).
        max_files: Maximum number of parquet files to read. Kept small by
            default so training stays fast and fits in memory easily.

    Returns:
        A DataFrame holding the sampled rows of the ``FEATURES`` columns.

    Raises:
        ValueError: If no file could be loaded at all.
    """
    print("Downloading/Locating 3W dataset using kagglehub...")
    data_dir = kagglehub.dataset_download("afrniomelo/3w-dataset")
    print(f"Loading files from {data_dir}...")

    # Find all simulated parquet files in subdirectories.
    pattern = os.path.join(data_dir, "**", "SIMULATED_*.parquet")
    # glob returns paths in arbitrary, filesystem-dependent order; sort them so
    # the "first N files" (and therefore the seeded sample below) is the same
    # on every run and every machine.
    files = sorted(glob.glob(pattern, recursive=True))

    # Only a few files are needed to prove the pipeline works while keeping
    # training extremely fast.
    selected_files = files[:max_files]

    dfs = []
    for file in selected_files:
        try:
            df = pd.read_parquet(file)
            # Only keep the features we want to train on. A file lacking any
            # of these columns raises here and is skipped like a read failure.
            df = df[FEATURES].dropna()
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort the
            # whole load; report it and carry on with the remaining files.
            print(f"Failed to load {file}: {e}")
        else:
            dfs.append(df)
            print(f"Loaded {os.path.basename(file)} with {len(df)} valid records.")

    if not dfs:
        raise ValueError("No data loaded. Check data path.")

    combined_df = pd.concat(dfs)

    # Sample down to make training instantaneous; the fixed seed keeps the
    # selection reproducible.
    sampled_df = combined_df.sample(frac=sample_frac, random_state=42)
    print(f"Total training samples: {len(sampled_df)}")

    return sampled_df

In [None]:
def train_model(df: pd.DataFrame, model_path: str) -> None:
    """
    Train the Isolation Forest anomaly detection model and save it to disk.

    Args:
        df: Training data; every column is used as a model feature.
        model_path: Destination file for the serialized model. May be a bare
            filename (saved in the current working directory) or a path whose
            parent directory is created if it does not exist yet.
    """
    print("Training Isolation Forest model...")
    # Isolation Forest is great for detecting rare events (anomalies) in sensor data
    # contamination=0.01 means we assume ~1% of the data is actually anomalous
    model = IsolationForest(
        n_estimators=100,
        max_samples="auto",
        contamination=0.01,
        random_state=42,
        n_jobs=-1,  # Use all available cores
    )

    model.fit(df)
    print("Training complete.")

    # os.path.dirname() returns "" for a bare filename such as "model.joblib",
    # and os.makedirs("") raises FileNotFoundError. Only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(model_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the model
    joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")

In [None]:
# Where the trained model is written (relative to the working directory).
MODEL_PATH = "model.joblib"

# 1. Download/Load data via kagglehub
training_data = load_data()

# 2. Train and save model
train_model(training_data, MODEL_PATH)

print("\n--- Next Steps ---")
# Single-quoted so the double quotes around the filename are literal text;
# nesting them unescaped inside a double-quoted string is a SyntaxError.
print(f'1. To deploy, the "{MODEL_PATH}" file should be uploaded to the S3 raw-data bucket.')
print("2. The inference Lambda/ECS container will download it to make predictions.")