In [1]:
# Phase 5: Feature Scaling and Isolation Forest Training
# This script applies Standard Scaling to the feature matrix and then trains the 
# Isolation Forest model to generate anomaly scores.

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Keep printed DataFrames compact: cap how many rows and columns pandas shows.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 30

# --- Configuration ---
# Expected proportion of outliers in the dataset, handed to Isolation Forest as
# its `contamination` hyperparameter. 1% (0.01) is a conservative starting
# point; this value usually needs tuning.
CONTAMINATION_RATE = 0.01

# --- Simulated Data Loading (Mimics the output of Phase 4.1) ---

def simulate_data_load_after_encoding():
    """
    Build a small stand-in for the numeric feature matrix produced by Phase 4.1.

    Returns a tuple ``(X, keys)``. ``X`` is a 7-row, all-numeric DataFrame made
    of delta features, binary imputation flags, other numeric columns and
    frequency-encoded columns. ``keys`` is the 'lsn_key' Series, kept apart
    from ``X`` so the identifier is never scaled; it is meant for joining the
    results back to their records later.
    """
    print("Simulating load of the Numeric Feature Matrix (X)...")

    # Time delta features (numeric, large range)
    time_deltas = {
        'Delta_MFTM_vs_M': [0.0, 0.0, 0.0, 0.0, 0.0, 100.0, -50.0],
        'Delta_M_vs_C': [127552468.0, 127551452.0, 32083971.0, 0.0, 0.0, -1000.0, 10.0],
        'Delta_C_vs_A': [-127552468.0, -127551452.0, -32083971.0, 0.0, 0.0, 1000.0, -10.0],
        'Delta_Event_vs_M': [-127552468.0, -127551452.0, -32083971.0, 0.0, 0.0, 50.0, 10.0],
        'Delta_Event_vs_MFTM': [-127552468.0, -127551452.0, -32083971.0, 0.0, 0.0, 50.0, 10.0],
        'Delta_Event_vs_C': [0.0, 0.0, 0.0, 0.0, 0.0, -100.0, 0.0],
    }

    # Imputation flags (binary)
    imputation_flags = {
        'eventtime_imputed_by_creationtime': [0, 0, 0, 1, 0, 0, 1],
        'eventtime_imputed_by_modifiedtime': [0, 0, 0, 0, 0, 1, 0],
        'eventtime_imputed_by_mftmodifiedtime': [0, 0, 0, 0, 0, 0, 0],
    }

    # Other numeric features (VCN, etc.)
    other_numeric = {
        'targetvcn': [100, 10, 50, 100, 100, 1, 1],
        'clusterindex': [1, 6, 2, 2, 2, 10, 10],
        'missingfullpathflaglsn': [0, 0, 1, 0, 0, 0, 0],
        'missingfullpathflagusn': [0, 0, 0, 0, 0, 0, 1],
    }

    # Frequency-encoded features (float, 0.0 to 1.0)
    frequency_encoded = {
        'eventdetail_freq_encoded': [0.4, 0.4, 0.4, 0.1, 0.1, 0.0, 0.0],
        'source_freq_encoded': [0.6, 0.6, 0.6, 0.4, 0.4, 0.4, 0.4],
        'fileattribute_freq_encoded': [0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1],
    }

    # Merging in this order keeps the column order of the feature matrix stable.
    X = pd.DataFrame({
        **time_deltas,
        **imputation_flags,
        **other_numeric,
        **frequency_encoded,
    })

    # NOTE: The key is only used to join results back to records later and is
    # never scaled. In a real scenario it would be saved separately before
    # scaling.
    keys = pd.Series([1, 2, 3, 4, 5, 6, 7], name='lsn_key')

    print(f"Feature matrix shape loaded: {X.shape}")
    return X, keys

# --- Main Execution ---

def run_model_training():
    """
    Scale the feature matrix, fit an Isolation Forest and score every record.

    The features and their key column come from
    ``simulate_data_load_after_encoding``. The features are standardized with
    ``StandardScaler``, an ``IsolationForest`` configured with
    ``CONTAMINATION_RATE`` is fitted on the scaled matrix, and each record is
    given an anomaly score and label. A summary and the five lowest-scoring
    records are printed.

    Returns:
        A new DataFrame holding the original (unscaled) feature columns plus
        'anomaly_score' (lower means more anomalous), 'anomaly_label'
        (-1 for anomaly, 1 for inlier) and 'lsn_key'.
    """
    print("\n--- Phase 5.1: Feature Scaling and Isolation Forest Training ---")

    # 1. Load the features and key column
    X, keys = simulate_data_load_after_encoding()

    # 2. Feature Scaling
    print("\n2. Applying Standard Scaling...")
    scaler = StandardScaler()

    # Fit the scaler to the data and transform it in one step
    X_scaled = scaler.fit_transform(X)

    # DataFrame copy used only for inspection; keep X's index so the displayed
    # rows stay identifiable even if the loader returns a non-default index.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    print("  Scaling complete. Sample of scaled data (mean should be close to 0):")
    print(X_scaled_df.head(2))

    # 3. Isolation Forest Training
    print(f"\n3. Training Isolation Forest with contamination rate: {CONTAMINATION_RATE}...")

    # random_state ensures reproducibility
    model = IsolationForest(
        n_estimators=100,
        contamination=CONTAMINATION_RATE,
        random_state=42,
        max_samples='auto',
        n_jobs=-1,  # Use all available cores for faster training
    )

    # Fit the model to the scaled data
    model.fit(X_scaled)
    print("  Isolation Forest training complete.")

    # 4. Generate Anomaly Scores and Predictions
    # Score the data once. decision_function already has the contamination
    # based offset applied, so negative scores are outliers; deriving the
    # labels from it gives the same -1/1 result as model.predict() without
    # running every sample through the forest a second time.
    anomaly_scores = model.decision_function(X_scaled)
    anomaly_labels = np.where(anomaly_scores < 0, -1, 1)

    # Build the results on a new frame so the feature matrix X itself is not
    # polluted with output columns. The key is merged back for identification
    # (aligned on the index, as a column assignment would be).
    results = X.assign(
        anomaly_score=anomaly_scores,
        anomaly_label=anomaly_labels,
        lsn_key=keys,
    )

    # 5. Display Results (the printed headers number the steps from "2.",
    # so this section is labelled "4." in the output)
    print("\n4. Results Summary:")
    anomalies_detected = (results['anomaly_label'] == -1).sum()
    print(f"  Total records analyzed: {len(results):,}")
    print(f"  Records classified as anomalies (-1): {anomalies_detected:,}")
    print(f"  Anomaly detection percentage: {anomalies_detected / len(results) * 100:.2f}%")

    print("\nTop Anomalous Records (Lowest Anomaly Score):")
    # Sort by score (ascending) to see the most anomalous records first
    top_anomalies = results.sort_values(by='anomaly_score', ascending=True).head(5)
    print(top_anomalies[['lsn_key', 'anomaly_score', 'anomaly_label', 'Delta_M_vs_C', 'eventdetail_freq_encoded']])

    return results

# Script entry point: run the whole scaling + training pipeline and keep the
# scored DataFrame in a module-level name for further inspection.
if __name__ == "__main__":
    final_results_df = run_model_training()




--- Phase 5.1: Feature Scaling and Isolation Forest Training ---
Simulating load of the Numeric Feature Matrix (X)...
Feature matrix shape loaded: (7, 16)

2. Applying Standard Scaling...
  Scaling complete. Sample of scaled data (mean should be close to 0):
   Delta_MFTM_vs_M  Delta_M_vs_C  Delta_C_vs_A  Delta_Event_vs_M  \
0        -0.171499      1.550976     -1.550976         -1.550977   
1        -0.171499      1.550958     -1.550958         -1.550958   

   Delta_Event_vs_MFTM  Delta_Event_vs_C  eventtime_imputed_by_creationtime  \
0            -1.550977          0.408248                          -0.632456   
1            -1.550958          0.408248                          -0.632456   

   eventtime_imputed_by_modifiedtime  eventtime_imputed_by_mftmodifiedtime  \
0                          -0.408248                                   0.0   
1                          -0.408248                                   0.0   

   targetvcn  clusterindex  missingfullpathflaglsn  missingful