In [3]:
# Phase 4 & 5: Final Feature Preparation, Scaling, and Isolation Forest Training
# FIX: Includes robust conversion of hexadecimal strings (e.g., '0x393') to decimal 
# integers for columns like 'targetvcn' to prevent the ValueError during scaling.

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Set display options for better data visibility
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 30)

# --- Configuration ---
INPUT_DIR = 'data/processed/phase 3 - feature engineered'
INPUT_FILENAME = 'MASTER_TIMELINE_FEATURES.csv'
INPUT_FILEPATH = Path(INPUT_DIR) / INPUT_FILENAME

# Model Configuration
CONTAMINATION_RATE = 0.01 

# Columns that are NOT features and should be dropped (IDs, raw timestamps, unused text)
COLUMNS_TO_DROP = [
    'Case_ID', 
    'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime', 'timestamp_primary',
    'fullpath', 'filedirectoryname', 'eventinfo', 'redo' 
]

# Categorical columns that need Frequency Encoding
CATEGORICAL_FEATURES = [
    'event', 
    'source'
]

# Columns that are expected to be numeric but often contain hex strings
HEX_TO_NUMERIC_COLS = [
    'targetvcn', 
    'filereferencenumber', 
    'parentfilereferencenumber',
    'lsn',
    'usn'
]

# --- Helper Functions ---

def load_data(filepath):
    """Loads the CSV and converts necessary columns to datetime objects."""
    print(f"Loading data from: {filepath}")
    df = pd.read_csv(filepath, dtype={'Case_ID': str}, low_memory=False)
    time_cols = ['timestamp_primary', 'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime']
    for col in time_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    print(f"Data loaded successfully with {len(df):,} rows.")
    return df

def convert_hex_to_decimal(df, cols):
    """Converts columns containing hex strings (like '0x393') to decimal integers."""
    print(f"Converting potential hex strings to decimal in columns: {cols}")
    df_temp = df.copy()
    
    for col in cols:
        if col in df_temp.columns:
            # First, fill NaNs with 0 to make conversion simpler
            df_temp[col] = df_temp[col].fillna(0)
            
            # Apply conversion logic
            def hex_to_dec(val):
                if isinstance(val, (int, float)):
                    # Already numeric
                    return val
                try:
                    s = str(val).strip().upper()
                    if s.startswith('0X'):
                        return int(s, 16)
                    return float(s) # Tries to convert non-hex string to float
                except ValueError:
                    return 0 # Default to 0 if conversion fails
                except TypeError:
                    return 0

            df_temp[col] = df_temp[col].apply(hex_to_dec).astype(float)
            
    print("  Hex conversion complete.")
    return df_temp

def preprocess_features(df):
    """Executes the final feature preparation steps: column dropping, encoding, and cleaning."""
    df_processed = df.copy()
    
    # Pre-step: Convert hex strings to decimal where necessary
    df_processed = convert_hex_to_decimal(df_processed, HEX_TO_NUMERIC_COLS)
    
    # 1. Drop Non-Feature Columns
    print("\n1. Dropping non-feature columns (IDs, raw timestamps, path text)...")
    df_processed = df_processed.drop(columns=[col for col in COLUMNS_TO_DROP if col in df_processed.columns])
    
    # 2. Frequency Encoding
    print("\n2. Applying Frequency Encoding to categorical features...")
    for col in CATEGORICAL_FEATURES:
        if col in df_processed.columns:
            freq_map = df_processed[col].value_counts(normalize=True).to_dict()
            df_processed[f'{col}_freq_encoded'] = df_processed[col].map(freq_map).fillna(0)
            df_processed = df_processed.drop(columns=[col])
            print(f"  Encoded and dropped original column: '{col}'")

    # 3. Handle remaining NaNs in numeric columns (Impute with 0)
    numeric_cols = df_processed.select_dtypes(include=np.number).columns
    df_processed[numeric_cols] = df_processed[numeric_cols].fillna(0)
    
    # Identify the final feature matrix X and the key columns
    key_cols = ['lsn', 'usn']
    X = df_processed.drop(columns=[col for col in key_cols if col in df_processed.columns], errors='ignore')
    
    print(f"\nFinal Feature Matrix (X) shape: {X.shape}")
    return X, df_processed[key_cols].fillna(np.nan).astype(str)

def train_and_score_isolation_forest(X, keys):
    """Applies scaling, trains the IF model, and generates anomaly scores."""

    # 1. Feature Scaling
    print("\n3. Applying Standard Scaling...")
    scaler = StandardScaler()
    
    # Fit the scaler to the data and transform it
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

    # 2. Isolation Forest Training
    print(f"\n4. Training Isolation Forest with contamination rate: {CONTAMINATION_RATE}...")
    model = IsolationForest(
        n_estimators=100, 
        contamination=CONTAMINATION_RATE, 
        random_state=42,
        max_samples='auto',
        n_jobs=-1
    )
    
    model.fit(X_scaled)
    print("  Isolation Forest training complete.")

    # 3. Generate Anomaly Scores and Predictions
    X_scaled_df['anomaly_score'] = model.decision_function(X_scaled)
    X_scaled_df['anomaly_label'] = model.predict(X_scaled) # -1 is anomaly, 1 is normal
    
    # Merge keys back for identification
    X_scaled_df = pd.concat([keys.reset_index(drop=True), X_scaled_df], axis=1)

    return X_scaled_df

# --- Main Execution ---
if __name__ == '__main__':
    print("\n--- Phase 4 & 5: Anomaly Detection Pipeline ---")
    
    try:
        # 1. Load Data
        df_master = load_data(INPUT_FILEPATH)

        # 2. Preprocess Features (Fixed Hex Conversion, Drop metadata and Encode)
        X_features, key_identifiers = preprocess_features(df_master)
        
        # 3. Train and Score Model
        results_df = train_and_score_isolation_forest(X_features, key_identifiers)
        
        # 4. Display Final Results
        anomalies_detected = (results_df['anomaly_label'] == -1).sum()
        print("\n--- Results Summary ---")
        print(f"Total records analyzed: {len(results_df):,}")
        print(f"Records classified as anomalies (-1) based on contamination={CONTAMINATION_RATE}: {anomalies_detected:,}")
        
        print("\nTop 5 Most Anomalous Records (Lowest Score):")
        top_anomalies = results_df.sort_values(by='anomaly_score', ascending=True).head(5)
        # Display the key identifiers, score, and the delta features that likely caused the anomaly
        print(top_anomalies[['lsn', 'usn', 'anomaly_score', 'anomaly_label', 'Delta_M_vs_C', 'Delta_MFTM_vs_M', 'event_freq_encoded']])

    except FileNotFoundError:
        print(f"\n❌ ERROR: Input file not found. Ensure '{INPUT_FILEPATH}' exists.")
    except Exception as e:
        print(f"\n❌ An unexpected error occurred during processing: {e}")



--- Phase 4 & 5: Anomaly Detection Pipeline ---
Loading data from: data/processed/phase 3 - feature engineered/MASTER_TIMELINE_FEATURES.csv
Data loaded successfully with 2,239,418 rows.
Converting potential hex strings to decimal in columns: ['targetvcn', 'filereferencenumber', 'parentfilereferencenumber', 'lsn', 'usn']
  Hex conversion complete.

1. Dropping non-feature columns (IDs, raw timestamps, path text)...

2. Applying Frequency Encoding to categorical features...
  Encoded and dropped original column: 'event'
  Encoded and dropped original column: 'source'

Final Feature Matrix (X) shape: (2239418, 15)

3. Applying Standard Scaling...

❌ An unexpected error occurred during processing: could not convert string to float: 'Archive / Repasre_Point / Sparse'
