# Phase 2.1 - Adding Timestomped Detected Columns 
From the merged LogFile and UsnJrnl, we now add another column that depicts if a certain row is timestomped or not based on the extracted results from Oh et. al's tool: NTFS Log Tracker.

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import os # NEW: Import os for directory creation

# --- Configuration ---
# Input 1: The intermediate merged timeline file for a single case.
MERGED_INPUT_FILEPATH = Path('data/processed/phase 2 - data merged/01-PE-Merged.csv')

# Input 2: The case-specific ground truth file.
GROUND_TRUTH_FILEPATH = Path('data/raw/suspicious/01-PE-Suspicious.csv')

# Output Directory: New directory as requested by the user
NEW_OUTPUT_DIR = Path('data/processed/phase 2.1 - data labeled')

# Output: The labeled file, ready for final master timeline aggregation.
LABELED_OUTPUT_FILEPATH = NEW_OUTPUT_DIR / MERGED_INPUT_FILEPATH.name.replace('-Merged', '-Labeled')

# --- Helper Function for ID Standardization (v12) ---

def to_int_str(s):
    """
    Converts a string (or float) ID into its clean decimal integer string representation. 
    Handles hexadecimal ('0x') prefixes, cleans up floating point notation,
    and aggressively removes non-standard characters. Returns '' if invalid or NaN.
    """
    if pd.isna(s) or (isinstance(s, str) and str(s).strip() == ''):
        return ''
    
    # Handle direct numeric types (like floats/ints)
    if isinstance(s, (int, float)):
        return str(int(s))

    s = str(s).strip()
    
    # Aggressively remove control characters or hidden punctuation.
    s_clean = re.sub(r'[^\w]', '', s).upper()
        
    if not s_clean:
        return ''
        
    s = s_clean

    try:
        # Check for hex prefix and convert accordingly
        if s.startswith('0X'):
            return str(int(s, 16))
        # Otherwise, treat as decimal
        return str(int(s))
    except ValueError:
        return '' # Return empty string for unparseable values

# --- Main Execution ---

def run_label_subfolder():
    """Main function to execute the case-specific ground truth integration."""
    print(f"\n--- Phase 3.2.5: Labeling Subfolder Timeline ({MERGED_INPUT_FILEPATH.name}) ---")
    
    try:
        # 1. Load DataFrames and Determine Case ID
        print("1. Loading Timeline and Ground Truth...")
        
        # Determine Case_ID from the filename (e.g., '01-PE-Merged.csv' -> '1')
        file_prefix = MERGED_INPUT_FILEPATH.name.split('-')[0]
        # Standardize Case ID format (e.g., '01' -> '1')
        CASE_ID = str(int(file_prefix)) 
        print(f"Detected Case ID: {CASE_ID}")
        
        # Load the Merged Timeline
        timeline_df = pd.read_csv(
            MERGED_INPUT_FILEPATH, 
            dtype={'lsn': str, 'usn': str}, # LSN/USN must be strings
            low_memory=False
        )
        print(f"Timeline loaded with {len(timeline_df):,} records.")
        
        # Add the necessary Case_ID column if it's missing (often happens in phase 2)
        timeline_df['Case_ID'] = CASE_ID
        
        # Load the Ground Truth (specific to this case)
        gt_df = pd.read_csv(GROUND_TRUTH_FILEPATH, low_memory=False)
        
        # 2. Prepare Ground Truth for Merging
        # Robust Column Name Standardization for Ground Truth
        gt_df.columns = gt_df.columns.str.strip().str.lower()
        gt_df.rename(columns={'lsn/usn': 'id', 'source': 'source'}, inplace=True, errors='ignore')
        
        # Impute the missing Case_ID, as this GT file is case-specific
        gt_df['Case_ID'] = CASE_ID
        gt_df['Case_ID'] = gt_df['Case_ID'].astype(str).str.strip().apply(lambda x: str(int(x)) if str(x).isdigit() else x)
        gt_df['id'] = gt_df['id'].astype(str).str.strip()
        
        # Filter for confirmed 'Timestamp Manipulation'
        gt_df_filtered = gt_df[
            gt_df['category'].str.contains('Timestamp Manipulation', case=False, na=False)
        ].copy()
        
        gt_df_filtered['Is_Timestomped'] = 1

        # Drop rows where the ID is missing
        gt_keys = gt_df_filtered[['Case_ID', 'id']].dropna().drop_duplicates()
        
        print(f"Ground Truth loaded with {len(gt_keys):,} confirmed 'Timestamp Manipulation' records.")

        # CRITICAL CLEANING: Standardize LSN/USN in both DataFrames
        timeline_df['lsn'] = timeline_df['lsn'].apply(to_int_str)
        timeline_df['usn'] = timeline_df['usn'].apply(to_int_str)
        gt_keys['id'] = gt_keys['id'].apply(to_int_str)
        
        # 3. Create Merge Keys for Brute-Force Matching
        
        # Master Timeline Keys
        timeline_df['Merge_Key_L'] = timeline_df['Case_ID'] + '_' + timeline_df['lsn'].fillna('')
        timeline_df['Merge_Key_U'] = timeline_df['Case_ID'] + '_' + timeline_df['usn'].fillna('')
        
        # Ground Truth Keys (Brute-Force)
        # Create one comprehensive set of keys from the GT: Case_ID + LSN/USN ID
        gt_keys['Merge_Key_GT'] = gt_keys['Case_ID'] + '_' + gt_keys['id']
        
        # Final set of actionable keys (excluding keys with missing ID)
        gt_match_set = {key for key in gt_keys['Merge_Key_GT'].unique() if not key.endswith('_')}

        final_actionable_keys = len(gt_match_set)
        print(f"Filtered Ground Truth Keys (Actionable Set Size): {final_actionable_keys:,}")

        # 4. Optimized Labeling
        timeline_df['Is_Timestomped'] = 0

        # Match 1: Master LSN key is in the comprehensive GT match set
        is_lsn_match = timeline_df['Merge_Key_L'].isin(gt_match_set)
        # Match 2: Master USN key is in the comprehensive GT match set
        is_usn_match = timeline_df['Merge_Key_U'].isin(gt_match_set)

        # Flag the records: if EITHER the LSN key OR the USN key matches the GT key set, set Is_Timestomped to 1
        timeline_df.loc[is_lsn_match | is_usn_match, 'Is_Timestomped'] = 1

        # Drop temporary merge columns and the temporary Case_ID column before saving
        timeline_df.drop(columns=['Merge_Key_L', 'Merge_Key_U'], inplace=True)
        
        # 5. Final Count and Save
        total_anomalies = timeline_df['Is_Timestomped'].sum()
        print(f"\nSuccessfully integrated labels. Total confirmed anomalies found: {total_anomalies:,}")
        
        # CRITICAL ADDITION: Ensure the new output directory exists
        os.makedirs(LABELED_OUTPUT_FILEPATH.parent, exist_ok=True)
        
        # Save the final labeled dataset
        timeline_df.to_csv(LABELED_OUTPUT_FILEPATH, index=False)
        print(f"✅ Final labeled dataset saved to: {LABELED_OUTPUT_FILEPATH}")

    except FileNotFoundError:
        print(f"\n❌ ERROR: One or more input files not found.")
        print(f"Ensure the timeline exists at: {MERGED_INPUT_FILEPATH}")
        print(f"Ensure the ground truth exists at: {GROUND_TRUTH_FILEPATH}")
    except ValueError as e:
        print(f"\n❌ A data format error occurred during labeling: {e}")
    except Exception as e:
        print(f"\n❌ An unexpected error occurred during labeling: {e}")


if __name__ == '__main__':
    run_label_subfolder()



--- Phase 3.2.5: Labeling Subfolder Timeline (01-PE-Merged.csv) ---
1. Loading Timeline and Ground Truth...
Detected Case ID: 1
Timeline loaded with 325,420 records.
Ground Truth loaded with 2 confirmed 'Timestamp Manipulation' records.
Filtered Ground Truth Keys (Actionable Set Size): 2

Successfully integrated labels. Total confirmed anomalies found: 1
✅ Final labeled dataset saved to: data/processed/phase 2.1 - data labeled/01-PE-Labeled.csv
