## 1. File Paths and Setup

In [1]:
import pandas as pd
import numpy as np
import os

# --- 1. File Paths and Setup ---
LOGFILE_PATH = 'data/raw/02-PE-LogFile.csv'
USNJRNL_PATH = 'data/raw/02-PE-UsnJrnl.csv'
SUSPICIOUS_PATH = 'data/raw/suspicious/02-PE-Suspicious.csv'
OUTPUT_DIR = 'data/processed/Phase 1 - Data Labeling'

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Starting Phase 1: Data Labeling...")
print("-" * 40)



Starting Phase 1: Data Labeling...
----------------------------------------


## 2. Load Datasets 

In [2]:
# --- 2. Load Datasets and Standardize Columns ---
try:
    df_logfile = pd.read_csv(LOGFILE_PATH, low_memory=False)
    df_usnjrnl = pd.read_csv(USNJRNL_PATH, low_memory=False)
    df_suspicious = pd.read_csv(SUSPICIOUS_PATH, low_memory=False)

    # --- NEW: Standardize all column names to lowercase ---
    df_logfile.columns = df_logfile.columns.str.lower()
    df_usnjrnl.columns = df_usnjrnl.columns.str.lower()
    df_suspicious.columns = df_suspicious.columns.str.lower()
    
    # Check for the key columns now in lowercase (lsn and usn)
    if 'lsn' not in df_logfile.columns:
        print("Error: 'lsn' column not found in LogFile even after converting to lowercase. Please check the original column name.")
        # Print actual columns for debugging if the error persists
        # print("LogFile Columns:", df_logfile.columns.tolist()) 
        exit()
        
    if 'usn' not in df_usnjrnl.columns:
        print("Error: 'usn' column not found in UsnJrnl even after converting to lowercase. Please check the original column name.")
        exit()
        
    print("Successfully standardized column names to lowercase.")
    print(f"Loaded LogFile with {len(df_logfile)} records.")
    print(f"Loaded UsnJrnl with {len(df_usnjrnl)} records.")
    print(f"Loaded Suspicious with {len(df_suspicious)} records.")

except FileNotFoundError as e:
    print(f"Error: Required file not found. Check file paths: {e}")
    exit()

# Clean up column names for joining: 'lsn/usn' is now 'lsn/usn' (lowercase)
df_suspicious.rename(columns={'lsn/usn': 'id'}, inplace=True)



Successfully standardized column names to lowercase.
Loaded LogFile with 14783 records.
Loaded UsnJrnl with 247386 records.
Loaded Suspicious with 3 records.


## 3. Initialize Label Columns 

In [3]:
# --- 3. Initialize Label Columns ---

# Initialize new columns to 0 in both DataFrames
df_logfile['is_timestomped'] = 0
df_logfile['is_suspicious_execution'] = 0

df_usnjrnl['is_timestomped'] = 0
df_usnjrnl['is_suspicious_execution'] = 0

print("Initialized new label columns to 0.")



Initialized new label columns to 0.


## 4. Process and Apply Labels from Suspicious Data 

In [4]:
# --- 4. Process and Apply Labels from Suspicious Data ---

# Split suspicious findings by source (which is now lowercase)
suspicious_logfile = df_suspicious[df_suspicious['source'] == 'logfile'].copy()
suspicious_usnjrnl = df_suspicious[df_suspicious['source'] == 'usnjrnl'].copy()

# A helper function to apply the labels
def apply_labels(df, suspicious_df, id_col):
    # Standardize column for category checks
    df_temp = suspicious_df.copy()

    # --- Timestomping Label ---
    # Filter suspicious records that are categorized as Timestamp Manipulation
    ts_ids = df_temp[df_temp['category'] == 'Timestamp Manipulation']['id'].unique()
    
    # Check if any IDs were found
    if len(ts_ids) > 0:
        # Match the IDs and set the label to 1
        # This line now safely uses the lowercase id_col ('lsn' or 'usn') 
        # because the main DataFrame columns were standardized above.
        df.loc[df[id_col].isin(ts_ids), 'is_timestomped'] = 1
    
    # --- Suspicious Execution Label ---
    # Filter suspicious records that are categorized as Execution of Suspicious Programs
    exec_ids = df_temp[df_temp['category'] == 'Execution of Suspicious Programs']['id'].unique()
    
    if len(exec_ids) > 0:
        # Match the IDs and set the label to 1
        df.loc[df[id_col].isin(exec_ids), 'is_suspicious_execution'] = 1
        
    return len(ts_ids), len(exec_ids)

# --- Apply labels to LogFile ---
# The ID column in df_logfile is now guaranteed to be 'lsn' (lowercase)
ts_log_count, exec_log_count = apply_labels(df_logfile, suspicious_logfile, 'lsn')
print(f"LogFile: Labeled {ts_log_count} unique IDs as is_timestomped (lsn match).")
print(f"LogFile: Labeled {exec_log_count} unique IDs as is_suspicious_execution (lsn match).")

# --- Apply labels to UsnJrnl ---
# The ID column in df_usnjrnl is now guaranteed to be 'usn' (lowercase)
ts_usn_count, exec_usn_count = apply_labels(df_usnjrnl, suspicious_usnjrnl, 'usn')
print(f"UsnJrnl: Labeled {ts_usn_count} unique IDs as is_timestomped (usn match).")
print(f"UsnJrnl: Labeled {exec_usn_count} unique IDs as is_suspicious_execution (usn match).")

# --- Verification Check ---
total_labeled_unique_ids = (ts_log_count + exec_log_count) + (ts_usn_count + exec_usn_count)
print(f"\nTotal unique suspicious IDs found and labeled in the logs: {total_labeled_unique_ids}")
print(f"Total entries in suspicious CSV: {len(df_suspicious)}")
if total_labeled_unique_ids < len(df_suspicious):
    print(f"NOTE: Labeled unique IDs ({total_labeled_unique_ids}) is less than total entries in suspicious CSV ({len(df_suspicious)}).")
    print("This is expected if the suspicious CSV has duplicate IDs or one ID is flagged for BOTH timestomping and execution.")



LogFile: Labeled 1 unique IDs as is_timestomped (lsn match).
LogFile: Labeled 1 unique IDs as is_suspicious_execution (lsn match).
UsnJrnl: Labeled 0 unique IDs as is_timestomped (usn match).
UsnJrnl: Labeled 1 unique IDs as is_suspicious_execution (usn match).

Total unique suspicious IDs found and labeled in the logs: 3
Total entries in suspicious CSV: 3


## 5. Output Labeled Datasets

In [5]:
# --- 5. Output Labeled Datasets ---
logfile_output_path = os.path.join(OUTPUT_DIR, '02-PE-LogFile_labeled.csv')
usnjrnl_output_path = os.path.join(OUTPUT_DIR, '02-PE-UsnJrnl_labeled.csv')

df_logfile.to_csv(logfile_output_path, index=False)
df_usnjrnl.to_csv(usnjrnl_output_path, index=False)

print(f"\nLogFile saved to: {logfile_output_path}")
print(f"UsnJrnl saved to: {usnjrnl_output_path}")

print("-" * 40)
print("Phase 1: Data Labeling Complete.")


LogFile saved to: data/processed/Phase 1 - Data Labeling/02-PE-LogFile_labeled.csv
UsnJrnl saved to: data/processed/Phase 1 - Data Labeling/02-PE-UsnJrnl_labeled.csv
----------------------------------------
Phase 1: Data Labeling Complete.
