# 2.1 Merging All Sub-Folders 
Now that the LogFile and UsnJrnl CSVs have been combined for each Case ID (01, 02, 03, ..., 12), we merge all twelve per-case files into a single master timeline.

In [4]:
import pandas as pd
import os
from pathlib import Path
from IPython.display import display

# --- Configuration ---
# Input: folder holding the per-case merged CSVs (01-PE-Merged.csv, 02-..., etc.).
INPUT_DIR = Path("data/processed/phase 2 - data merged")

# Output: name and folder of the consolidated master file. The folder is kept
# as a plain string and joined with os.path.join, so OUTPUT_FILEPATH is a str.
OUTPUT_FILENAME = "MASTER_TIMELINE_ALL_CASES.csv"
OUTPUT_DIR = "data/processed/phase 2.1 - data merged (all sub-folders)"
OUTPUT_FILEPATH = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

# --- Aggregation Logic ---
# Accumulators filled by the loop below: one DataFrame per case file that
# loaded successfully, plus a running total of the rows read.
all_cases_data = []
total_rows_input = 0
case_numbers = range(1, 13)  # Case IDs 01 through 12

print("--- Starting Phase 2.1: Comprehensive Aggregation ---")

for case_num in case_numbers:
    # Zero-pad to two digits so the ID matches the file naming (01, 02, ...).
    case_id = f"{case_num:02d}"
    filename = f"{case_id}-PE-Merged.csv"
    filepath = INPUT_DIR / filename

    print(f"\nProcessing Case ID {case_id}...")

    try:
        # low_memory=False: parse the file in one pass so column dtypes are
        # inferred from the whole column rather than chunk by chunk.
        df_case = pd.read_csv(filepath, low_memory=False)

        # Tag every row with the case it came from; once the cases are
        # stacked together this is the only way to tell them apart.
        df_case['Case_ID'] = case_id

        rows_loaded = df_case.shape[0]
        total_rows_input = total_rows_input + rows_loaded

        all_cases_data.append(df_case)

        print(f"  ✅ Loaded {rows_loaded:,} rows. Case_ID column added.")

    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining cases still load.
        print(f"  ❌ WARNING: File not found for Case ID {case_id} at: {filepath}")
    except Exception as exc:
        # Any other failure is reported and the loop moves on to the next case.
        print(f"  ❌ ERROR processing Case ID {case_id}: {exc}")

# --- Final Concatenation ---
# Stack every successfully loaded case into one DataFrame, verify the row
# count, move Case_ID to the front, sort, and write the master CSV.
print("\n--- Concatenating all individual cases into Master Timeline ---")

if all_cases_data:
    # Vertical merge of all per-case DataFrames; ignore_index=True builds a
    # fresh 0..n-1 index instead of repeating each file's own row numbers.
    df_master_all = pd.concat(all_cases_data, ignore_index=True)

    final_rows = len(df_master_all)

    # --- Quality Control Check (QC) ---
    print("\n--- Row Count Verification ---")
    print(f"Total rows loaded from all individual files: {total_rows_input:,}")
    print(f"Total rows in final df_master_all:          {final_rows:,}")

    if total_rows_input == final_rows:
        print("✅ QC PASS: The aggregation was successful. No rows were lost.")
    else:
        print("❌ QC FAIL: Rows were lost during concatenation. Check the input count and concatenation logic.")

    # The row-count check only covers files that were actually loaded, so a
    # case skipped by the loading loop (missing or unreadable file) would
    # otherwise go unnoticed behind a "QC PASS".
    cases_expected = len(case_numbers)
    cases_loaded = len(all_cases_data)
    if cases_loaded != cases_expected:
        print(
            f"❌ WARNING: Only {cases_loaded} of {cases_expected} case files were loaded. "
            "The master timeline is incomplete."
        )

    print("\n--- Final Reordering, Sorting, and Export ---")

    # --- 1. Column Reordering: Move Case_ID to the first position ---
    cols = df_master_all.columns.tolist()
    if 'Case_ID' in cols:
        cols.remove('Case_ID')
        cols.insert(0, 'Case_ID')

    df_master_all = df_master_all[cols]
    print("✅ Columns reordered: 'Case_ID' is now the first column.")

    # --- 2. Row Sorting: Sort by Case_ID then Chronologically ---
    # Sorting by Case_ID first keeps each case's events together.
    # NOTE(review): the loader reads the CSVs without parse_dates, so
    # 'timestamp_primary' is presumably text and is ordered lexicographically
    # here - confirm the timestamps are in a sortable (e.g. ISO 8601) format.
    # Reassigning instead of inplace=True avoids mutating a frame that was
    # just produced by column selection.
    df_master_all = df_master_all.sort_values(by=['Case_ID', 'timestamp_primary'])
    print("✅ Rows sorted by 'Case_ID' then 'timestamp_primary'.")

    # --- Ensure output directory exists ---
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Export the Master Timeline ---
    # NOTE(review): date_format only applies to datetime columns; if
    # 'timestamp_primary' is stored as text it is written unchanged - confirm.
    df_master_all.to_csv(
        OUTPUT_FILEPATH,
        index=False,
        encoding='utf-8',
        date_format='%Y-%m-%d %H:%M:%S'
    )
    print(f"✅ Master timeline exported: {final_rows:,} rows written to: {OUTPUT_FILEPATH}")
else:
    # Nothing was loaded, so there is nothing to merge or export; say so
    # explicitly instead of ending silently after the banner above.
    print("❌ ERROR: No case files were loaded. Master timeline was not created.")

--- Starting Phase 2.1: Comprehensive Aggregation ---

Processing Case ID 01...
  ✅ Loaded 325,420 rows. Case_ID column added.

Processing Case ID 02...
  ✅ Loaded 251,933 rows. Case_ID column added.

Processing Case ID 03...
  ✅ Loaded 250,718 rows. Case_ID column added.

Processing Case ID 04...
  ✅ Loaded 266,559 rows. Case_ID column added.

Processing Case ID 05...
  ✅ Loaded 273,890 rows. Case_ID column added.

Processing Case ID 06...
  ✅ Loaded 267,499 rows. Case_ID column added.

Processing Case ID 07...
  ✅ Loaded 253,532 rows. Case_ID column added.

Processing Case ID 08...
  ✅ Loaded 254,103 rows. Case_ID column added.

Processing Case ID 09...
  ✅ Loaded 255,627 rows. Case_ID column added.

Processing Case ID 10...
  ✅ Loaded 255,255 rows. Case_ID column added.

Processing Case ID 11...
  ✅ Loaded 267,443 rows. Case_ID column added.

Processing Case ID 12...
  ✅ Loaded 268,746 rows. Case_ID column added.

--- Concatenating all individual cases into Master Timeline ---

--- 