# 1: Data Import and Initial Exploration (Raw Data) 
This first notebook is for getting the data into memory and a quick first look.

## Data Import & Initial Exploration (Raw Data)

### 1. Code Setup

In [1]:
# Import core libraries
import pandas as pd
import numpy as np
import os

# Set display options for better data visibility
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x) # Keeps floats clean

# Define file paths (adjust these to your actual file names and locations)
# Assuming you've already parsed LogFile and UsnJrnl into a clean CSV format
# This is a crucial step *before* this notebook for getting structured data.
log_file_path = 'data/raw/09-PE-LogFile.csv'
usnjrnl_path = 'data/raw/09-PE-UsnJrnl.csv'


### 2. Import the CSV Data

In [2]:
# Load the datasets
try:
    df_log = pd.read_csv(log_file_path)
    print(f"LogFile data loaded successfully. Shape: {df_log.shape}")
except FileNotFoundError:
    print(f"Error: LogFile CSV not found at {log_file_path}")

try:
    df_usn = pd.read_csv(usnjrnl_path)
    print(f"UsnJrnl data loaded successfully. Shape: {df_usn.shape}")
except FileNotFoundError:
    print(f"Error: UsnJrnl CSV not found at {usnjrnl_path}")

LogFile data loaded successfully. Shape: (25688, 13)
UsnJrnl data loaded successfully. Shape: (249559, 10)


### 3. Initial Data Inspection

In [3]:
# Check LogFile structure
print("\n--- LogFile Initial Inspection ---")
display(df_log.head())
df_log.info(verbose=False, memory_usage='deep')

# Check UsnJrnl structure
print("\n--- UsnJrnl Initial Inspection ---")
display(df_usn.head())
df_usn.info(verbose=False, memory_usage='deep')


--- LogFile Initial Inspection ---


Unnamed: 0,LSN,EventTime(UTC+8),Event,Detail,File/Directory Name,Full Path,CreationTime,ModifiedTime,MFTModifiedTime,AccessedTime,Redo,Target VCN,Cluster Index
0,10006785113,,File Deletion,,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:43,12/25/23 22:19:48,12/25/23 22:19:48,12/25/23 22:19:48,Deallocate File Record Segment,0x2FB7,6
1,10006785298,12/25/23 22:19:50,File Creation,,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,Initialize File Record Segment,0x2FB7,6
2,10006785525,,Writing Content of Resident File,Writing Size : 512,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,,,,,Update Resident Value,0x2FB7,6
3,10006786167,,Updating Modified Time,ModifiedTime : 2023-12-25 22:19:20 -> 2023-12-...,UDB-User23847576+LocalStorage.sql <Guessed>,\Users\blueangel\AppData\Roaming\Evernote\cond...,,,,,Update Resident Value,0x6E74,6
4,10006786545,,File Deletion,,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,Deallocate File Record Segment,0x2FB7,6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25688 entries, 0 to 25687
Columns: 13 entries, LSN to Cluster Index
dtypes: int64(2), object(11)
memory usage: 17.7 MB

--- UsnJrnl Initial Inspection ---


Unnamed: 0,TimeStamp(UTC+8),USN,File/Directory Name,FullPath,EventInfo,SourceInfo,FileAttribute,Carving Flag,FileReferenceNumber,ParentFileReferenceNumber
0,12/25/23 16:47:05,1677721600,amd64_windows-shield-provider.resources_31bf38...,,File_Closed / File_Deleted,Normal,Directory,,0x000100000007BDEC,0x000300000002CC20
1,12/25/23 16:47:05,1677721848,amd64_windows-shield-provider.resources_31bf38...,,File_Closed / File_Deleted,Normal,Normal,,0x0001000000073F99,0x000300000002CC20
2,12/25/23 16:47:05,1677722120,securityhealthagent.dll.mui,,File_Closed / File_Deleted,Normal,Normal,,0x000100000007BE07,0x000100000007BE06
3,12/25/23 16:47:05,1677722240,windowsdefendersecuritycenter.adml,,File_Closed / File_Deleted,Normal,Normal,,0x000100000007D7AA,0x000100000007BE06
4,12/25/23 16:47:05,1677722368,f,,File_Closed / File_Deleted,Normal,Directory,,0x000100000007BE06,0x000100000007BE05


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249559 entries, 0 to 249558
Columns: 10 entries, TimeStamp(UTC+8) to ParentFileReferenceNumber
dtypes: float64(1), int64(1), object(8)
memory usage: 143.0 MB


## LogFile Data Cleaning

### 1. Standardize Column Names
First, we must standardize the column names for uniformity.

In [4]:
# 1. Standardize column names: lowercase, remove special characters/parentheses, replace spaces with underscores.
df_log.columns = (
    df_log.columns
    .str.lower()
    .str.replace(r'\(.*?\)', '', regex=True) # Remove anything in parentheses (like '(UTC+8)')
    .str.replace('[^a-z0-9_]', '', regex=True) # Remove other non-alphanumeric chars (like slashes)
    .str.replace(' ', '_', regex=False)
    .str.replace('__', '_', regex=False) # Handle double underscores if they result
    .str.strip('_') # Remove leading/trailing underscores
)

# Display the new, clean column names
print("New LogFile Columns:", df_log.columns.tolist())

# Rename specific columns for clarity if needed (e.g., 'eventdetail' is a good name already)
# If your column is now just 'eventdetail', you are good to go.


New LogFile Columns: ['lsn', 'eventtime', 'event', 'detail', 'filedirectoryname', 'fullpath', 'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime', 'redo', 'targetvcn', 'clusterindex']


### 2. Addressing Missing Values

#### 2.1 Check for Missing Values for all columns
Now we can accurately check for missing values in all columns. 

In [5]:
# Check the total number of entries
total_rows = len(df_log)

# Print total rows for context
print(f"--- Missing Value Report for LogFile (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_log.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for LogFile (Total Rows: 25688) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
detail,18521,72.1
eventtime,15272,59.45
event,12274,47.78
creationtime,7346,28.6
modifiedtime,7346,28.6
mftmodifiedtime,7346,28.6
accessedtime,7346,28.6
fullpath,2004,7.8
filedirectoryname,1008,3.92


--------------------------------------------------


##### 2.1.1 Removing empty values on event
We must remove rows with empty rows in the **event** column because this information provides the causation (the file action) that links timestamps together. For a thesis on timestomping detection, removing eventless records ensures the ML model only trains on high-integrity data, preventing unreliable feature creation and minimizing the risk of low-confidence or false-positive reports in your prototype's output.

In [6]:
# 1. Identify the critical column
event_col = 'event'

# 2. Check for missing values in the 'event' column
total_rows = len(df_log)
missing_event_count = df_log[event_col].isna().sum()

print(f"--- LogFile Event Detail Cleaning ---")
print(f"Total rows before cleaning: {total_rows}")
print(f"Rows with missing '{event_col}': {missing_event_count}")
print(f"Percentage missing: {missing_event_count / total_rows * 100:.2f}%")
print("-" * 50)

# 3. Drop rows where the 'event' column is NaN
# We use inplace=True to modify the DataFrame directly
if missing_event_count > 0:
    df_log.dropna(subset=[event_col], inplace=True)
    
    # 4. Verification
    rows_dropped = missing_event_count
    rows_remaining = len(df_log)
    
    print(f"✅ Successfully dropped {rows_dropped} rows.")
    print(f"Rows remaining in LogFile: {rows_remaining}")
else:
    print("Column 'event' is fully populated. No rows were dropped.")

--- LogFile Event Detail Cleaning ---
Total rows before cleaning: 25688
Rows with missing 'event': 12274
Percentage missing: 47.78%
--------------------------------------------------
✅ Successfully dropped 12274 rows.
Rows remaining in LogFile: 13414


##### 2.1.2 Dropping detail column 
Retaining it would require complex and inefficient Natural Language Processing (NLP) techniques, whereas all necessary context is already captured by the event and timestamp columns, justifying its removal for efficiency.

In [7]:
# Define the column to be dropped
detail_col = 'detail'

if detail_col in df_log.columns:
    # Drop the 'detail' column permanently
    df_log.drop(columns=[detail_col], inplace=True)
    
    print(f"✅ Successfully dropped the '{detail_col}' column.")
    print(f"Columns remaining: {list(df_log.columns)}")
else:
    print(f"Column '{detail_col}' was already dropped or not found.")

✅ Successfully dropped the 'detail' column.
Columns remaining: ['lsn', 'eventtime', 'event', 'filedirectoryname', 'fullpath', 'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime', 'redo', 'targetvcn', 'clusterindex']


#### 2.2 Check for Missing Values for all columns
Checking if the dropped values of **event** is effective, and address more columns to clear.

In [8]:
# Check the total number of entries
total_rows = len(df_log)

# Print total rows for context
print(f"--- Missing Value Report for LogFile (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_log.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for LogFile (Total Rows: 13414) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
creationtime,7346,54.76
modifiedtime,7346,54.76
mftmodifiedtime,7346,54.76
accessedtime,7346,54.76
eventtime,6485,48.35
fullpath,1823,13.59
filedirectoryname,1008,7.51


--------------------------------------------------


##### 2.2.1 eventtime Imputation starting with creationtime 
We are imputing the missing **eventtime** to ensure every record has a chronological anchor, which is essential for sequencing events and calculating time deltas for the ML model. We use the reliable timestamps (**creationtime**, **modifiedtime**, and **mftmodifiedtime**) for this, but intentionally exclude **accessedtime** from imputation because its high unreliability risks corrupting the logical chronological order. We keep the **accessedtime** column as a separate feature because its inconsistency or presence might still serve as a detectable signal that the ML model can learn to associate with anomalous (timestomped) file activity. 

In the first layer of imputation, we start with creationtime.

In [9]:
# 1. Standardize column names (if not done in the previous cell to ensure reliability)
df_log.columns = (
    df_log.columns
    .str.lower()
    .str.replace(r'\(.*?\)', '', regex=True) # Remove anything in parentheses
    .str.replace('[^a-z0-9_]', '', regex=True)
    .str.replace(' ', '_', regex=False)
    .str.strip('_')
)

# Define the columns (using the exact names found: 'eventtime' and 'creationtime')
event_time_col = 'eventtime'
creation_time_col = 'creationtime'

# 2. Check current state of eventtime
initial_missing_count = df_log[event_time_col].isna().sum()
print(f"Rows with missing '{event_time_col}' before imputation: {initial_missing_count}")

if initial_missing_count > 0:
    # 3. Impute missing 'eventtime' using 'creationtime'
    # We use .fillna() and provide the entire 'creationtime' Series as the filling value.
    df_log[event_time_col] = df_log[event_time_col].fillna(df_log[creation_time_col])

    # 4. Verification Check
    final_missing_count = df_log[event_time_col].isna().sum()
    rows_imputed = initial_missing_count - final_missing_count

    print(f"Rows imputed (filled): {rows_imputed}")
    print(f"Remaining missing '{event_time_col}' after imputation: {final_missing_count}")
    
    if final_missing_count > 0:
        print("\nNote: Some rows still have missing 'eventtime'. This means their corresponding 'creationtime' was also missing.")
        # If necessary, we would drop these remaining rows in a later step if time data is non-negotiable.

    # 5. Display a sample of imputed rows (where the value was previously NaN)
    # This requires running the verification check on the original data, but we can show the fill-in effect:
    print("\nSample of rows (first 5) to show the filled column:")
    display(df_log[[event_time_col, creation_time_col]].head())

else:
    print(f"Column '{event_time_col}' is already complete. No imputation was performed.")

Rows with missing 'eventtime' before imputation: 6485
Rows imputed (filled): 1753
Remaining missing 'eventtime' after imputation: 4732

Note: Some rows still have missing 'eventtime'. This means their corresponding 'creationtime' was also missing.

Sample of rows (first 5) to show the filled column:


Unnamed: 0,eventtime,creationtime
0,12/25/23 22:19:43,12/25/23 22:19:43
1,12/25/23 22:19:50,12/25/23 22:19:50
2,,
3,,
4,12/25/23 22:19:50,12/25/23 22:19:50


##### 2.2.1.2 EventTime Further Adjustment: Timestamp existence checking & Imputation of Latest Available Time. 
If **creationtime** is also missing, use the latest of the remaining valid timestamps, except **accessedtime** for imputation. This is the most conservative estimate for the transaction commit time, avoiding the bias of timestomping. But we should check first if other timestamps are existing when creationtime isn't. If they don't we can drop the row as they don't exist any timestamps, thus no relevant data for training. 

In [10]:
# Define all relevant timestamp columns
event_time_col = 'eventtime'
creation_time_col = 'creationtime'
# Note: accessedtime is NOT included in the reliable list for IMPUTATION, but it IS used for the initial drop check
reliable_ntfs_timestamps = ['creationtime', 'modifiedtime', 'mftmodifiedtime'] 
all_time_cols = [event_time_col] + reliable_ntfs_timestamps + ['accessedtime'] # 5 total columns

# -------------------------------------------------------------
## Step 1: Conditional Early Drop (Based on Forensic Pattern)
# -------------------------------------------------------------
print("--- Step 1: Conditional Early Drop ---")

# 1. Filter the DataFrame for rows where 'creationtime' is missing (NaN)
creation_time_missing_df = df_log[df_log[creation_time_col].isna()]
total_creation_time_missing = len(creation_time_missing_df)

# 2. Check the condition: Are ALL 5 time columns missing in this subset?
# This identifies rows that are truly time-less (missing eventtime, creationtime, modifiedtime, mftmodifiedtime, and accessedtime)
all_time_missing_mask = df_log[all_time_cols].isna().all(axis=1)
count_all_empty = all_time_missing_mask.sum()

# 3. Report the compelling forensic finding (similar to your request)
print(f"Total rows where '{creation_time_col}' is missing: {total_creation_time_missing}")

# Only calculate percentage if there are missing creationtime rows AND they all align with the 'all empty' count
if total_creation_time_missing > 0 and count_all_empty == total_creation_time_missing:
    print(f"Rows where ALL 5 timestamps are ALSO missing: {count_all_empty}")
    print(f"Percentage of creationtime-missing rows where all others are also missing: 100.00%")
elif total_creation_time_missing > 0:
    # If not 100%, show the actual percentage
    percentage = (count_all_empty / total_creation_time_missing) * 100
    print(f"Rows where ALL 5 timestamps are ALSO missing: {count_all_empty}")
    print(f"Percentage of creationtime-missing rows where all others are also missing: {percentage:.2f}%")
else:
    print("No rows found where 'creationtime' is missing.")


if count_all_empty > 0:
    # Drop these rows immediately
    df_log.drop(df_log[all_time_missing_mask].index, inplace=True)
    
    print(f"\nRemoved {count_all_empty} rows that were missing ALL 5 timestamps.")
    print(f"Rows remaining in LogFile: {len(df_log)}")
else:
    print("No completely time-less rows found. Proceeding to imputation.")


# -------------------------------------------------------------
## Step 2: Imputation Priority 2 (Latest of Reliable Times)
# -------------------------------------------------------------
# Note: Imputation Priority 1 (eventtime with creationtime) is assumed to have run previously.
print("\n--- Step 2: Imputation Priority 2 ---")
remaining_missing_before = df_log[event_time_col].isna().sum()

if remaining_missing_before > 0:
    
    # 1. Create a temporary 'LatestTime' column, using only the *reliable* subset
    df_log['latest_reliable_time_temp'] = df_log[reliable_ntfs_timestamps].max(axis=1)

    # 2. Fill the remaining missing 'eventtime' values with this latest available time
    df_log[event_time_col] = df_log[event_time_col].fillna(df_log['latest_reliable_time_temp'])

    # Calculate and report results
    rows_imputed_p2 = remaining_missing_before - df_log[event_time_col].isna().sum()

    print(f"Imputed remaining 'eventtime' with the LATEST RELIABLE available time. Rows filled: {rows_imputed_p2}")
    
    # Drop the temporary column
    df_log.drop(columns=['latest_reliable_time_temp'], inplace=True)

    # -------------------------------------------------------------
    ## Step 3: Final Clean-up (Drop residual empty records)
    # -------------------------------------------------------------
    final_missing_count = df_log[event_time_col].isna().sum()
    print("\n--- Step 3: Final Clean-up ---")

    # 🎯 Added Print: Show the final count of NaNs before the last drop
    print(f"Final EventTime NaNs before drop: {final_missing_count}")

    if final_missing_count > 0:
        df_log.dropna(subset=[event_time_col], inplace=True)
        
        print(f"Final drop: Removed {final_missing_count} residual rows.")
        print(f"Rows remaining in LogFile: {len(df_log)}.")
    else:
        print("All records now have a valid 'eventtime'. No further time-based rows were dropped. ✅")

else:
    print("No further 'eventtime' NaNs found. Cleaning complete. ✅")

--- Step 1: Conditional Early Drop ---
Total rows where 'creationtime' is missing: 7346
Rows where ALL 5 timestamps are ALSO missing: 4732
Percentage of creationtime-missing rows where all others are also missing: 64.42%

Removed 4732 rows that were missing ALL 5 timestamps.
Rows remaining in LogFile: 8682

--- Step 2: Imputation Priority 2 ---
No further 'eventtime' NaNs found. Cleaning complete. ✅


#### 2.3 Check for Missing Values for all columns
Checking if the dropped values of **eventtime** is effective, and address more columns to clear.

In [11]:
# Check the total number of entries
total_rows = len(df_log)

# Print total rows for context
print(f"--- Missing Value Report for LogFile (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_log.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for LogFile (Total Rows: 8682) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
creationtime,2614,30.11
modifiedtime,2614,30.11
mftmodifiedtime,2614,30.11
accessedtime,2614,30.11
fullpath,988,11.38
filedirectoryname,269,3.1


--------------------------------------------------


##### 2.3.1 Dropping Rows with Empty Timestamps Across creationtime, modifiedtime, mftmodifiedtime, and accessedtime 
Upon observing the presented data, if there are an equal amount of empty rows across all timestamps, this could mean that they are the same rows. Any row that doesn't have any timestamps regardless if they have other information, could not provide any relevant information to us or a model. Thus, we drop them.

In [12]:
# Define the columns (assuming previous cleaning steps left them as datetime objects with NaT for missing)
event_time_col = 'eventtime'
ntfs_timestamps = ['creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime']

# 1. Create a mask for the rows that meet the condition
# Condition A: eventtime is NOT missing (~df_log[event_time_col].isna())
# Condition B: ALL four NTFS timestamps ARE missing (df_log[ntfs_timestamps].isna().all(axis=1))
unusable_time_mask = (
    (~df_log[event_time_col].isna()) & 
    (df_log[ntfs_timestamps].isna().all(axis=1))
)

# 2. Check and Report
rows_to_drop = unusable_time_mask.sum()
initial_total_rows = len(df_log)

print("--- Unusable Time Records Drop Check ---")
print(f"Total rows before check: {initial_total_rows}")
print(f"Rows with valid 'eventtime' but NO NTFS timestamps to compare: {rows_to_drop}")

# 3. Conditional Drop
if rows_to_drop > 0:
    df_log.drop(df_log[unusable_time_mask].index, inplace=True)
    
    rows_remaining = len(df_log)
    print(f"✅ Successfully dropped {rows_to_drop} records due to insufficient timestamp evidence.")
    print(f"Rows remaining in LogFile: {rows_remaining}")
else:
    print("No records found with valid 'eventtime' but missing all four NTFS timestamps.")

--- Unusable Time Records Drop Check ---
Total rows before check: 8682
Rows with valid 'eventtime' but NO NTFS timestamps to compare: 2614
✅ Successfully dropped 2614 records due to insufficient timestamp evidence.
Rows remaining in LogFile: 6068


#### 2.4 Check for Missing Values for all columns
Checking if the dropped values of **creationtime**, **modifiedtime**, **mftmodifiedtime**, **accessedtime**, is effective, and address more columns to clear.

In [13]:
# Check the total number of entries
total_rows = len(df_log)

# Print total rows for context
print(f"--- Missing Value Report for LogFile (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_log.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for LogFile (Total Rows: 6068) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
fullpath,457,7.53


--------------------------------------------------


##### 2.4.1 Dropping Empty filedirectoryname Rows 
This is because a log record that has timestamps and an event, but doesn't identify the file it applies to, is forensically vague. It tells you something happened at a certain time but not to what.

In [14]:
# Define the file identifier column
file_name_col = 'filedirectoryname'

# -------------------------------------------------------------
## Step 1: Check and Report Missing File Names
# -------------------------------------------------------------
initial_total_rows = len(df_log)
missing_file_count = df_log[file_name_col].isna().sum()

print("--- File Directory Name Cleaning ---")
print(f"Total rows before cleaning: {initial_total_rows}")
print(f"Rows with missing '{file_name_col}': {missing_file_count}")

# -------------------------------------------------------------
## Step 2: Conditional Drop
# -------------------------------------------------------------
if missing_file_count > 0:
    # Drop rows where the file directory name is NaN (missing the primary grouping key)
    # We drop them in place as they are forensically useless for this analysis.
    df_log.dropna(subset=[file_name_col], inplace=True)
    
    # 3. Verification
    rows_remaining = len(df_log)
    rows_dropped = missing_file_count
    
    print(f"✅ Successfully dropped {rows_dropped} rows due to missing file identifier.")
    print(f"Rows remaining in LogFile: {rows_remaining}")
else:
    print(f"Column '{file_name_col}' is fully populated. No rows were dropped.")

--- File Directory Name Cleaning ---
Total rows before cleaning: 6068
Rows with missing 'filedirectoryname': 0
Column 'filedirectoryname' is fully populated. No rows were dropped.


#### 2.5 Check for Missing Values for all columns
Checking if the dropped values of **filedirectoryname** is effective, and address more columns to clear.

In [15]:
#Check the total number of entries
total_rows = len(df_log)

# Print total rows for context
print(f"--- Missing Value Report for LogFile (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_log.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for LogFile (Total Rows: 6068) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
fullpath,457,7.53


--------------------------------------------------


##### 2.5.1 Imputing filedirectoryname to empty fullpath 
By filling the missing fullpath with the filedirectoryname and creating a binary flag, we maintain the record's primary timing evidence while informing the model that its location data was suspect. 

In [16]:
# Define columns
fullpath_col = 'fullpath'
filename_col = 'filedirectoryname'
flag_col = 'missingfullpathflaglsn'

# 1. Check initial state
initial_missing_count = df_log[fullpath_col].isna().sum()

print("--- FullPath Imputation and Flagging ---")
print(f"Rows with missing '{fullpath_col}' before imputation: {initial_missing_count}")

if initial_missing_count > 0:
    # 2. Create the binary flag column
    # Flag is 1 where fullpath was originally missing, 0 otherwise. This is a valuable feature for the ML model.
    df_log[flag_col] = df_log[fullpath_col].isna().astype(int)
    print(f"✅ Created binary flag column '{flag_col}' to mark original NaNs.")

    # 3. Impute missing fullpath values with the file name
    df_log[fullpath_col] = df_log[fullpath_col].fillna(df_log[filename_col])

    # 4. Report the result
    final_missing_count = df_log[fullpath_col].isna().sum()
    rows_imputed = initial_missing_count - final_missing_count
    
    print(f"Imputed {rows_imputed} rows using '{filename_col}'.")
    print(f"Remaining missing '{fullpath_col}' after imputation: {final_missing_count}")

    # 5. Final cleanup: Drop residual NaNs (where both fullpath and filename were NaN)
    if final_missing_count > 0:
        df_log.dropna(subset=[fullpath_col], inplace=True)
        print(f"Final drop: Removed {final_missing_count} rows where both fullpath and filename were NaN.")
        print(f"Rows remaining in LogFile: {len(df_log)}")
    
else:
    print(f"Column '{fullpath_col}' is fully populated. No imputation needed.")

--- FullPath Imputation and Flagging ---
Rows with missing 'fullpath' before imputation: 457
✅ Created binary flag column 'missingfullpathflaglsn' to mark original NaNs.
Imputed 457 rows using 'filedirectoryname'.
Remaining missing 'fullpath' after imputation: 0


#### 2.6 Check for Missing Values for all columns
Checking if the values of **fullpath** is effective, and address more columns to clear.

In [17]:
#Check the total number of entries
total_rows = len(df_log)

# Print total rows for context
print(f"--- Missing Value Report for LogFile (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_log.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for LogFile (Total Rows: 6068) ---
--------------------------------------------------
🎉 Congratulations! No missing values found in any column.
--------------------------------------------------


### 3. Present the cleaned LogFile table

In [18]:
# Check LogFile structure
print("\n--- LogFile Inspection ---")
display(df_log.head())
df_log.info(verbose=False, memory_usage='deep')
print(f"LogFile data loaded successfully. Shape: {df_log.shape}")


--- LogFile Inspection ---


Unnamed: 0,lsn,eventtime,event,filedirectoryname,fullpath,creationtime,modifiedtime,mftmodifiedtime,accessedtime,redo,targetvcn,clusterindex,missingfullpathflaglsn
0,10006785113,12/25/23 22:19:43,File Deletion,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:43,12/25/23 22:19:48,12/25/23 22:19:48,12/25/23 22:19:48,Deallocate File Record Segment,0x2FB7,6,0
1,10006785298,12/25/23 22:19:50,File Creation,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,Initialize File Record Segment,0x2FB7,6,0
4,10006786545,12/25/23 22:19:50,File Deletion,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,Deallocate File Record Segment,0x2FB7,6,0
5,10006786739,12/25/23 22:19:43,File Creation,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:43,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,Initialize File Record Segment,0x2FB7,6,0
8,10006787986,12/25/23 22:19:43,File Deletion,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/25/23 22:19:43,12/25/23 22:19:50,12/25/23 22:19:50,12/25/23 22:19:50,Deallocate File Record Segment,0x2FB7,6,0


<class 'pandas.core.frame.DataFrame'>
Index: 6068 entries, 0 to 25686
Columns: 13 entries, lsn to missingfullpathflaglsn
dtypes: int64(3), object(10)
memory usage: 4.5 MB
LogFile data loaded successfully. Shape: (6068, 13)


### 4. Exporting the Cleaned LogFile to data/processed/phase 1 - cleaning folder 


In [19]:
# Observe the format [Sub-Folder Number]-PE-LogFile-Cleaned.csv for consistency 
# Example: 09-PE-LogFile-Cleaned.csv

# Define the target folder path and filename
folder_path = 'data/processed/phase 1 - cleaned'
filename = '09-PE-LogFile-Cleaned.csv'
full_output_path = os.path.join(folder_path, filename)

# Define the columns that need formatting
time_cols = ['eventtime', 'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime']

# 1. Create a COPY of the DataFrame for string formatting
df_export = df_log.copy()

# 2. Convert all time columns to the desired string format
for col in time_cols:
    if col in df_export.columns:
        # 🎯 EFFICIENT FIX: Safely apply the string format only if the column is currently a datetime dtype.
        # This avoids the slow re-parsing and the UserWarning.
        if pd.api.types.is_datetime64_any_dtype(df_export[col]):
            df_export[col] = df_export[col].dt.strftime('%m/%d/%Y %H:%M:%S')
        # ELSE: If it's a string/object type (like NaT converted to a string), we leave it alone.
            
# 3. Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

# 4. Export the formatted DataFrame to CSV with UTF-8 encoding
df_export.to_csv(full_output_path, index=False, encoding='utf-8')

print(f"✅ LogFile table successfully exported to: {full_output_path} and is now ready for Phase 2 - Data Merging")

✅ LogFile table successfully exported to: data/processed/phase 1 - cleaned/09-PE-LogFile-Cleaned.csv and is now ready for Phase 2 - Data Merging


## UsnJrnl Data Cleaning

### 0. Presenting Structure

In [20]:
# Check UsnJrnl structure
print("\n--- UsnJrnl Initial Inspection ---")
display(df_usn.head())
df_usn.info(verbose=False, memory_usage='deep')


--- UsnJrnl Initial Inspection ---


Unnamed: 0,TimeStamp(UTC+8),USN,File/Directory Name,FullPath,EventInfo,SourceInfo,FileAttribute,Carving Flag,FileReferenceNumber,ParentFileReferenceNumber
0,12/25/23 16:47:05,1677721600,amd64_windows-shield-provider.resources_31bf38...,,File_Closed / File_Deleted,Normal,Directory,,0x000100000007BDEC,0x000300000002CC20
1,12/25/23 16:47:05,1677721848,amd64_windows-shield-provider.resources_31bf38...,,File_Closed / File_Deleted,Normal,Normal,,0x0001000000073F99,0x000300000002CC20
2,12/25/23 16:47:05,1677722120,securityhealthagent.dll.mui,,File_Closed / File_Deleted,Normal,Normal,,0x000100000007BE07,0x000100000007BE06
3,12/25/23 16:47:05,1677722240,windowsdefendersecuritycenter.adml,,File_Closed / File_Deleted,Normal,Normal,,0x000100000007D7AA,0x000100000007BE06
4,12/25/23 16:47:05,1677722368,f,,File_Closed / File_Deleted,Normal,Directory,,0x000100000007BE06,0x000100000007BE05


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249559 entries, 0 to 249558
Columns: 10 entries, TimeStamp(UTC+8) to ParentFileReferenceNumber
dtypes: float64(1), int64(1), object(8)
memory usage: 143.0 MB


### 1. Standardize Column Names
First, we must standardize the column names for uniformity.

In [21]:
# 1. Standardize column names: lowercase, remove special characters/parentheses, replace spaces with underscores.
df_usn.columns = (
    df_usn.columns
    .str.lower()
    .str.replace(r'\(.*?\)', '', regex=True) # Remove anything in parentheses (like '(UTC+8)')
    .str.replace('[^a-z0-9_]', '', regex=True) # Remove other non-alphanumeric chars (like slashes)
    .str.replace(' ', '_', regex=False)
    .str.replace('__', '_', regex=False) # Handle double underscores if they result
    .str.strip('_') # Remove leading/trailing underscores
)

# Display the new, clean column names
print("New UsnJrnl Columns:", df_usn.columns.tolist())

# Rename specific columns for clarity if needed (e.g., 'eventdetail' is a good name already)
# If your column is now just 'eventdetail', you are good to go.


New UsnJrnl Columns: ['timestamp', 'usn', 'filedirectoryname', 'fullpath', 'eventinfo', 'sourceinfo', 'fileattribute', 'carvingflag', 'filereferencenumber', 'parentfilereferencenumber']


### 2. Addressing Missing Values

#### 2.1 Check for Missing Values for all columns
Now we can accurately check for missing values in all columns.

In [22]:
# Check the total number of entries
total_rows = len(df_usn)

# Print total rows for context
print(f"--- Missing Value Report for UsnJrnl (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_usn.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for UsnJrnl (Total Rows: 249559) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
carvingflag,249559,100.0
fullpath,114161,45.75


--------------------------------------------------


##### 2.1.1 Dropping carvingflag column 
Since **carvingflag** is 100% missing, the column provides absolutely no data or analytical value.

In [23]:
# Drop the carvingflag column as it is entirely missing (100%)
if 'carvingflag' in df_usn.columns:
    df_usn.drop(columns=['carvingflag'], inplace=True)
    print("✅ Dropped 'carvingflag' column due to 100% missing values.")

✅ Dropped 'carvingflag' column due to 100% missing values.


#### 2.2 Check for Missing Values for all columns
Checking if the dropped column of **carvingflag** is effective, and address more columns to clear.

In [24]:
# Check the total number of entries
total_rows = len(df_usn)

# Print total rows for context
print(f"--- Missing Value Report for UsnJrnl (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_usn.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for UsnJrnl (Total Rows: 249559) ---
--------------------------------------------------
Columns with Missing Values:


Unnamed: 0,Missing Count,Missing Percentage
fullpath,114161,45.75


--------------------------------------------------


##### 2.2.1 Addressing missing fullpath values 
The fullpath column is essential for grouping events and providing location context. Since the missing percentage is manageable and this column is critical for feature engineering, we should impute it. Impute the missing fullpath with filedirectoryname and create a flag to mark where the original path was missing.

In [25]:
# 1. Define columns
filedirectoryname_col = 'filedirectoryname'
fullpath_col = 'fullpath'
flag_col = 'missingfullpathflagusn'
label_col = 'df.usn'

# --- Step 1: Drop carvingflag (100% Missing) ---
if 'carvingflag' in df_usn.columns:
    df_usn.drop(columns=['carvingflag'], inplace=True)
    print("✅ Dropped 'carvingflag' column due to 100% missing values.")

# --- Step 2: FullPath Imputation and Flagging (FIXED) ---
initial_missing_count = df_usn[fullpath_col].isna().sum()

if initial_missing_count > 0 and filedirectoryname_col in df_usn.columns:
    
    print(f"\n--- FullPath Imputation ---")
    
    # 1. Create a binary flag to mark where the fullpath was originally missing
    df_usn[flag_col] = df_usn[fullpath_col].isna().astype(int)
    print(f"✅ Created binary flag column '{flag_col}'.")
    
    # 2. Impute missing fullpath values with the filedirectoryname (SAFE ASSIGNMENT)
    df_usn[fullpath_col] = df_usn[fullpath_col].fillna(df_usn[filedirectoryname_col]) # ⬅️ Put the fixed line HERE

    rows_imputed = initial_missing_count - df_usn[fullpath_col].isna().sum()
    
    print(f"Imputed {rows_imputed} rows using '{filedirectoryname_col}'.")

# --- Step 3: Final Cleanup (Drop residual NaNs in Critical Columns) ---

# Drop rows where 'fullpath' is still missing (meaning filedirectoryname was also missing)
rows_to_drop_fullpath = df_usn[fullpath_col].isna().sum()
if rows_to_drop_fullpath > 0:
    df_usn.dropna(subset=[fullpath_col], inplace=True)
    print(f"Final drop: Removed {rows_to_drop_fullpath} rows where both fullpath and filedirectoryname were missing.")

# Check/Drop rows where the LABEL is missing (critical for ML)
if label_col in df_usn.columns:
    rows_to_drop_label = df_usn[label_col].isna().sum()
    if rows_to_drop_label > 0:
        df_usn.dropna(subset=[label_col], inplace=True)
        print(f"Final drop: Removed {rows_to_drop_label} rows due to missing ML label ('{label_col}').")

print(f"\nUsnJrnl Cleaning Complete. Final Rows: {len(df_usn)}")


--- FullPath Imputation ---
✅ Created binary flag column 'missingfullpathflagusn'.
Imputed 114161 rows using 'filedirectoryname'.

UsnJrnl Cleaning Complete. Final Rows: 249559


#### 2.3 Check for Missing Values for all columns 
Checking if the imputation of  **fullpath** is effective, and if there are more columns need to be addressed.

In [26]:
# Check the total number of entries
total_rows = len(df_usn)

# Print total rows for context
print(f"--- Missing Value Report for UsnJrnl (Total Rows: {total_rows}) ---")
print("-" * 50)

# Use a concise method to count NaNs for all columns
missing_values_report = df_usn.isnull().sum()
missing_percentage_report = (missing_values_report / total_rows) * 100

# Combine the count and percentage into a single DataFrame for clean viewing
missing_df = pd.DataFrame({
    'Missing Count': missing_values_report,
    'Missing Percentage': missing_percentage_report.round(2)
})

# Filter to show only columns with at least one missing value
# (Optional: remove this line if you want to see all columns)
missing_df = missing_df[missing_df['Missing Count'] > 0]

# Sort by the number of missing values (descending)
missing_df.sort_values(by='Missing Count', ascending=False, inplace=True)

# Display the report
if missing_df.empty:
    print("🎉 Congratulations! No missing values found in any column.")
else:
    print("Columns with Missing Values:")
    display(missing_df)

print("-" * 50)

--- Missing Value Report for UsnJrnl (Total Rows: 249559) ---
--------------------------------------------------
🎉 Congratulations! No missing values found in any column.
--------------------------------------------------


### 3. Present the cleaned UsnJrnl

In [27]:
# Check UsnJrnl structure
print("\n--- UsnJrnl  Inspection ---")
display(df_usn.head())
df_usn.info(verbose=False, memory_usage='deep')
print(f"UsnJrnl data loaded successfully. Shape: {df_log.shape}")


--- UsnJrnl  Inspection ---


Unnamed: 0,timestamp,usn,filedirectoryname,fullpath,eventinfo,sourceinfo,fileattribute,filereferencenumber,parentfilereferencenumber,missingfullpathflagusn
0,12/25/23 16:47:05,1677721600,amd64_windows-shield-provider.resources_31bf38...,amd64_windows-shield-provider.resources_31bf38...,File_Closed / File_Deleted,Normal,Directory,0x000100000007BDEC,0x000300000002CC20,1
1,12/25/23 16:47:05,1677721848,amd64_windows-shield-provider.resources_31bf38...,amd64_windows-shield-provider.resources_31bf38...,File_Closed / File_Deleted,Normal,Normal,0x0001000000073F99,0x000300000002CC20,1
2,12/25/23 16:47:05,1677722120,securityhealthagent.dll.mui,securityhealthagent.dll.mui,File_Closed / File_Deleted,Normal,Normal,0x000100000007BE07,0x000100000007BE06,1
3,12/25/23 16:47:05,1677722240,windowsdefendersecuritycenter.adml,windowsdefendersecuritycenter.adml,File_Closed / File_Deleted,Normal,Normal,0x000100000007D7AA,0x000100000007BE06,1
4,12/25/23 16:47:05,1677722368,f,f,File_Closed / File_Deleted,Normal,Directory,0x000100000007BE06,0x000100000007BE05,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249559 entries, 0 to 249558
Columns: 10 entries, timestamp to missingfullpathflagusn
dtypes: int64(2), object(8)
memory usage: 150.9 MB
UsnJrnl data loaded successfully. Shape: (6068, 13)


### 4. Addressing sourceinfo
From the presented table, the **sourceinfo** column seems to contain only one instance: "Normal". Should this be the case, we can drop this column as it has no predicitive power. 

In [28]:
source_col = 'sourceinfo'

if source_col in df_usn.columns:
    
    print(f"--- Unique Values Check for '{source_col}' ---")
    
    # Get the value counts and convert the resulting Series to a DataFrame for clean presentation
    value_counts_df = df_usn[source_col].value_counts(dropna=True).reset_index()
    value_counts_df.columns = ['Source Value', 'Count']
    
    # Handle the count of missing values (NaN) separately
    nan_count = df_usn[source_col].isna().sum()
    
    # Calculate the total number of unique values (including NaN)
    unique_count = df_usn[source_col].nunique(dropna=False)

    print(f"Total unique values (including NaN): {unique_count}")
    print("\nValue Counts:")
    
    # Display the table
    from IPython.display import display
    display(value_counts_df)
    
    if nan_count > 0:
        print(f"Missing Values (NaN) Count: {nan_count}")

    # --- Confirmation/Decision Check ---
    # Case 1: Only 'Normal' and potentially NaNs exist
    if unique_count <= 2 and (unique_count == 1 or (unique_count == 2 and nan_count > 0)):
        print("\n✅ CONFIRMED: The column contains only 'Normal'.")
        print("DECISION: This is a zero-variance feature and should be dropped.")
    else:
        print("\n🛑 WARNING: The column contains more than one substantive unique value and should be retained for feature engineering.")
        
else:
    print(f"Column '{source_col}' not found in df_usn.")

--- Unique Values Check for 'sourceinfo' ---
Total unique values (including NaN): 1

Value Counts:


Unnamed: 0,Source Value,Count
0,Normal,249559



✅ CONFIRMED: The column contains only 'Normal'.
DECISION: This is a zero-variance feature and should be dropped.


#### 4.1 Dropping sourceinfo column
Upon confirmation that there is only one instance to the column, we can now drop it.

In [29]:
# Define the column to be dropped
source_col = 'sourceinfo'

if source_col in df_usn.columns:
    # Drop the 'sourceinfo' column permanently
    df_usn.drop(columns=[source_col], inplace=True)
    
    print(f"✅ Successfully dropped the '{source_col}' column due to zero variance ('Normal' only).")
    print(f"Columns remaining: {list(df_usn.columns)}")
else:
    print(f"Column '{source_col}' was already dropped or not found.")

✅ Successfully dropped the 'sourceinfo' column due to zero variance ('Normal' only).
Columns remaining: ['timestamp', 'usn', 'filedirectoryname', 'fullpath', 'eventinfo', 'fileattribute', 'filereferencenumber', 'parentfilereferencenumber', 'missingfullpathflagusn']


### 5. Present the cleaned UsnJrnl

In [30]:
# Check UsnJrnl structure
print("\n--- UsnJrnl  Inspection ---")
display(df_usn.head())
df_usn.info(verbose=False, memory_usage='deep')
print(f"UsnJrnl data loaded successfully. Shape: {df_usn.shape}")


--- UsnJrnl  Inspection ---


Unnamed: 0,timestamp,usn,filedirectoryname,fullpath,eventinfo,fileattribute,filereferencenumber,parentfilereferencenumber,missingfullpathflagusn
0,12/25/23 16:47:05,1677721600,amd64_windows-shield-provider.resources_31bf38...,amd64_windows-shield-provider.resources_31bf38...,File_Closed / File_Deleted,Directory,0x000100000007BDEC,0x000300000002CC20,1
1,12/25/23 16:47:05,1677721848,amd64_windows-shield-provider.resources_31bf38...,amd64_windows-shield-provider.resources_31bf38...,File_Closed / File_Deleted,Normal,0x0001000000073F99,0x000300000002CC20,1
2,12/25/23 16:47:05,1677722120,securityhealthagent.dll.mui,securityhealthagent.dll.mui,File_Closed / File_Deleted,Normal,0x000100000007BE07,0x000100000007BE06,1
3,12/25/23 16:47:05,1677722240,windowsdefendersecuritycenter.adml,windowsdefendersecuritycenter.adml,File_Closed / File_Deleted,Normal,0x000100000007D7AA,0x000100000007BE06,1
4,12/25/23 16:47:05,1677722368,f,f,File_Closed / File_Deleted,Directory,0x000100000007BE06,0x000100000007BE05,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249559 entries, 0 to 249558
Columns: 9 entries, timestamp to missingfullpathflagusn
dtypes: int64(2), object(7)
memory usage: 137.8 MB
UsnJrnl data loaded successfully. Shape: (249559, 9)


### 6. Export the Cleaned UsnJrnl to data/processed/phase 1 - cleaning folder 

In [31]:

# Observe the format [Sub-Folder Number]-PE-UsnJrnl-Cleaned.csv for consistency
# Example: 09-PE-LogFile-Cleaned.csv

# Define the target folder path and filename
folder_path = 'data/processed/phase 1 - cleaned'
filename = '09-PE-UsnJrnl-Cleaned.csv' 
full_output_path = os.path.join(folder_path, filename)

# Define the columns that need formatting
time_cols = ['timestamp'] 

# 1. Create a COPY of the DataFrame for string formatting
df_export = df_usn.copy() 

# 2. Convert all time columns to the desired string format
for col in time_cols:
    if col in df_export.columns:
        # Safely apply the string format only if the column is currently a datetime dtype.
        if pd.api.types.is_datetime64_any_dtype(df_export[col]):
            df_export[col] = df_export[col].dt.strftime('%m/%d/%Y %H:%M:%S')
        # Otherwise, the column is already a string/object (like a clean NaT string), so we leave it alone.

# 3. Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

# 4. Export the formatted DataFrame to CSV with UTF-8 encoding
df_export.to_csv(full_output_path, index=False, encoding='utf-8')

print(f"✅ USNJRNL table successfully exported to: {full_output_path} and is now ready for Phase 2 - Data Merging")

✅ USNJRNL table successfully exported to: data/processed/phase 1 - cleaned/09-PE-UsnJrnl-Cleaned.csv and is now ready for Phase 2 - Data Merging
