2. Feature Engineering and Data Merging 

# 1. Import Libraries and Load the DataFrame

In [1]:
# Import core libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path
from IPython.display import display # Import display for clean table rendering


# Set display options for better data visibility
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x) # Keeps floats clean

# --- Configuration ---
# Adjust the names if need be
data_dir = Path('data/processed/phase 1 - cleaned')
df_log = '11-PE-LogFile-Cleaned.csv' # CLEANED LOGFILE
df_usn = '11-PE-UsnJrnl-Cleaned.csv' # CLEANING USNJRNL

# Load the datasets
try:
    df_log = pd.read_csv(data_dir / df_log)
    print(f"LogFile data loaded successfully. Shape: {df_log.shape}")
except FileNotFoundError:
    print(f"Error: LogFile CSV not found at {data_dir / df_log}")

try:
    df_usn = pd.read_csv(data_dir / df_usn)
    print(f"UsnJrnl data loaded successfully. Shape: {df_usn.shape}")
except FileNotFoundError:
    print(f"Error: UsnJrnl CSV not found at {data_dir / df_usn}")


# Check LogFile structure
print("\n--- LogFile Initial Inspection ---")
display(df_log.head())
df_log.info(verbose=False, memory_usage='deep')

# Check UsnJrnl structure
print("\n--- UsnJrnl Initial Inspection ---")
display(df_usn.head())
df_usn.info(verbose=False, memory_usage='deep')



LogFile data loaded successfully. Shape: (3011, 13)
UsnJrnl data loaded successfully. Shape: (264432, 9)

--- LogFile Initial Inspection ---


Unnamed: 0,lsn,eventtime,event,filedirectoryname,fullpath,creationtime,modifiedtime,mftmodifiedtime,accessedtime,redo,targetvcn,clusterindex,missingfullpathflaglsn
0,10572439919,12/28/23 21:46:59,File Creation,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,Initialize File Record Segment,0x2F97,0,0
1,10572441177,12/28/23 21:46:59,File Deletion,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,Deallocate File Record Segment,0x2F97,0,0
2,10572441362,12/28/23 21:46:59,File Creation,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,Initialize File Record Segment,0x2F97,0,0
3,10572442609,12/28/23 21:46:59,File Deletion,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,Deallocate File Record Segment,0x2F97,0,0
4,10572442803,12/28/23 21:46:59,File Creation,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,12/28/23 21:46:59,Initialize File Record Segment,0x2F97,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3011 entries, 0 to 3010
Columns: 13 entries, lsn to missingfullpathflaglsn
dtypes: int64(3), object(10)
memory usage: 2.3 MB

--- UsnJrnl Initial Inspection ---


Unnamed: 0,timestamp,usn,filedirectoryname,fullpath,eventinfo,fileattribute,filereferencenumber,parentfilereferencenumber,missingfullpathflagusn
0,12/27/23 2:44:19,1719664640,UDB-User23847576+RemoteGraph.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,File_Closed / File_Deleted,Archive,0x00140000000002D8,0x000200000001B9B1,0
1,12/27/23 2:44:20,1719664784,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,File_Created,Archive,0x00150000000002D8,0x000200000001B9B1,0
2,12/27/23 2:44:20,1719664928,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,File_Created / Data_Added,Archive,0x00150000000002D8,0x000200000001B9B1,0
3,12/27/23 2:44:20,1719665072,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,File_Created / Data_Added / Data_Overwritten,Archive,0x00150000000002D8,0x000200000001B9B1,0
4,12/27/23 2:44:20,1719665216,UDB-User23847576+LocalStorage.sql-journal,\Users\blueangel\AppData\Roaming\Evernote\cond...,File_Created / Data_Added / Data_Overwritten /...,Archive,0x00150000000002D8,0x000200000001B9B1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264432 entries, 0 to 264431
Columns: 9 entries, timestamp to missingfullpathflagusn
dtypes: int64(2), object(7)
memory usage: 150.2 MB


# 2. Convert Timestamps
When the DataFrames were saved to CSV, the precise datetime objects were converted into strings. Upon reading them back, pandas interprets these as strings, which is why the df.info() output shows them as objects (string type). The time columns must be converted back to datetime64[ns] before any time-based feature engineering, such as calculating time deltas, can be performed.

In [2]:
LOG_TIME_COLS = ['eventtime', 'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime']
USN_TIME_COLS = ['timestamp']

# CORRECTED FORMAT: Using %y (lowercase) for 2-digit year (e.g., '23')
TIME_FORMAT_FIXED = '%m/%d/%y %H:%M:%S' 

print("--- Re-converting Time Columns with Corrected Format ('%m/%d/%y %H:%M:%S') ---")

def convert_time_cols_fixed(df, time_cols, format_str):
    for col in time_cols:
        if col in df.columns and df[col].dtype == 'object':
            # Only attempt conversion if the column is still an 'object' (string)
            df[col] = pd.to_datetime(df[col], format=format_str, errors='coerce')
    return df

# Re-run conversion on all LogFile time columns
df_log = convert_time_cols_fixed(df_log, LOG_TIME_COLS, TIME_FORMAT_FIXED)
        
# Re-run conversion on the UsnJrnl timestamp column
df_usn = convert_time_cols_fixed(df_usn, USN_TIME_COLS, TIME_FORMAT_FIXED)
        
print("✅ Time conversion complete.")

# --- Verification ---
print("\n--- Verification of df_log['eventtime'] ---")
print(f"Dtype: {df_log['eventtime'].dtype}")
print(f"First 10 values (MUST NOT be NaT):\n{df_log['eventtime'].head(10)}")

# Check how many successful conversions happened
valid_log_times = df_log['eventtime'].count()
valid_usn_times = df_usn['timestamp'].count()
print(f"\nTotal valid eventtime (LogFile): {valid_log_times} out of {len(df_log)}")
print(f"Total valid timestamp (UsnJrnl): {valid_usn_times} out of {len(df_usn)}")

--- Re-converting Time Columns with Corrected Format ('%m/%d/%y %H:%M:%S') ---
✅ Time conversion complete.

--- Verification of df_log['eventtime'] ---
Dtype: datetime64[ns]
First 10 values (MUST NOT be NaT):
0   2023-12-28 21:46:59
1   2023-12-28 21:46:59
2   2023-12-28 21:46:59
3   2023-12-28 21:46:59
4   2023-12-28 21:46:59
5   2023-12-28 21:46:59
6   2023-12-28 21:46:59
7   2023-12-28 21:46:59
8   2023-12-28 21:46:59
9   2023-12-28 21:46:59
Name: eventtime, dtype: datetime64[ns]

Total valid eventtime (LogFile): 3011 out of 3011
Total valid timestamp (UsnJrnl): 264432 out of 264432


# 3. Final Column Check & Comparision Before Merging 
Display all columns and their data types of our two datasets (LogFile & UsnJrnl). By comparing them, we can deduce how to merge them best.

In [3]:
import pandas as pd
from IPython.display import display

# Assuming df_log and df_usn are loaded and have had their time columns converted

# Get the dtype Series for each DataFrame
log_cols = pd.Series(df_log.dtypes, name='LogFile Dtype').astype(str)
usn_cols = pd.Series(df_usn.dtypes, name='UsnJrnl Dtype').astype(str)

print("--- Column Comparison: LogFile vs. UsnJrnl ---")

# Align the two Series on their index (column names) and fill missing entries
# This creates a perfect side-by-side comparison table
comparison_df = pd.concat([log_cols, usn_cols], axis=1).fillna('--- Not Present ---')
comparison_df.index.name = 'Column Name' 

# Use display() to render the DataFrame as a clean, formatted HTML table
display(comparison_df)

--- Column Comparison: LogFile vs. UsnJrnl ---


Unnamed: 0_level_0,LogFile Dtype,UsnJrnl Dtype
Column Name,Unnamed: 1_level_1,Unnamed: 2_level_1
lsn,int64,--- Not Present ---
eventtime,datetime64[ns],--- Not Present ---
event,object,--- Not Present ---
filedirectoryname,object,object
fullpath,object,object
creationtime,datetime64[ns],--- Not Present ---
modifiedtime,datetime64[ns],--- Not Present ---
mftmodifiedtime,datetime64[ns],--- Not Present ---
accessedtime,datetime64[ns],--- Not Present ---
redo,object,--- Not Present ---


In [4]:
print(df_log['eventtime'].head(10))
print(df_log['eventtime'].dtype)

0   2023-12-28 21:46:59
1   2023-12-28 21:46:59
2   2023-12-28 21:46:59
3   2023-12-28 21:46:59
4   2023-12-28 21:46:59
5   2023-12-28 21:46:59
6   2023-12-28 21:46:59
7   2023-12-28 21:46:59
8   2023-12-28 21:46:59
9   2023-12-28 21:46:59
Name: eventtime, dtype: datetime64[ns]
datetime64[ns]


# 4. Data Merging

**1. Standardization**
* Rename the primary timestamp column in both DataFrames to a common name: timestamp_primary. 
* This aims to combine LogFile's eventtime column with UsnJrnl's timestamp column as they both pertain to the **definitive time of the event**

**2. Source Flagging**
* Create a source column (LogFile or UsnJrnl) to distinguish the origin of each record after merging.
* This is to ensure that even after the vertical merge, we can tell which system generated the record. 

**3. Vertical Concatenation (pd.concat)**
* Stack the records from df_log on top of df_usn. Missing columns will automatically be filled with NaN or NaT (Not a Time).
* The simplest and most effective wat to combine two datasets

**4. Chronological Sort**
* Sort the final master DataFrame by the new timestamp_primary to create a single, unified timeline.



In [5]:

print("--- Starting Vertical Concatenation to Master Timeline ---")

# --- 1. PREP: LogFile (df_log) ---
print("\n1. Preparing df_log...")
log_cols_to_keep = [
    'eventtime', 'fullpath', 'filedirectoryname', 
    'creationtime', 'modifiedtime', 'mftmodifiedtime', 'accessedtime', 
    'lsn', 'event', 'redo', 'targetvcn', 'clusterindex', 'missingfullpathflaglsn'
]
df_log_std = df_log[log_cols_to_keep].copy()
df_log_std.rename(columns={'eventtime': 'timestamp_primary'}, inplace=True)
df_log_std['source'] = 'LogFile'
# Add UsnJrnl-specific columns with NaNs for alignment
df_log_std['usn'] = np.nan 
df_log_std['eventinfo'] = np.nan; df_log_std['fileattribute'] = np.nan; df_log_std['filereferencenumber'] = np.nan
df_log_std['parentfilereferencenumber'] = np.nan; df_log_std['missingfullpathflagusn'] = np.nan


# --- 2. PREP: UsnJrnl (df_usn) ---
print("2. Preparing df_usn...")
usn_cols_to_keep = [
    'timestamp', 'fullpath', 'filedirectoryname',
    'usn', 'eventinfo', 'fileattribute', 'filereferencenumber', 
    'parentfilereferencenumber', 'missingfullpathflagusn' 
]
df_usn_std = df_usn[usn_cols_to_keep].copy()
df_usn_std.rename(columns={'timestamp': 'timestamp_primary'}, inplace=True)
df_usn_std['source'] = 'UsnJrnl'
# Add LogFile-specific columns with NaTs/NaNs for alignment
df_usn_std['creationtime'] = pd.NaT; df_usn_std['modifiedtime'] = pd.NaT; df_usn_std['mftmodifiedtime'] = pd.NaT; df_usn_std['accessedtime'] = pd.NaT
df_usn_std['lsn'] = np.nan; df_usn_std['event'] = np.nan; df_usn_std['redo'] = np.nan; df_usn_std['targetvcn'] = np.nan
df_usn_std['clusterindex'] = np.nan; df_usn_std['missingfullpathflaglsn'] = np.nan


# --- 3. Vertical Concatenation & Sort ---
print("3. Concatenating DataFrames...")
df_master = pd.concat([df_log_std, df_usn_std], ignore_index=True)

print("4. Sorting Master Timeline...")
df_master.sort_values(by='timestamp_primary', inplace=True)

# --- 5. DATA TYPE CLEANUP (NEW INTEGRATED STEP) ---
print("\n5. Cleaning up integer columns (removing .000)...")

INT_COLS_TO_CLEAN = [
    'lsn', 'clusterindex', 'missingfullpathflaglsn', 
    'usn', 'missingfullpathflagusn'
]

for col in INT_COLS_TO_CLEAN:
    if col in df_master.columns and df_master[col].dtype == 'float64':
        # Convert to 'Int64' (capital I) to allow for NaN/missing values
        df_master[col] = df_master[col].astype('Int64')
        
print("✅ Integer cleanup complete.")


# --- Final Output ---
print("\n--- Master Timeline Creation Complete! ---")
print(f"Master DataFrame Shape: {df_master.shape}")
print(f"Total Rows: {len(df_master)}")
print(f"Total Columns: {len(df_master.columns)}")

print("\nFirst 5 rows sorted chronologically (confirming clean integers):")
display(df_master.head())

--- Starting Vertical Concatenation to Master Timeline ---

1. Preparing df_log...
2. Preparing df_usn...
3. Concatenating DataFrames...
4. Sorting Master Timeline...

5. Cleaning up integer columns (removing .000)...
✅ Integer cleanup complete.

--- Master Timeline Creation Complete! ---
Master DataFrame Shape: (267443, 20)
Total Rows: 267443
Total Columns: 20

First 5 rows sorted chronologically (confirming clean integers):


Unnamed: 0,timestamp_primary,fullpath,filedirectoryname,creationtime,modifiedtime,mftmodifiedtime,accessedtime,lsn,event,redo,targetvcn,clusterindex,missingfullpathflaglsn,source,usn,eventinfo,fileattribute,filereferencenumber,parentfilereferencenumber,missingfullpathflagusn
2943,2019-12-07 17:14:52,\Windows\Microsoft.NET\Framework64\v4.0.30319,v4.0.30319,2019-12-07 17:14:52,2023-12-31 01:20:18,2023-12-31 01:20:18,2023-12-31 01:20:18,10588393573,Updating Modified Time,Update Resident Value,0x36C,4,0,LogFile,,,,,,
2940,2019-12-07 17:14:52,\ProgramData\regid.1991-06.com.microsoft,regid.1991-06.com.microsoft,2019-12-07 17:14:52,2023-12-28 22:42:03,2023-12-28 22:42:03,2023-12-28 22:42:03,10588391841,Updating Modified Time,Update Resident Value,0x187,4,0,LogFile,,,,,,
2942,2019-12-07 17:14:52,\Windows\Microsoft.NET\Framework\v4.0.30319,v4.0.30319,2019-12-07 17:14:52,2023-12-31 01:20:17,2023-12-31 01:20:17,2023-12-31 01:20:17,10588393501,Updating Modified Time,Update Resident Value,0x362,4,0,LogFile,,,,,,
2941,2022-09-08 11:13:03,\ProgramData\USOShared\Logs\User,User,2022-09-08 11:13:03,2023-12-31 01:15:25,2023-12-31 01:15:25,2023-12-31 01:15:25,10588391901,Updating Modified Time,Update Resident Value,0x189,2,0,LogFile,,,,,,
2949,2022-12-16 16:12:44,\Windows\Prefetch,Prefetch,2022-12-16 16:12:44,2023-12-31 01:21:10,2023-12-31 01:21:10,2023-12-31 01:21:10,10588447680,Updating Modified Time,Update Resident Value,0x6586,2,0,LogFile,,,,,,


## 4.1 Data Merging Output Explanation 

We identified columns common to both DataFrames and columns unique to each, ensuring a consistent structure in the final `df_master` with **20 columns**.

For reference, the initial rows and columns of the cleaned LogFile and UsnJrnl respectively are as follows: 

| Source | Rows | Colums| 
| :--- | :--- | :--- |
| **`LogFile-Cleaned.csv`** | `8603` | `13` | 
| **`UsnJrnl-Cleaned.csv`** | `316817` | `10` | 

Here is a breakdown of the 20 columns and how they were merged:

### Columns Merged in `df_master`

The 20 columns in the final Master Timeline (`df_master`) can be categorized based on their origin:

#### 1. Unified Timeline Columns = 3

These columns existed in both original DataFrames and were **standardized** to provide the core context for the unified timeline.

| Column Name | Source 1 (`df_log`) | Source 2 (`df_usn`) | Purpose |
| :--- | :--- | :--- | :--- |
| **`timestamp_primary`** | `eventtime` | `timestamp` | The **single chronological anchor** used for sorting all events. |
| **`fullpath`** | `fullpath` | `fullpath` | The full path to the file; critical for **grouping events by file**. |
| **`filedirectoryname`** | `filedirectoryname` | `filedirectoryname` | The name of the file or directory; used for contextual grouping. |

#### 2. LogFile (NTFS Journal) Specific Columns = 9 

These columns contain low-level NTFS operation details. They hold **actual values** for `LogFile` records and are filled with **`<NA>`** or **`NaN`** for `UsnJrnl` records.

| Column Name | Origin | Data Type | Purpose |
| :--- | :--- | :--- | :--- |
| **`creationtime`** | `creationtime` | `NaT` | File's recorded creation timestamp. |
| **`modifiedtime`** | `modifiedtime` | `NaT` | File's recorded modification timestamp. |
| **`mftmodifiedtime`**| `mftmodifiedtime` | `NaT` | File's recorded MFT entry modification timestamp. |
| **`accessedtime`** | `accessedtime` | `NaT` | File's recorded access timestamp. |
| **`lsn`** | `lsn` | `<NA>` | The **Log Sequence Number**, a unique, ordered ID for NTFS journal events. |
| **`event`** | `event` | `NaN` | The NTFS event type (e.g., 'File Creation'). |
| **`redo`** | `redo` | `NaN` | The NTFS redo operation command. |
| **`targetvcn`** | `targetvcn` | `NaN` | Target Virtual Cluster Number. |
| **`clusterindex`** | `clusterindex` | `<NA>` | Cluster index value. |
| **`missingfullpathflaglsn`**| `missing_fullpath_flag` | `<NA>` | Flag indicating if the full path was missing/reconstructed in the LSN analysis. |

#### 3. UsnJrnl (Change Journal) Specific Columns = 6

These columns contain high-level change journal details. They hold **actual values** for `UsnJrnl` records and are filled with **`<NA>`** or **`NaN`** for `LogFile` records.

| Column Name | Origin | Data Type | Purpose |
| :--- | :--- | :--- | :--- |
| **`usn`** | `usn` | `<NA>` | **Update Sequence Number**, the primary event ID for the UsnJrnl. |
| **`eventinfo`** | `eventinfo` | `NaN` | High-level file action (e.g., `DATA_OVERWRITE`, `FILE_RENAME_OLD_NAME`). |
| **`fileattribute`** | `fileattribute` | `NaN` | File attributes (e.g., archive, system, hidden). |
| **`filereferencenumber`**| `filereferencenumber` | `NaN` | Unique MFT ID of the file. |
| **`parentfilereferencenumber`**| `parentfilereferencenumber` | `NaN` | Unique MFT ID of the file's parent directory. |
| **`missingfullpathflagusn`**| `missing_fullpath_flag_usn` | `<NA>` | Flag indicating if the full path was missing/reconstructed in the USN analysis. |

#### 4. Source Flag = 1

| Column Name | Origin | Data Type | Purpose |
| :--- | :--- | :--- | :--- |
| **`source`** | Calculated | `object` | Distinguishes whether the record came from **`LogFile`** or **`UsnJrnl`**. |

# 5. Exporting the Merged CSV to file directory data/processed/phase 2 - data merged

In [6]:
# Filename should be [SubFolder]-PE-Merged.csv
# Define the output directory and filename
OUTPUT_DIR = 'data/processed/phase 2 - data merged'
OUTPUT_FILENAME = '11-PE-Merged.csv'
OUTPUT_FILEPATH = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

# --- 1. Ensure the output directory exists ---
# This is a crucial step to prevent FileNotFoundError
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Created directory (if necessary): {OUTPUT_DIR}")
except Exception as e:
    # Print error but continue, as the directory might exist or be handled by the environment
    print(f"Warning: Could not create directory {OUTPUT_DIR}. Attempting export anyway. Error: {e}")


# --- 2. Export the DataFrame to CSV ---
# We use date_format to ensure all datetime columns are written in a consistent, readable format.
# We set index=False because the index is not needed.
try:
    df_master.to_csv(
        OUTPUT_FILEPATH, 
        index=False, 
        encoding='utf-8', 
        date_format='%Y-%m-%d %H:%M:%S' 
    )
    print(f"\n✅ Merged Cleaned LogFile and UsnJrnl Successfully exported to and is ready for Phase 2.1 - Merging All Sub-Folder :")
    print(f"   {OUTPUT_FILEPATH}")

except NameError:
    print("\nERROR: The DataFrame 'df_master' was not found.")
    print("Please ensure the entire Phase 2 merging block was run before attempting to export.")
except Exception as e:
    print(f"\nFATAL ERROR during export: {e}")


Created directory (if necessary): data/processed/phase 2 - data merged

✅ Merged Cleaned LogFile and UsnJrnl Successfully exported to and is ready for Phase 2.1 - Merging All Sub-Folder :
   data/processed/phase 2 - data merged/11-PE-Merged.csv
