#### **Library Imports**

In [15]:
### Data Ingestion
import tempfile
from pathlib import Path
import py7zr

# Data Manipulation
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [74]:
ROOT = Path.cwd().parent
path = ROOT / "data" / "nov_data" / "_parquet_cache" / "5FT0217"

In [None]:
# Grab two parquet files for exploration
files = sorted(path.glob("2025-11-0*.parquet"))[:2]

In [None]:
# Validate paths
files

[PosixPath('/Users/parisheard/Documents/umich/courses/mastery_project/data/nov_data/_parquet_cache/5FT0217/2025-11-01.parquet'),
 PosixPath('/Users/parisheard/Documents/umich/courses/mastery_project/data/nov_data/_parquet_cache/5FT0217/2025-11-02.parquet')]

In [None]:
# Extract date from filename and concatenate DataFrames
dfs = []
for f in files:
    df = pd.read_parquet(f)
    df['Date'] = f.stem.split("_")[0]  # Extract date from filename
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
df['Date'] = pd.to_datetime(df['Date'])

In [80]:
df.head()

Unnamed: 0,Timestamp,EEC1_Engine.ActualEngPercentTorque,EEC1_Engine.DriversDemandEngPercentTorque,EEC1_Engine.EngSpeed,EEC1_Engine.SrcAddrssOfCntrllngDvcForEngCtrl,LFE_Engine.EngFuelRate,EC1_Engine.EngReferenceTorque,EEC2_Engine.AccelPedalPos1,EEC2_Engine.ActlMaxAvailableEngPercentTorque,CCVS1_Engine.BrakeSwitch,...,TTC0016_FOC,TTC0017_FMI,TTC0017_FOC,NavSatNum,JammingSt,cwSuppress,HDOP,GpsReset,AutoMountState,Date
0,32228.863,0.0,0.0,1387.75,11.0,0.0,2386.0,0.0,44.0,0.0,...,0.0,7.0,3.0,28.0,0.0,21.0,0.75,0.0,0.0,2025-11-01
1,32228.963,0.0,0.0,1388.0,11.0,0.0,2386.0,0.0,44.0,0.0,...,0.0,7.0,3.0,28.0,0.0,21.0,0.75,0.0,0.0,2025-11-01
2,32229.064,0.0,0.0,1388.25,11.0,0.0,2386.0,0.0,44.0,0.0,...,0.0,7.0,3.0,28.0,0.0,21.0,0.75,0.0,0.0,2025-11-01
3,32229.164,0.0,0.0,1385.75,11.0,0.0,2386.0,0.0,44.0,0.0,...,0.0,7.0,3.0,28.0,0.0,21.0,0.75,0.0,0.0,2025-11-01
4,32229.264,0.0,0.0,1385.5,11.0,0.0,2386.0,0.0,44.0,0.0,...,0.0,7.0,3.0,28.0,0.0,21.0,0.75,0.0,0.0,2025-11-01


In [None]:
# Sort by Date and Timestamp
df = df.sort_values(['Date', 'Timestamp']).reset_index(drop=True)

In [86]:
# Drop all rows with NaN values
df = df.dropna()

In [87]:
df.head()

Unnamed: 0,Timestamp,EEC1_Engine.ActualEngPercentTorque,EEC1_Engine.DriversDemandEngPercentTorque,EEC1_Engine.EngSpeed,EEC1_Engine.SrcAddrssOfCntrllngDvcForEngCtrl,LFE_Engine.EngFuelRate,EC1_Engine.EngReferenceTorque,EEC2_Engine.AccelPedalPos1,EEC2_Engine.ActlMaxAvailableEngPercentTorque,CCVS1_Engine.BrakeSwitch,...,TTC0016_FOC,TTC0017_FMI,TTC0017_FOC,NavSatNum,JammingSt,cwSuppress,HDOP,GpsReset,AutoMountState,Date
0,2.804,54.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,...,0.0,255.0,0.0,24.0,0.0,15.0,0.64,0.0,0.0,2025-11-01
1,2.904,54.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,...,0.0,255.0,0.0,24.0,0.0,15.0,0.64,0.0,0.0,2025-11-01
2,3.004,54.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,...,0.0,255.0,0.0,24.0,0.0,15.0,0.64,0.0,0.0,2025-11-01
3,3.104,54.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,...,0.0,255.0,0.0,24.0,0.0,15.0,0.64,0.0,0.0,2025-11-01
4,3.2,7.0,0.0,651.25,0.0,2.7,2386.0,0.0,48.8,0.0,...,0.0,255.0,0.0,22.0,0.0,0.0,0.76,0.0,0.0,2025-11-01


In [88]:
df.tail()

Unnamed: 0,Timestamp,EEC1_Engine.ActualEngPercentTorque,EEC1_Engine.DriversDemandEngPercentTorque,EEC1_Engine.EngSpeed,EEC1_Engine.SrcAddrssOfCntrllngDvcForEngCtrl,LFE_Engine.EngFuelRate,EC1_Engine.EngReferenceTorque,EEC2_Engine.AccelPedalPos1,EEC2_Engine.ActlMaxAvailableEngPercentTorque,CCVS1_Engine.BrakeSwitch,...,TTC0016_FOC,TTC0017_FMI,TTC0017_FOC,NavSatNum,JammingSt,cwSuppress,HDOP,GpsReset,AutoMountState,Date
925253,9833.438,130.0,0.0,609.25,0.0,0.0,2386.0,0.0,0.0,0.0,...,0.0,255.0,0.0,29.0,0.0,17.0,0.75,0.0,0.0,2025-11-02
925254,9833.537,130.0,0.0,580.25,0.0,0.0,2386.0,0.0,0.0,0.0,...,0.0,255.0,0.0,29.0,0.0,17.0,0.75,0.0,0.0,2025-11-02
925255,9833.638,130.0,0.0,551.25,0.0,0.0,2386.0,0.0,0.0,0.0,...,0.0,255.0,0.0,29.0,0.0,17.0,0.75,0.0,0.0,2025-11-02
925256,9833.737,130.0,0.0,519.0,0.0,0.0,2386.0,0.0,0.0,0.0,...,0.0,255.0,0.0,29.0,0.0,17.0,0.75,0.0,0.0,2025-11-02
925257,9833.838,130.0,0.0,487.0,0.0,0.0,2386.0,0.0,0.0,0.0,...,0.0,255.0,0.0,29.0,0.0,17.0,0.75,0.0,0.0,2025-11-02


In [92]:
print(f'Dates: {df["Date"].unique()}')

Dates: <DatetimeArray>
['2025-11-01 00:00:00', '2025-11-02 00:00:00']
Length: 2, dtype: datetime64[ns]


In [94]:
ts = df['Timestamp']
diffs = ts.diff()

print('Basic Stats (in seconds):')
print(diffs.describe())

print('\nQuantiles:')
print(diffs.quantile([0.5, 0.9, 0.99, 0.999]))

print('\nNegative Diffs (Resets):', ( diffs < 0 ).sum())
print('Zero Diffs (Duplicates):', ( diffs == 0 ).sum())
print('Positive Diffs (Normal):', ( diffs > 0 ).sum())
print('Diff Gap (>60s):', ( diffs > 60 ).sum())

Basic Stats (in seconds):
count    925244.000000
mean          0.010625
std          50.317980
min      -46345.876000
25%           0.011000
50%           0.034000
75%           0.087000
max       13952.726000
Name: Timestamp, dtype: float64

Quantiles:
0.500    0.034
0.900    0.101
0.990    0.102
0.999    0.102
Name: Timestamp, dtype: float64

Negative Diffs (Resets): 1
Zero Diffs (Duplicates): 13950
Positive Diffs (Normal): 911293
Diff Gap (>60s): 1


Locating the Negative Differential

In [97]:
neg_idx = df['Timestamp'].diff().lt(0)
df.loc[neg_idx, ['Date', 'Timestamp']].head()

Unnamed: 0,Date,Timestamp
580959,2025-11-02,3.597


In [99]:
prev_idx = df['Timestamp'].diff().idxmin()
df.loc[prev_idx-5:prev_idx+5, ['Date', 'Timestamp']]

Unnamed: 0,Date,Timestamp
580954,2025-11-01,46349.371
580955,2025-11-01,46349.473
580959,2025-11-02,3.597
580960,2025-11-02,3.605
580961,2025-11-02,3.697
580962,2025-11-02,3.705
580963,2025-11-02,3.797
580964,2025-11-02,3.805


Locating the Positive Differential

In [103]:
pos_idx = df['Timestamp'].diff().gt(1000)
df.loc[pos_idx, ['Date', 'Timestamp']].head()

Unnamed: 0,Date,Timestamp
440108,2025-11-01,32228.863


In [105]:
prev_idx = df['Timestamp'].diff().idxmax()
df.loc[prev_idx-5:prev_idx+5, ['Date', 'Timestamp']]

Unnamed: 0,Date,Timestamp
440103,2025-11-01,18275.736
440104,2025-11-01,18275.836
440105,2025-11-01,18275.936
440106,2025-11-01,18276.037
440107,2025-11-01,18276.137
440108,2025-11-01,32228.863
440109,2025-11-01,32228.963
440110,2025-11-01,32229.064
440111,2025-11-01,32229.164
440112,2025-11-01,32229.264


Analysis of timestamp first differences confirms that the data is sampled at approximately 10Hz (99th percentile: 0.102s), indicating a stable high-frequency logging under normal operation.

Two distinct dicontinuity behaviors were isolated and identified, along with insights from the client:

1. **Negative Timestamp Differences (Decrease)**
These events occur at day boundaries (presumably midnight, relative to timezone considerations), and represent system resets where the timestamp restarts from a low value. This aligns with day transitions or system reinitialization across files.

2. **Large Positive Timestamp Differences (Increase)**
These events occur, presumably, within the same calendar day and were confirmed by the client to represent system reboots as a driver restarts their vehicle. These events do not reset the timestamp to zero, but instead resume a new, larger baseline value.

To robustly segment data into continuous operational sessions ("time periods"), both conditions must be treated as hard boundaries with established limits. Given the 10Hz sampling rate in this consideration, any timestamp difference significantly larger than expected sampling jitter safely indicates a reboot or discontinuity.

All downstream event detection and context window extraction should be constrained within these inferred time periods to prevent stitching together unrelated driving sessions, and ensure simultaneously sessions remain connnected.

Override

In [None]:
# Throttle Override Status - 6: Override Active, Else: iQC Active
df['iQC1.iQCMode'].value_counts().sort_index()

In [None]:
# Brake Switch Status - 0: Not Pressed, 1: Pressed
df['CCVS1_Cab_Controller.BrakeSwitch'].value_counts()

In [None]:
# Brake Switch Status - 0: Not Pressed, 1: Pressed
df['CCVS1_Engine.BrakeSwitch'].value_counts()

#### Override Events
1. Throttle override
2. Brake override
3. Cruise disengage override
4. Speed governer / iQC override
5. PCC system override

In [None]:
columns = ['override_id', 'override_timestamp', 'override_type', 'cruise_active_pre', 'vehicle_speed', 'driver_input_type', 'system_override_flag']
overrides = pd.DataFrame(columns=columns)