In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import re

In [2]:
df = pd.read_csv('/kaggle/input/car-hacking-dataset/RPM_dataset.csv')
df.columns = ['Timestamp', 'CAN ID', 'DLC', 'DATA0', 'DATA1', 'DATA2', 'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7', 'Flag']
df.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1478191000.0,018f,8,fe,3b,00,00,00,3c,00,00,R
1,1478191000.0,0260,8,19,22,22,30,ff,8f,6e,3f,R
2,1478191000.0,02a0,8,60,00,83,1d,96,02,bd,00,R
3,1478191000.0,0329,8,dc,b8,7e,14,11,20,00,14,R
4,1478191000.0,0545,8,d8,00,00,83,00,00,00,00,R


In [3]:
# Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
df.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478191030,018f,8,fe,3b,00,00,00,3c,00,00,R
1,1970-01-01 00:00:01.478191030,0260,8,19,22,22,30,ff,8f,6e,3f,R
2,1970-01-01 00:00:01.478191030,02a0,8,60,00,83,1d,96,02,bd,00,R
3,1970-01-01 00:00:01.478191030,0329,8,dc,b8,7e,14,11,20,00,14,R
4,1970-01-01 00:00:01.478191030,0545,8,d8,00,00,83,00,00,00,00,R


In [4]:
df.nunique()

Timestamp    2436
CAN ID         26
DLC             2
DATA0         113
DATA1          85
DATA2          89
DATA3          28
DATA4         192
DATA5         256
DATA6          80
DATA7         256
Flag            2
dtype: int64

In [5]:
df_2 = df.copy()

In [6]:
# Define columns to check (DATA0-DATA7)
data_columns = ['DATA0', 'DATA1', 'DATA2', 'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7']

# Create regex pattern for valid hex
hex_pattern = r'^[0-9A-Fa-f]{2}$'

# Check for non-hex values
mask = df_2[data_columns].apply(lambda col: ~col.str.match(hex_pattern, na=False))

# Get rows with any invalid entries
invalid_rows = df_2[mask.any(axis=1)]

# Show results
print("Rows with non-hex values in DATA columns:")
print(invalid_rows if not invalid_rows.empty else "No non-hex values found")

Rows with non-hex values in DATA columns:
                            Timestamp CAN ID  DLC DATA0 DATA1 DATA2 DATA3  \
42      1970-01-01 00:00:01.478191030   05f0    2    01    00     R   NaN   
134     1970-01-01 00:00:01.478191030   05f0    2    01    00     R   NaN   
227     1970-01-01 00:00:01.478191030   05f0    2    01    00     R   NaN   
319     1970-01-01 00:00:01.478191030   05f0    2    01    00     R   NaN   
412     1970-01-01 00:00:01.478191030   05f0    2    01    00     R   NaN   
...                               ...    ...  ...   ...   ...   ...   ...   
4621281 1970-01-01 00:00:01.478201208   05f0    2    01    00     R   NaN   
4621373 1970-01-01 00:00:01.478201208   05f0    2    01    00     R   NaN   
4621466 1970-01-01 00:00:01.478201208   05f0    2    01    00     R   NaN   
4621558 1970-01-01 00:00:01.478201208   05f0    2    01    00     R   NaN   
4621651 1970-01-01 00:00:01.478201209   05f0    2    01    00     R   NaN   

        DATA4 DATA5 DATA6 DATA7 F

In [7]:
# Define validation patterns
can_id_pattern = r'^[0-9A-Fa-f]{4}$'  # 4-character hex

# Check CAN ID column
can_id_mask = ~df_2['CAN ID'].str.match(can_id_pattern, na=False)
invalid_can_id = df_2[can_id_mask]

# Display results
print("=== Invalid CAN IDs ===")
print(invalid_can_id if not invalid_can_id.empty else "All CAN IDs are valid hex")

# Optional: Show specific invalid values
if not invalid_can_id.empty:
    print("\nInvalid CAN ID entries:")
    print(df_2.loc[can_id_mask, 'CAN ID'])


=== Invalid CAN IDs ===
All CAN IDs are valid hex


In [8]:
df_3 = df_2.copy()

In [9]:
df_3.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478191030,018f,8,fe,3b,00,00,00,3c,00,00,R
1,1970-01-01 00:00:01.478191030,0260,8,19,22,22,30,ff,8f,6e,3f,R
2,1970-01-01 00:00:01.478191030,02a0,8,60,00,83,1d,96,02,bd,00,R
3,1970-01-01 00:00:01.478191030,0329,8,dc,b8,7e,14,11,20,00,14,R
4,1970-01-01 00:00:01.478191030,0545,8,d8,00,00,83,00,00,00,00,R


In [10]:
# Function to convert hex to decimal
def hex_to_int(hex_str: str) -> int:
    try:
        return hex_str  # Convert hex to int
    except ValueError:
        return np.nan 

# Convert all DATA columns
for col in df_3.columns[1:-1]:  # Exclude 'Flag' column
    df_3[col] = df_3[col].apply(hex_to_int)

In [11]:
df_4 = df_3.copy()

In [12]:
df_4.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478191030,018f,8,fe,3b,00,00,00,3c,00,00,R
1,1970-01-01 00:00:01.478191030,0260,8,19,22,22,30,ff,8f,6e,3f,R
2,1970-01-01 00:00:01.478191030,02a0,8,60,00,83,1d,96,02,bd,00,R
3,1970-01-01 00:00:01.478191030,0329,8,dc,b8,7e,14,11,20,00,14,R
4,1970-01-01 00:00:01.478191030,0545,8,d8,00,00,83,00,00,00,00,R


In [13]:
df_4.isnull().sum()


Timestamp        0
CAN ID           0
DLC              0
DATA0            0
DATA1            0
DATA2            0
DATA3        41476
DATA4        41476
DATA5        41476
DATA6        41476
DATA7        41476
Flag         41476
dtype: int64

In [14]:
df_5 = df_4.dropna().copy()
df_5.shape

(4580225, 12)

In [15]:
df_5.isnull().sum()

Timestamp    0
CAN ID       0
DLC          0
DATA0        0
DATA1        0
DATA2        0
DATA3        0
DATA4        0
DATA5        0
DATA6        0
DATA7        0
Flag         0
dtype: int64

In [16]:
# Define columns to check (DATA0-DATA7)
data_columns = ['DATA0', 'DATA1', 'DATA2', 'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7']

# Create regex pattern for valid hex
hex_pattern = r'^[0-9A-Fa-f]{2}$'

# Check for non-hex values
mask = df_5[data_columns].apply(lambda col: ~col.str.match(hex_pattern, na=False))

# Get rows with any invalid entries
invalid_rows = df_5[mask.any(axis=1)]

# Show results
print("Rows with non-hex values in DATA columns:")
print(invalid_rows if not invalid_rows.empty else "No non-hex values found")

Rows with non-hex values in DATA columns:
No non-hex values found


In [17]:
df_6 = df_5.copy()

In [18]:
df_6.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478191030,018f,8,fe,3b,00,00,00,3c,00,00,R
1,1970-01-01 00:00:01.478191030,0260,8,19,22,22,30,ff,8f,6e,3f,R
2,1970-01-01 00:00:01.478191030,02a0,8,60,00,83,1d,96,02,bd,00,R
3,1970-01-01 00:00:01.478191030,0329,8,dc,b8,7e,14,11,20,00,14,R
4,1970-01-01 00:00:01.478191030,0545,8,d8,00,00,83,00,00,00,00,R


In [19]:
# Function to convert hex to decimal
def hex_to_int(hex_str: str) -> int:
    try:
        return int(str(hex_str).strip(), 16)   # Convert hex to int
    except ValueError:
        return np.nan 

# Convert all DATA columns
for col in df_6.columns[1:-1]:  # Exclude 'Flag' column
    df_6[col] = df_6[col].apply(hex_to_int)

In [20]:
df_6.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478191030,399,8,254,59,0,0,0,60,0,0,R
1,1970-01-01 00:00:01.478191030,608,8,25,34,34,48,255,143,110,63,R
2,1970-01-01 00:00:01.478191030,672,8,96,0,131,29,150,2,189,0,R
3,1970-01-01 00:00:01.478191030,809,8,220,184,126,20,17,32,0,20,R
4,1970-01-01 00:00:01.478191030,1349,8,216,0,0,131,0,0,0,0,R
