In [1]:
import pandas as pd
import numpy as np
import os
import bz2
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

class CANBusML:
    def __init__(self, window_size=10):
        self.window_size = window_size

    def load_data(self, file_path):
        """Load and preprocess CAN bus data."""
        extracted_file_path = file_path.replace(".bz2", "")
        
        if file_path.endswith(".bz2"):
            with bz2.BZ2File(file_path, "rb") as fr, open(extracted_file_path, "wb") as fw:
                fw.write(fr.read())
            file_path = extracted_file_path

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Error: File '{file_path}' not found.")

        data = []
        with open(file_path, "r") as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) < 3:
                    continue  
                try:
                    timestamp = float(parts[0].strip("()"))
                    can_id, payload = parts[2].split("#")

                    data.append({
                        "timestamp": timestamp,
                        "datetime": datetime.fromtimestamp(timestamp),
                        "can_id": can_id,
                        "payload": payload,
                        "payload_length": len(payload),
                    })
                except (ValueError, IndexError):
                    continue  

        df = pd.DataFrame(data)
        print("✅ Data Loaded Successfully!")
        print(df.head())
        return df

    def preprocess_data(self, df):
        """Feature Engineering"""
        df['traffic_type'] = np.where(df['payload_length'] > 50, 1, 0)  
        df["time_diff"] = df["timestamp"].diff().fillna(0)  
        df['rolling_payload'] = df['payload_length'].rolling(window=self.window_size, min_periods=1).mean()

        label_encoder = LabelEncoder()
        df['can_id'] = label_encoder.fit_transform(df['can_id'])  

        scaler = StandardScaler()
        df[['payload_length', 'time_diff', 'rolling_payload']] = scaler.fit_transform(df[['payload_length', 'time_diff', 'rolling_payload']])

        return df.dropna()

    def split_data(self, df):
        """Train-Test Split"""
        features = ['can_id', 'payload_length', 'time_diff', 'rolling_payload']
        X = df[features]
        y = df['traffic_type']

        return train_test_split(X, y, test_size=0.2, random_state=42)
# Random_Forest 
    
    def train_random_forest(self, X_train, y_train, X_test, y_test):
        """Train Random Forest"""
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"🎯 Random Forest Accuracy: {accuracy:.4f}")
        return model
# XGBoost 

    def train_xgboost(self, X_train, y_train, X_test, y_test):
        """Train XGBoost"""
        model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"🚀 XGBoost Accuracy: {accuracy:.4f}")
        return model
# Long Short-Term Memory (LSTM)
    def train_lstm(self, X_train, y_train, X_test, y_test):
        """Train LSTM for sequence prediction"""
        X_train = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
        X_test = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)),
            Dropout(0.2),
            LSTM(50),
            Dropout(0.2),
            Dense(1, activation="sigmoid")
        ])

        model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
        model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

        loss, accuracy = model.evaluate(X_test, y_test)
        print(f"🔥 LSTM Accuracy: {accuracy:.4f}")
        return model

# 🏁 Run Pipeline
pipeline = CANBusML(window_size=10)
file_path = "/workspaces/myfolder/full_data_capture.log.bz2"

df = pipeline.load_data(file_path)
df = pipeline.preprocess_data(df)
X_train, X_test, y_train, y_test = pipeline.split_data(df)

rf_model = pipeline.train_random_forest(X_train, y_train, X_test, y_test)
xgb_model = pipeline.train_xgboost(X_train, y_train, X_test, y_test)
lstm_model = pipeline.train_lstm(X_train, y_train, X_test, y_test)


2025-02-16 06:42:47.419530: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-16 06:42:47.423381: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-16 06:42:47.435308: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739688167.454936    1804 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739688167.460952    1804 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-16 06:42:47.482305: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

✅ Data Loaded Successfully!
      timestamp                   datetime can_id           payload  \
0  1.536574e+09 2018-09-10 10:06:04.242068    1C8  83FF0000FFFE3BFF   
1  1.536574e+09 2018-09-10 10:06:04.242212    1E9  0000000E00010000   
2  1.536574e+09 2018-09-10 10:06:04.242485    232  0000000000000000   
3  1.536574e+09 2018-09-10 10:06:04.242641    348        000000001B   
4  1.536574e+09 2018-09-10 10:06:04.242807    34A        000000001B   

   payload_length  
0              16  
1              16  
2              16  
3              10  
4              10  
🎯 Random Forest Accuracy: 1.0000


Parameters: { "use_label_encoder" } are not used.



🚀 XGBoost Accuracy: 1.0000
Epoch 1/5


2025-02-16 06:43:22.741393: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)


[1m67252/67252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0018 - val_accuracy: 1.0000 - val_loss: 8.7700e-12
Epoch 2/5
[1m67252/67252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 5ms/step - accuracy: 1.0000 - loss: 4.5635e-11 - val_accuracy: 1.0000 - val_loss: 3.0851e-12
Epoch 3/5
[1m67252/67252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 5ms/step - accuracy: 1.0000 - loss: 2.1211e-11 - val_accuracy: 1.0000 - val_loss: 1.7925e-12
Epoch 4/5
[1m67252/67252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 5ms/step - accuracy: 1.0000 - loss: 1.4422e-11 - val_accuracy: 1.0000 - val_loss: 1.2412e-12
Epoch 5/5
[1m67252/67252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 5ms/step - accuracy: 1.0000 - loss: 1.0480e-11 - val_accuracy: 1.0000 - val_loss: 9.4489e-13
[1m16813/16813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 2ms/step - accuracy: 1.0000 - loss: 9.4360e-13
🔥 LSTM Accuracy: 1.00