<a href="https://colab.research.google.com/github/remyaP12/labcycle/blob/main/vehicle94%25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score,
    mean_absolute_error, mean_squared_error, r2_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings('ignore')

In [2]:
SEQ_LEN = 30          # longer sequences: 20‚Äì40
TRAIN_RATIO = 0.8     # 80/20 split
MAX_EPOCHS = 5000
PATIENCE = 100
USE_SMOOTH_LOAD = True        # True = rolling mean target
USE_CLUSTER_AS_FEATURE = True # True = 1 global LSTM; False = per-cluster LSTMs

In [12]:
df = pd.read_csv('/content/OBD (1).csv')
features = {
    'rpm': 'Engine RPM(rpm)',
    'load': 'Engine Load(%)',
    'maf': 'Mass Air Flow Rate(g/s)',
    'throttle': 'Throttle Position(Manifold)(%)'
}
df_features = df[list(features.values())].copy()
print(f"Dataset: {df_features.shape} | Features: {list(features.values())}")

Dataset: (1356, 4) | Features: ['Engine RPM(rpm)', 'Engine Load(%)', 'Mass Air Flow Rate(g/s)', 'Throttle Position(Manifold)(%)']


In [13]:
# Optional: smooth Engine Load to reduce noise
if USE_SMOOTH_LOAD:
    df['Engine Load Smoothed'] = (
        df[features['load']].rolling(5, center=True).mean().bfill().ffill()
    )
    load_source_col = 'Engine Load Smoothed'
    print("‚úÖ Using smoothed Engine Load as target.")
else:
    load_source_col = features['load']
    print("‚úÖ Using raw Engine Load as target.")


‚úÖ Using smoothed Engine Load as target.


In [14]:
scaler_minmax = MinMaxScaler()
scaler_robust = RobustScaler()

df_features[[features['rpm'], features['maf']]] = scaler_minmax.fit_transform(
    df_features[[features['rpm'], features['maf']]]
)

# load from chosen column, then scaled
df_features[features['load']] = scaler_minmax.fit_transform(
    df[[load_source_col]]
)

df_features[features['throttle']] = scaler_robust.fit_transform(
    df_features[[features['throttle']]]
)

print("‚úÖ Scaling complete.")

‚úÖ Scaling complete.


In [15]:
X_cluster = df_features[list(features.values())].values
kmeans_final = KMeans(n_clusters=2, random_state=42, n_init=10)
clusters = kmeans_final.fit_predict(X_cluster)
df_features['cluster'] = clusters

# cluster sizes for diagnostics
unique, counts = np.unique(clusters, return_counts=True)
print("Cluster sizes:", dict(zip(unique, counts)))


Cluster sizes: {np.int32(0): np.int64(471), np.int32(1): np.int64(885)}


In [16]:
def create_sequences(data, seq_length=30, target_idx=1):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        xs.append(data[i:i+seq_length])
        ys.append(data[i+seq_length, target_idx])
    return np.array(xs), np.array(ys)

# base feature matrix (rpm, load, maf, throttle)
base_seq_data = df_features[list(features.values())].values

# option: add cluster id as extra feature
if USE_CLUSTER_AS_FEATURE:
    seq_data_full = np.concatenate([base_seq_data, clusters.reshape(-1, 1)], axis=1)
    target_idx = list(features.values()).index(features['load'])  # still column 1
    print("‚úÖ Using cluster id as additional feature (single global LSTM).")
else:
    seq_data_full = base_seq_data
    target_idx = 1  # load column in the 4-feature matrix
    print("‚úÖ Will train separate LSTMs per cluster.")

X_all, y_all = create_sequences(seq_data_full, seq_length=SEQ_LEN, target_idx=target_idx)
sequence_clusters = clusters[SEQ_LEN:]

print(f"‚úÖ Sequences: X_all={X_all.shape}, y_all={y_all.shape}")

# global train/test split (80/20, time-ordered)
split_global = int(TRAIN_RATIO * len(X_all))
X_train_global, X_test_global = X_all[:split_global], X_all[split_global:]
y_train_global, y_test_global = y_all[:split_global], y_all[split_global:]
clusters_train_global = sequence_clusters[:split_global]
clusters_test_global = sequence_clusters[split_global:]


‚úÖ Using cluster id as additional feature (single global LSTM).
‚úÖ Sequences: X_all=(1326, 30, 5), y_all=(1326,)


In [17]:
def build_lstm(input_shape):
    model = Sequential([
        LSTM(76, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(76),
        Dropout(0.5),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.0002), loss='mse')
    return model

def build_gru(input_shape):
    model = Sequential([
        GRU(76, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        GRU(76),
        Dropout(0.5),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.0002), loss='mse')
    return model

def build_rnn(input_shape):
    model = Sequential([
        SimpleRNN(76, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        SimpleRNN(76),
        Dropout(0.5),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.0002), loss='mse')
    return model

def train_and_predict(build_fn, X_train, y_train, X_test):
    model = build_fn((X_train.shape[1], X_train.shape[2]))
    es = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=PATIENCE,
        restore_best_weights=True
    )
    model.fit(
        X_train, y_train,
        epochs=MAX_EPOCHS,
        batch_size=32,
        validation_split=0.1,
        callbacks=[es],
        verbose=0
    )
    y_pred = model.predict(X_test, verbose=0).ravel()
    return model, y_pred

In [19]:
print("\n‚è≥ Training LSTM-only...")
lstm_model, y_pred_lstm = train_and_predict(
    build_lstm, X_train_global, y_train_global, X_test_global
)

print("‚è≥ Training GRU...")
gru_model, y_pred_gru = train_and_predict(
    build_gru, X_train_global, y_train_global, X_test_global
)

print("‚è≥ Training RNN...")
rnn_model, y_pred_rnn = train_and_predict(
    build_rnn, X_train_global, y_train_global, X_test_global
)


‚è≥ Training LSTM-only...
‚è≥ Training GRU...
‚è≥ Training RNN...


In [20]:
if USE_CLUSTER_AS_FEATURE:
    # hybrid = single global LSTM, already trained above
    # use y_pred_lstm as "Proposed" predictions
    y_pred_hybrid = y_pred_lstm.copy()
    print("‚úÖ Hybrid = global LSTM with cluster feature.")
else:
    # true per-cluster hybrid
    cluster_models = {}
    for c in np.unique(sequence_clusters):
        idx_c = np.where(sequence_clusters == c)[0]
        X_c, y_c = X_all[idx_c], y_all[idx_c]
        if len(X_c) < 100:
            print(f"‚ö†Ô∏è Cluster {c} has only {len(X_c)} sequences; model may be weak.")
        split_c = int(TRAIN_RATIO * len(X_c))
        X_train_c, X_test_c = X_c[:split_c], X_c[split_c:]
        y_train_c, y_test_c = y_c[:split_c], y_c[split_c:]

        print(f"\n‚è≥ Training LSTM for cluster {c} (samples: {len(X_c)})...")
        model_c, _ = train_and_predict(
            build_lstm, X_train_c, y_train_c, X_test_c
        )
        cluster_models[c] = model_c

    y_pred_hybrid = []
    for seq, c in zip(X_test_global, clusters_test_global):
        model_c = cluster_models[c]
        y_hat = model_c.predict(seq[np.newaxis, ...], verbose=0)[0, 0]
        y_pred_hybrid.append(y_hat)
    y_pred_hybrid = np.array(y_pred_hybrid)
    print("‚úÖ Hybrid = per-cluster LSTMs.")


‚úÖ Hybrid = global LSTM with cluster feature.


In [21]:
def metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2

mae_l, mse_l, rmse_l, r2_l = metrics(y_test_global, y_pred_lstm)
mae_g, mse_g, rmse_g, r2_g = metrics(y_test_global, y_pred_gru)
mae_r, mse_r, rmse_r, r2_r = metrics(y_test_global, y_pred_rnn)
mae_h, mse_h, rmse_h, r2_h = metrics(y_test_global, y_pred_hybrid)

comparison = pd.DataFrame({
    'Model': ['LSTM only', 'GRU', 'RNN', 'Proposed (Hybrid)'],
    'MAE': [mae_l, mae_g, mae_r, mae_h],
    'MSE': [mse_l, mse_g, mse_r, mse_h],
    'RMSE (%)': [rmse_l * 100, rmse_g * 100, rmse_r * 100, rmse_h * 100],
    'R2 (%)': [r2_l * 100, r2_g * 100, r2_r * 100, r2_h * 100]
}).round(4)

print("\nüìä Model performance comparison (Engine Load target):")
print(comparison)


üìä Model performance comparison (Engine Load target):
               Model     MAE     MSE  RMSE (%)   R2 (%)
0          LSTM only  0.0442  0.0030    5.5107  94.1532
1                GRU  0.0402  0.0025    5.0288  95.1311
2                RNN  0.0404  0.0027    5.2207  94.7523
3  Proposed (Hybrid)  0.0442  0.0030    5.5107  94.1532


In [8]:
scaler_minmax = MinMaxScaler()
scaler_robust = RobustScaler()

df_features[[features['rpm'], features['maf']]] = scaler_minmax.fit_transform(
    df_features[[features['rpm'], features['maf']]]
)

In [10]:
# load from chosen column, then scaled
df_features[features['load']] = scaler_minmax.fit_transform(
    df[[load_source_col]]
)

df_features[features['throttle']] = scaler_robust.fit_transform(
    df_features[[features['throttle']]]
)

print("‚úÖ Scaling complete.")

# ============================================================
# 3. K-MEANS CLUSTERING (K=2)
# ============================================================
X_cluster = df_features[list(features.values())].values
kmeans_final = KMeans(n_clusters=2, random_state=42, n_init=10)
clusters = kmeans_final.fit_predict(X_cluster)
df_features['cluster'] = clusters

# cluster sizes for diagnostics
unique, counts = np.unique(clusters, return_counts=True)
print("Cluster sizes:", dict(zip(unique, counts)))

NameError: name 'load_source_col' is not defined