In [87]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Conv1D, Dense, Dropout, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings("ignore")



In [88]:
# Load the raw CSV, convert the Timestamp column to datetimes, and order the
# rows chronologically (later cells split train/test by row position).
# NOTE(review): this is an absolute path on one machine; consider a configurable
# data directory so the notebook runs elsewhere.
df = pd.read_csv("C:\\Users\\HP\\Downloads\\archive (3)\\smart_mobility_dataset.csv")
df = df.assign(Timestamp=pd.to_datetime(df["Timestamp"])).sort_values("Timestamp")


In [89]:
# Preview the first five rows as a quick sanity check of the loaded frame.
df.head()

Unnamed: 0,Timestamp,Latitude,Longitude,Vehicle_Count,Traffic_Speed_kmh,Road_Occupancy_%,Traffic_Light_State,Weather_Condition,Accident_Report,Sentiment_Score,Ride_Sharing_Demand,Parking_Availability,Emission_Levels_g_km,Energy_Consumption_L_h,Traffic_Condition
0,2024-03-01 00:00:00,40.842275,-73.703149,205,49.893435,82.65278,Yellow,Clear,0,-0.609199,2,45,450.760055,19.574337,High
1,2024-03-01 00:05:00,40.831119,-73.987354,202,22.383965,45.829298,Green,Clear,0,0.965442,16,1,321.800341,5.385554,High
2,2024-03-01 00:10:00,40.819549,-73.732462,252,46.889699,82.772465,Green,Rain,0,0.28966,16,49,231.152655,10.277477,High
3,2024-03-01 00:15:00,40.725849,-73.980134,37,5.730536,37.695567,Red,Fog,0,-0.271965,66,10,410.384292,29.243279,High
4,2024-03-01 00:20:00,40.813265,-73.961631,64,61.348034,22.313358,Red,Snow,0,-0.797606,3,5,364.466342,16.801459,Low


In [90]:
# Count missing values per column.
df.isna().sum()

Timestamp                 0
Latitude                  0
Longitude                 0
Vehicle_Count             0
Traffic_Speed_kmh         0
Road_Occupancy_%          0
Traffic_Light_State       0
Weather_Condition         0
Accident_Report           0
Sentiment_Score           0
Ride_Sharing_Demand       0
Parking_Availability      0
Emission_Levels_g_km      0
Energy_Consumption_L_h    0
Traffic_Condition         0
dtype: int64

In [91]:
# Dataset dimensions as (rows, columns).
df.shape

(5000, 15)

In [92]:
# Per-column dtype and non-null count summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Timestamp               5000 non-null   datetime64[ns]
 1   Latitude                5000 non-null   float64       
 2   Longitude               5000 non-null   float64       
 3   Vehicle_Count           5000 non-null   int64         
 4   Traffic_Speed_kmh       5000 non-null   float64       
 5   Road_Occupancy_%        5000 non-null   float64       
 6   Traffic_Light_State     5000 non-null   object        
 7   Weather_Condition       5000 non-null   object        
 8   Accident_Report         5000 non-null   int64         
 9   Sentiment_Score         5000 non-null   float64       
 10  Ride_Sharing_Demand     5000 non-null   int64         
 11  Parking_Availability    5000 non-null   int64         
 12  Emission_Levels_g_km    5000 non-null   float64 

In [93]:
# df.describe()

In [94]:
# Columns removed before modelling. The name suggests they are treated as
# leaking the target -- presumably Traffic_Condition is derived from speed;
# confirm against the dataset description.
leakage_cols = ["Traffic_Speed_kmh"]

df = df.drop(leakage_cols, axis=1)

In [95]:
# Calendar features derived from the timestamp.
df["Hour"] = df["Timestamp"].dt.hour
df["Day"] = df["Timestamp"].dt.day
df["Weekday"] = df["Timestamp"].dt.dayofweek  # Monday=0 ... Sunday=6

# Binary flags: Saturday/Sunday, and the 07-09 / 17-19 hours.
df["Is_Weekend"] = df["Weekday"].ge(5).astype(int)
df["Is_Rush_Hour"] = df["Hour"].isin([7, 8, 9, 17, 18, 19]).astype(int)

In [96]:
# One-hot encode the two categorical columns; drop_first removes one level per
# column so the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=["Traffic_Light_State", "Weather_Condition"], drop_first=True)

In [97]:
target_col = "Traffic_Condition"

# Encode class names as integers. Series.unique() returns values in order of
# appearance, so the names are sorted first: the codes then depend only on the
# set of class names, not on which row happens to come first in the data.
class_mapping = {name: idx for idx, name in enumerate(sorted(df[target_col].unique()))}
df["Target_Int"] = df[target_col].map(class_mapping)

# Every column except the timestamp and the two target representations is a feature.
feature_cols = [c for c in df.columns if c not in ["Timestamp", target_col, "Target_Int"]]

X = df[feature_cols]
y = df["Target_Int"]

# Positional 80/20 split: df was sorted by Timestamp earlier, so the test set
# is the most recent 20% of rows (no shuffling).
split_idx = int(0.8 * len(df))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")


Train size: (4000, 20), Test size: (1000, 20)


In [98]:
# Maps model name -> (accuracy, macro F1); filled in by eval_and_store.
results = {}

def eval_and_store(name, y_true, y_pred):
    """Score one model's predictions, record the scores, and print a report.

    Parameters
    ----------
    name : str
        Key under which the scores are stored in the module-level ``results``.
    y_true, y_pred : array-like of int
        True and predicted class codes (the integer values of ``class_mapping``).

    Side effects
    ------------
    Sets ``results[name] = (accuracy, macro_f1)`` and prints the accuracy,
    macro F1 and the per-class classification report.
    """
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="macro")
    results[name] = (acc, f1)
    print(f"\n{name}")
    print("Accuracy:", acc)
    print("F1 (macro):", f1)
    # Pass labels explicitly, paired with their names. Without `labels`,
    # classification_report infers them from the data and raises ValueError if
    # a class is missing from both y_true and y_pred (name count mismatch).
    print(classification_report(
        y_true,
        y_pred,
        labels=list(class_mapping.values()),
        target_names=list(class_mapping.keys()),
    ))

# 4.1 Logistic Regression
# `multi_class` is deliberately omitted: "auto" is already the default, and the
# parameter is deprecated in recent scikit-learn releases. Because this notebook
# silences warnings, the deprecation would otherwise surface only as a hard
# error once the argument is removed.
log_clf = LogisticRegression(max_iter=2000)
log_clf.fit(X_train, y_train)
eval_and_store("LogisticRegression", y_test, log_clf.predict(X_test))

# 4.2 Decision Tree
# Depth-limited single tree with a fixed seed; fit() returns the estimator itself.
dt_clf = DecisionTreeClassifier(random_state=42, max_depth=12).fit(X_train, y_train)
eval_and_store("DecisionTree", y_test, dt_clf.predict(X_test))

# 4.3 Random Forest
# 400 depth-limited trees with minimum split/leaf sizes, trained on all cores
# with a fixed seed; fit() returns the estimator itself.
rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    max_features="sqrt",
    min_samples_leaf=10,
    min_samples_split=20,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
eval_and_store("RandomForest", y_test, rf_clf.predict(X_test))

# 4.4 Gradient Boosting
# random_state is fixed like the other tree models: this estimator is the one
# later pickled as the best model, so a re-run should reproduce it.
gb_clf = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    min_samples_split=40,
    min_samples_leaf=20,
    max_leaf_nodes=8,
    random_state=42,
)

gb_clf.fit(X_train, y_train)
eval_and_store("GradientBoosting", y_test, gb_clf.predict(X_test))



LogisticRegression
Accuracy: 0.698
F1 (macro): 0.5465744705265356
              precision    recall  f1-score   support

        High       0.77      0.90      0.83       631
         Low       0.49      0.33      0.39        70
      Medium       0.50      0.36      0.42       299

    accuracy                           0.70      1000
   macro avg       0.59      0.53      0.55      1000
weighted avg       0.67      0.70      0.68      1000


DecisionTree
Accuracy: 0.844
F1 (macro): 0.7951356434488347
              precision    recall  f1-score   support

        High       0.92      0.87      0.90       631
         Low       0.60      0.89      0.72        70
      Medium       0.77      0.77      0.77       299

    accuracy                           0.84      1000
   macro avg       0.77      0.84      0.80      1000
weighted avg       0.85      0.84      0.85      1000


RandomForest
Accuracy: 0.851
F1 (macro): 0.6603165757188746
              precision    recall  f1-score   sup

In [99]:
# Learn the min-max scaling from the training rows only, then apply the same
# fitted scaler to both splits so no test statistics enter the fit.
scaler_dl = MinMaxScaler().fit(X_train)
X_train_scaled = scaler_dl.transform(X_train)
X_test_scaled = scaler_dl.transform(X_test)

SEQ_LEN = 10  # number of consecutive rows in each input window

def make_sequences(X_arr, y_arr, seq_len):
    """Turn row-wise features and labels into sliding windows.

    Window ``i`` holds rows ``i .. i + seq_len - 1`` of ``X_arr`` and is paired
    with the label at position ``i + seq_len``. The label therefore belongs to
    the row immediately *after* the window, whose own features are not part of
    the window.

    NOTE(review): this is a next-step forecasting setup. If the sequence models
    are meant to be compared with models that classify each row from its own
    features, confirm this alignment is intended.

    Parameters
    ----------
    X_arr : array-like, shape (n_rows, ...)
        Features, one row per time step.
    y_arr : array-like, shape (n_rows,)
        Labels, read by position (pandas Series, NumPy array or list).
    seq_len : int
        Number of consecutive rows per window.

    Returns
    -------
    (Xs, ys) : tuple of numpy.ndarray
        ``Xs`` has shape ``(n_rows - seq_len, seq_len, ...)`` and ``ys`` has
        shape ``(n_rows - seq_len,)``. When the input has ``seq_len`` rows or
        fewer, both arrays are empty but keep the correct rank, so downstream
        code that reads ``Xs.shape[2]`` still works.

    Raises
    ------
    IndexError
        If ``y_arr`` has fewer entries than ``X_arr`` has rows.
    """
    X_arr = np.asarray(X_arr)
    y_vals = np.asarray(y_arr)  # positional access, independent of any pandas index
    n_windows = len(X_arr) - seq_len

    if n_windows <= 0:
        # Not enough rows for even one (window, next-row label) pair.
        empty_X = np.empty((0, seq_len) + X_arr.shape[1:], dtype=X_arr.dtype)
        empty_y = np.empty((0,), dtype=y_vals.dtype)
        return empty_X, empty_y

    if len(y_vals) < len(X_arr):
        raise IndexError(
            f"y_arr has {len(y_vals)} labels but X_arr has {len(X_arr)} rows"
        )

    Xs = np.stack([X_arr[start:start + seq_len] for start in range(n_windows)])
    # Labels for positions seq_len .. n_rows - 1, taken as a single slice.
    ys = y_vals[seq_len:seq_len + n_windows]
    return Xs, ys

# Window each split separately so that no window straddles the train/test boundary.
X_seq_train, y_seq_train = make_sequences(X_train_scaled, y_train, seq_len=SEQ_LEN)
X_seq_test, y_seq_test = make_sequences(X_test_scaled, y_test, seq_len=SEQ_LEN)

# Model dimensions: output units and per-time-step input width.
n_classes = len(class_mapping)
n_features = X_seq_train.shape[2]

print("\nSequence shapes for DL:")
print("X_seq_train:", X_seq_train.shape, "y_seq_train:", y_seq_train.shape)
print("X_seq_test :", X_seq_test.shape, "y_seq_test :", y_seq_test.shape)

# Stop once validation loss has not improved for 3 epochs and restore the best weights.
early_stop = EarlyStopping(restore_best_weights=True, patience=3, monitor="val_loss")


Sequence shapes for DL:
X_seq_train: (3990, 10, 20) y_seq_train: (3990,)
X_seq_test : (990, 10, 20) y_seq_test : (990,)


In [100]:
# Two stacked LSTM layers followed by a small dense softmax head, applied to
# windows of shape (SEQ_LEN, n_features).
# NOTE(review): in the recorded run this model predicts only the first class
# (recall 1.00 for "High", 0.00 for the others), so its scores match a
# majority-class baseline.
lstm_model = Sequential()
lstm_model.add(LSTM(64, return_sequences=True, input_shape=(SEQ_LEN, n_features)))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(32))
lstm_model.add(Dense(32, activation="relu"))
lstm_model.add(Dense(n_classes, activation="softmax"))

# Integer class codes are used directly, hence the sparse cross-entropy loss.
lstm_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# 10% of the training windows are held out for the early-stopping monitor.
lstm_model.fit(
    X_seq_train,
    y_seq_train,
    epochs=15,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=0,
)

# Predicted class = index of the highest softmax probability.
lstm_pred = lstm_model.predict(X_seq_test).argmax(axis=1)
eval_and_store("LSTM", y_seq_test, lstm_pred)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step

LSTM
Accuracy: 0.6303030303030303
F1 (macro): 0.25774473358116484
              precision    recall  f1-score   support

        High       0.63      1.00      0.77       624
         Low       0.00      0.00      0.00        70
      Medium       0.00      0.00      0.00       296

    accuracy                           0.63       990
   macro avg       0.21      0.33      0.26       990
weighted avg       0.40      0.63      0.49       990



In [101]:
# 1-D CNN: two convolution layers with one max-pool between them, flattened
# into a small dense softmax head, applied to windows of shape (SEQ_LEN, n_features).
# NOTE(review): in the recorded run this model predicts only the first class
# (recall 1.00 for "High", 0.00 for the others), so its scores match a
# majority-class baseline.
cnn_model = Sequential()
cnn_model.add(Conv1D(64, kernel_size=3, activation="relu", input_shape=(SEQ_LEN, n_features)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Conv1D(32, kernel_size=3, activation="relu"))
cnn_model.add(Flatten())
cnn_model.add(Dense(32, activation="relu"))
cnn_model.add(Dense(n_classes, activation="softmax"))

# Integer class codes are used directly, hence the sparse cross-entropy loss.
cnn_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# 10% of the training windows are held out for the early-stopping monitor.
cnn_model.fit(
    X_seq_train,
    y_seq_train,
    epochs=15,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=0,
)

# Predicted class = index of the highest softmax probability.
cnn_pred = cnn_model.predict(X_seq_test).argmax(axis=1)
eval_and_store("CNN_1D", y_seq_test, cnn_pred)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

CNN_1D
Accuracy: 0.6303030303030303
F1 (macro): 0.25774473358116484
              precision    recall  f1-score   support

        High       0.63      1.00      0.77       624
         Low       0.00      0.00      0.00        70
      Medium       0.00      0.00      0.00       296

    accuracy                           0.63       990
   macro avg       0.21      0.33      0.26       990
weighted avg       0.40      0.63      0.49       990



In [102]:
# One line per evaluated model, in the order the models were scored.
print("\n\n=== SUMMARY (Accuracy, F1) ===")
for name, scores in results.items():
    acc, f1 = scores
    print(f"{name:18s} -> Acc: {acc:.4f}, F1: {f1:.4f}")



# The model to persist is chosen by hand (gradient boosting), not selected
# programmatically from `results`.
best_model = gb_clf

# File name -> object to pickle, written to the current working directory.
# NOTE(review): the scaler was fitted for the sequence models; gb_clf was
# trained on the unscaled features, so it does not need dl_scaler.pkl.
artifacts = {
    "traffic_best_model.pkl": best_model,
    "feature_columns.pkl": feature_cols,
    "class_mapping.pkl": class_mapping,
    "dl_scaler.pkl": scaler_dl,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("\nSaved: traffic_best_model.pkl, feature_columns.pkl, class_mapping.pkl, dl_scaler.pkl")



=== SUMMARY (Accuracy, F1) ===
LogisticRegression -> Acc: 0.6980, F1: 0.5466
DecisionTree       -> Acc: 0.8440, F1: 0.7951
RandomForest       -> Acc: 0.8510, F1: 0.6603
GradientBoosting   -> Acc: 0.8670, F1: 0.7968
LSTM               -> Acc: 0.6303, F1: 0.2577
CNN_1D             -> Acc: 0.6303, F1: 0.2577

Saved: traffic_best_model.pkl, feature_columns.pkl, class_mapping.pkl, dl_scaler.pkl


In [103]:
# Show the kernel's working directory; the pickle files above are opened with
# relative names, so this is where they are written.
import os
os.getcwd()


'C:\\Users\\HP'

In [104]:
# List the working directory, presumably to confirm the saved .pkl files exist.
# NOTE(review): the recorded output enumerates the user's whole home folder;
# consider clearing it before sharing the notebook.
import os
os.listdir()


['.arduinoIDE',
 '.bash_history',
 '.cache',
 '.devdock',
 '.docker',
 '.dotnet',
 '.git',
 '.gitconfig',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.keras',
 '.lesshst',
 '.matplotlib',
 '.python_history',
 '.streamlit',
 '.templateengine',
 '.viminfo',
 '.vscode',
 '11aug.ipynb',
 '12_nov_convo.ipynb',
 '2025_Product_Track_Navnidhi_102317298-Copy1.ipynb',
 '2025_Product_Track_Navnidhi_102317298.ipynb',
 'ai.py',
 'aiproject-Copy1.ipynb',
 'aiproject.ipynb',
 'AIprojectoriginal.ipynb',
 'amex_data.csv',
 'api.ipynb',
 'app.py',
 'AppData',
 'Application Data',
 'Assignment 8.ipynb',
 'assignment2 _preprocessing.ipynb',
 'Assignment8 cc.ipynb',
 'Assignment9&10.ipynb',
 'aug11part2.ipynb',
 'best_model.pth',
 'ccpredf.ipynb',
 'class_mapping.pkl',
 'Contacts',
 'Cookies',
 'decision_tree_model.joblib',
 'deeplearning_17nov.ipynb',
 'Desktop',
 'dl_scaler.pkl',
 'Documents',
 'doc_outline_multilang',
 'Downloads',
 'ensemble.joblib',
 'ensemble_model_predictions.ipynb',
 'Favori