In [2]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 1.2 MB/s eta 0:00:47
    --------------------------------------- 0.8/56.8 MB 1.4 MB/s eta 0:00:41
    --------------------------------------- 1.0/56.8 MB 1.4 MB/s eta 0:00:39
    --------------------------------------- 1.3/56.8 MB 1.4 MB/s eta 0:00:40
   - -------------------------------------- 1.6/56.8 MB 1.4 MB/s eta 0:00:39
   - -------------------------------------- 1.6/56.8 MB 1.4 MB/s eta 0:00:39
   - -------------------------------------- 2.1/56.8 MB 1.3 MB/s eta 0:00:43
   - -------------------------------------- 2.4/56.8 MB 1.3 MB/s eta 0:00:44
   - -------------------------------------- 2.4/56.8 MB 1.3 MB/s eta 0:00:44
   - ---------------

In [3]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras import models, layers

# -----------------------------
# CONFIGURATION
# -----------------------------
BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Public Transport Reliability Predictor"
GTFS_DIR = os.path.join(BASE_DIR, "archive", "GTFS")
OUTPUT_DIR = BASE_DIR

FILES = {
    "stop_times": os.path.join(GTFS_DIR, "stop_times.csv"),
    "stops": os.path.join(GTFS_DIR, "stops.csv"),
    "trips": os.path.join(GTFS_DIR, "trips.csv"),
    "agency": os.path.join(GTFS_DIR, "agency.csv"),
    "calendar": os.path.join(GTFS_DIR, "calendar.csv"),
    "fare_attributes": os.path.join(GTFS_DIR, "fare_attributes.csv"),
    "fare_rules": os.path.join(GTFS_DIR, "fare_rules.csv"),
    "routes": os.path.join(GTFS_DIR, "routes.csv"),
}

# -----------------------------
# LOAD DATA
# -----------------------------
print("[INFO] Loading GTFS data...")

dfs = {name: pd.read_csv(path, low_memory=False) for name, path in FILES.items()}
for k, v in dfs.items():
    print(f"[OK] {k} → {v.shape}")

# -----------------------------
# MERGE CORE TABLES
# -----------------------------
print("[INFO] Merging core GTFS tables...")

df = (
    dfs["stop_times"]
    .merge(dfs["trips"], on="trip_id", how="left")
    .merge(dfs["routes"], on="route_id", how="left")
    .merge(dfs["stops"], on="stop_id", how="left")
)

# -----------------------------
# FEATURE ENGINEERING
# -----------------------------
print("[INFO] Performing feature engineering...")

def time_to_minutes(t):
    try:
        h, m, s = map(int, str(t).split(":"))
        return h * 60 + m + s / 60
    except Exception:
        return np.nan

df["arrival_mins"] = df["arrival_time"].apply(time_to_minutes)
df["departure_mins"] = df["departure_time"].apply(time_to_minutes)
df["stop_sequence"] = df["stop_sequence"].fillna(0)

# Delay proxy (difference between departure and arrival)
df["delay_proxy"] = df["departure_mins"] - df["arrival_mins"]
df["delay_proxy"] = df["delay_proxy"].fillna(df["delay_proxy"].mean())

# Encode route_id and stop_id
le_route = LabelEncoder()
le_stop = LabelEncoder()

df["route_enc"] = le_route.fit_transform(df["route_id"].astype(str))
df["stop_enc"] = le_stop.fit_transform(df["stop_id"].astype(str))

# Select features
feature_cols = ["route_enc", "stop_enc", "stop_sequence", "arrival_mins"]
target_col = "delay_proxy"

df_model = df[feature_cols + [target_col]].dropna()

X = df_model[feature_cols].values
y = df_model[target_col].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# -----------------------------
# TRAIN BASELINE XGBoost MODEL
# -----------------------------
print("[INFO] Training XGBoost baseline model...")

xgb_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"[RESULT] XGBoost RMSE: {rmse:.4f}, R²: {r2:.4f}")

# -----------------------------
# TRAIN SIMPLE LSTM MODEL
# -----------------------------
print("[INFO] Training small LSTM model for temporal pattern...")

X_train_seq = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_seq = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

lstm_model = models.Sequential([
    layers.LSTM(32, input_shape=(1, X_train.shape[1])),
    layers.Dense(16, activation="relu"),
    layers.Dense(1)
])

lstm_model.compile(optimizer="adam", loss="mse")
lstm_model.fit(X_train_seq, y_train, epochs=5, batch_size=32, verbose=1)

lstm_preds = lstm_model.predict(X_test_seq).flatten()
lstm_rmse = np.sqrt(mean_squared_error(y_test, lstm_preds))
lstm_r2 = r2_score(y_test, lstm_preds)

print(f"[RESULT] LSTM RMSE: {lstm_rmse:.4f}, R²: {lstm_r2:.4f}")

# -----------------------------
# SAVE ARTIFACTS (.h5, .pkl, .yaml, .json)
# -----------------------------
print("[INFO] Saving all model artifacts...")

h5_path = os.path.join(OUTPUT_DIR, "TransitGuard_model.h5")
pkl_path = os.path.join(OUTPUT_DIR, "TransitGuard_model.pkl")
yaml_path = os.path.join(OUTPUT_DIR, "TransitGuard_config.yaml")
json_path = os.path.join(OUTPUT_DIR, "TransitGuard_results.json")

# Save LSTM
lstm_model.save(h5_path)

# Save XGBoost
joblib.dump(xgb_model, pkl_path)

# Save YAML (config / params)
config = {
    "model": "TransitGuard Hybrid (XGBoost + LSTM)",
    "features": feature_cols,
    "scaler_mean": scaler.mean_.tolist(),
    "scaler_var": scaler.var_.tolist(),
    "sample_size": int(len(df_model)),
}
with open(yaml_path, "w") as f:
    yaml.dump(config, f)

# Save JSON (results)
results = {
    "RMSE_XGBoost": float(rmse),
    "R2_XGBoost": float(r2),
    "RMSE_LSTM": float(lstm_rmse),
    "R2_LSTM": float(lstm_r2),
    "rows_used": int(len(df_model)),
    "columns": feature_cols,
}
with open(json_path, "w") as f:
    json.dump(results, f, indent=4)

print("[INFO] ✅ All artifacts saved successfully in:")
print("   ", OUTPUT_DIR)
print("[FILES]")
print("   ", os.path.basename(h5_path))
print("   ", os.path.basename(pkl_path))
print("   ", os.path.basename(yaml_path))
print("   ", os.path.basename(json_path))



[INFO] Loading GTFS data...
[OK] stop_times → (2250290, 5)
[OK] stops → (4192, 6)
[OK] trips → (50355, 4)
[OK] agency → (1, 7)
[OK] calendar → (1, 10)
[OK] fare_attributes → (955661, 6)
[OK] fare_rules → (969871, 4)
[OK] routes → (1270, 5)
[INFO] Merging core GTFS tables...
[INFO] Performing feature engineering...
[INFO] Training XGBoost baseline model...
[RESULT] XGBoost RMSE: 0.0000, R²: 1.0000
[INFO] Training small LSTM model for temporal pattern...


Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[RESULT] LSTM RMSE: 0.0000, R²: 0.0000
[INFO] Saving all model artifacts...
[INFO] ✅ All artifacts saved successfully in:
    C:\Users\NXTWAVE\Downloads\Public Transport Reliability Predictor
[FILES]
    TransitGuard_model.h5
    TransitGuard_model.pkl
    TransitGuard_config.yaml
    TransitGuard_results.json


  saving_api.save_model(
