In [None]:
import os
import joblib
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# =========================================================
# PATHS
# =========================================================
# CSV file that main() reads as the evaluation data set.
TEST_DATA_PATH = r"D:\RideWise-Predicting-Bike-sharing-Demand\Data\day(new).csv"
# Directory scanned for saved model ".pkl" files; it also holds the scaler.
SAVE_MODEL_DIR = r"D:\RideWise-Predicting-Bike-sharing-Demand\Modeling\Saved_Models_files"
# The scaler file sits next to the models and is excluded from model loading.
SCALER_PATH = os.path.join(SAVE_MODEL_DIR, "scaler.pkl")


# =========================================================
# LOAD & PREPROCESS TEST DATA
# =========================================================
def load_and_preprocess_test_data(path):
    """Read the test CSV at ``path`` and split it into features and target.

    The columns 'instant', 'casual', 'registered', 'dteday' and 'atemp' are
    removed, a binary 'weekend' column is derived from 'weekday' (1 when the
    value is 0 or 6, otherwise 0), and 'cnt' is separated out as the target.

    Returns:
        A ``(X, y)`` tuple: the feature DataFrame without 'cnt', and the
        'cnt' Series.
    """
    unused_columns = ['instant', 'casual', 'registered', 'dteday', 'atemp']
    frame = pd.read_csv(path).drop(columns=unused_columns)

    # Flag rows whose weekday code is 0 or 6.
    frame['weekend'] = frame['weekday'].isin([0, 6]).astype(int)

    features = frame.drop(columns=['cnt'])
    target = frame['cnt']
    return features, target


# =========================================================
# LOAD SCALER
# =========================================================
def load_scaler(path):
    """Deserialize and return the scaler object stored at ``path``.

    NOTE(review): joblib files are pickle-based, so this should only be
    pointed at files from a trusted location.
    """
    scaler = joblib.load(path)
    return scaler



# =========================================================
# LOAD ALL MODELS
# =========================================================
def load_all_models(directory):
    """Load every saved model stored directly inside ``directory``.

    A file is treated as a model when its name ends with ".pkl" and it is
    not "scaler.pkl". Entries that are not regular files are skipped.

    Args:
        directory: Path of the folder containing the saved ".pkl" files.

    Returns:
        dict mapping each model name (the file name with its trailing
        ".pkl" removed) to the object loaded by joblib, in file-name order.
    """
    suffix = ".pkl"
    models = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the dict
    # order (and anything derived from it, such as tie-breaking between
    # models and plot order) reproducible across runs and machines.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(suffix) or file_name == "scaler.pkl":
            continue
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            # A directory whose name ends in ".pkl" is not a model file.
            continue
        # Strip only the trailing suffix; str.replace would also remove any
        # ".pkl" occurring in the middle of the name.
        model_name = file_name[:-len(suffix)]
        models[model_name] = joblib.load(file_path)
    return models


# =========================================================
# PREDICT (Reverse Transform if Needed)
# =========================================================
def predict_with_postprocessing(model, X_scaled):
    """Predict with ``model`` and map the output back to whole counts.

    The model was trained on sqrt(cnt), so its raw predictions are squared
    and then rounded to zero decimal places.
    """
    sqrt_scale_pred = model.predict(X_scaled)
    count_scale_pred = sqrt_scale_pred ** 2
    return np.round(count_scale_pred, decimals=0)


# =========================================================
# EVALUATE MODEL PERFORMANCE
# =========================================================
def evaluate_model(y_true, y_pred):
    """Score predictions against the actual values.

    Returns:
        dict with the keys "MAE", "RMSE" and "R2".
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {"MAE": mae, "RMSE": rmse, "R2": r2}


# =========================================================
# PLOT FUNCTIONS
# =========================================================
def plot_actual_vs_predicted_scatter(y_true, y_pred, model_name):
    """Show a scatter plot of predicted against actual counts for one model.

    A dashed red diagonal running from the origin to the largest actual
    value is drawn as the "prediction equals actual" reference.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(y_true, y_pred, alpha=0.6)

    # Reference line y = x, spanning 0 .. max(actual).
    diagonal = [0, max(y_true)]
    plt.plot(diagonal, diagonal, color='red', linestyle='--')

    plt.title(f"Actual vs Predicted: {model_name}")
    plt.xlabel("Actual Count")
    plt.ylabel("Predicted Count")
    plt.tight_layout()
    plt.show()


def plot_distribution_histogram(y_true, y_pred, model_name):
    """Show overlaid 30-bin histograms of actual and predicted counts."""
    plt.figure(figsize=(8, 6))

    # Actual values are drawn first, then the model's predictions on top.
    series_to_plot = (
        (y_true, 'Actual Count'),
        (y_pred, f'{model_name} Predictions'),
    )
    for values, legend_label in series_to_plot:
        plt.hist(values, bins=30, alpha=0.6, label=legend_label)

    plt.title(f"Distribution: Actual vs {model_name} Predictions")
    plt.xlabel("Bike Count")
    plt.ylabel("Frequency")
    plt.legend()
    plt.tight_layout()
    plt.show()


def plot_overlay_all_models(y_true, all_preds):
    """Show one figure overlaying every model's prediction histogram.

    ``all_preds`` maps a model name to its predictions. The actual counts
    are drawn last, in black, over the lighter per-model histograms.
    """
    plt.figure(figsize=(8, 6))

    for model_name in all_preds:
        plt.hist(
            all_preds[model_name],
            bins=30,
            alpha=0.3,
            label=f"{model_name} Pred",
        )
    plt.hist(y_true, bins=30, alpha=0.7, color="black", label="Actual Count")

    plt.title("Overlay: Actual vs Predictions (All Models)")
    plt.xlabel("Bike Count")
    plt.ylabel("Frequency")
    plt.legend()
    plt.tight_layout()
    plt.show()


# =========================================================
# MAIN EXECUTION WORKFLOW
# =========================================================
def main():
    """Evaluate every saved model on the test data and report the best one.

    Workflow:
      1. Load and preprocess the CSV at TEST_DATA_PATH.
      2. Scale the features with the scaler stored at SCALER_PATH.
      3. Load all models from SAVE_MODEL_DIR, predict, and score each one.
      4. Select the model with the lowest MAE, save it to
         "Accurate_model.pkl" in the current working directory, and show
         the diagnostic plots.

    Raises:
        ValueError: if SAVE_MODEL_DIR contains no model files.
    """
    print("\nüìå Loading and preprocessing test data...")
    X_test, y_test = load_and_preprocess_test_data(TEST_DATA_PATH)

    print("üìå Loading scaler...")
    scaler = load_scaler(SCALER_PATH)
    X_test_scaled = scaler.transform(X_test)

    print("üìå Loading saved models...")
    models = load_all_models(SAVE_MODEL_DIR)
    if not models:
        # Without this guard the failure surfaces later as an opaque
        # "min() arg is an empty sequence".
        raise ValueError(f"No saved model files found in {SAVE_MODEL_DIR}")

    all_predictions = {}
    model_errors = {}

    print("\nüìå Generating predictions and evaluating models...\n")
    for name, model in models.items():
        y_pred = predict_with_postprocessing(model, X_test_scaled)
        all_predictions[name] = y_pred
        model_errors[name] = evaluate_model(y_test, y_pred)

        print(f"Model: {name}")
        print(model_errors[name])
        print("-" * 40)

    # Identify best model: the one with the smallest mean absolute error.
    accurate_model = min(model_errors, key=lambda x: model_errors[x]["MAE"])
    print(f"\nüèÜ Best Model: {accurate_model}")
    accurate_pred = all_predictions[accurate_model]
    # Persist the fitted model object itself; `accurate_model` is only its
    # name (a dict key), so dumping it would save a plain string.
    joblib.dump(models[accurate_model], "Accurate_model.pkl")

    # PLOTS
    plot_actual_vs_predicted_scatter(y_test, accurate_pred, accurate_model)
    plot_distribution_histogram(y_test, accurate_pred, accurate_model)
    plot_overlay_all_models(y_test, all_predictions)


# =========================================================
# RUN SCRIPT
# =========================================================
if __name__ == "__main__":
    # Run the evaluation workflow only when executed directly, not on import.
    main()



üìå Loading and preprocessing test data...
üìå Loading scaler...
üìå Loading saved models...

üìå Generating predictions and evaluating models...





TypeError: got an unexpected keyword argument 'squared'