In [5]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.metrics import mean_squared_error

# -----------------------------------------------------
# PATHS
# -----------------------------------------------------
# Input CSVs: the raw test set and the already-preprocessed training set.
# NOTE(review): these are machine-specific absolute Windows paths, so the
# notebook only runs where this exact directory layout exists.
TEST_DATA_PATH = r"C:\Users\User\OneDrive\Desktop\INFOSYS\Data\dayTEST.csv"
TRAIN_PREPROCESSED_PATH = r"C:\Users\User\OneDrive\Desktop\INFOSYS\Data\preprocessed_day.csv"

# Model artifacts are resolved relative to the current working directory.
SAVED_MODELS_DIR = "./saved_models"
BEST_MODEL_DIR = SAVED_MODELS_DIR + "/bestModel"
SCALER_PATH = SAVED_MODELS_DIR + "/scaler.pkl"

# Make sure the output folder (and its parent) exists; no error if it already does.
os.makedirs(BEST_MODEL_DIR, exist_ok=True)



In [6]:

# -----------------------------------------------------
# 1. LOAD TEST DATA
# -----------------------------------------------------
df = pd.read_csv(TEST_DATA_PATH)

# Keep the ground-truth target aside for scoring; "cnt" is removed from the
# feature frame when it is reduced to the training feature columns below.
actual_cnt = df["cnt"].copy()

# Drop columns not used in training (errors="ignore" tolerates files that
# already lack some of them).
df = df.drop(["instant", "dteday", "casual", "registered"], axis=1, errors="ignore")

# One-hot encode the categorical columns.
# drop_first is deliberately NOT used here: it drops the lowest category
# present in *this* file, which is not necessarily the baseline category of
# the training data (e.g. a test file that covers only part of the year).
# All categories are encoded instead, and the column-matching step that
# follows keeps exactly the dummy columns the training data has.
df = pd.get_dummies(
    df,
    columns=["season", "mnth", "weekday", "weathersit"],
)

# -----------------------------------------------------
# 2. MATCH TRAINING FEATURE COLUMNS EXACTLY
# -----------------------------------------------------
train_df = pd.read_csv(TRAIN_PREPROCESSED_PATH)

# Every training column except the target "cnt" is a model input.
required_cols = train_df.drop(columns="cnt").columns

# Conform the test frame to the training layout in one step: columns the
# test frame lacks are created and filled with 0, columns the training data
# does not have are discarded, and the training column order is applied.
df = df.reindex(columns=required_cols, fill_value=0)


# -----------------------------------------------------
# 3. APPLY SCALING
# -----------------------------------------------------
# Load the scaler stored at SCALER_PATH. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts that come from a trusted source.
with open(SCALER_PATH, "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Columns passed through the scaler.
# NOTE(review): assumes the scaler was fitted on exactly these columns in
# this order; confirm against the training notebook.
num_cols = ["temp", "atemp", "hum", "windspeed"]
df[num_cols] = scaler.transform(df[num_cols])


# -----------------------------------------------------
# 4. LOAD MODELS
# -----------------------------------------------------
# Display name -> pickle file of each candidate model.
model_paths = {
    "RandomForest": f"{SAVED_MODELS_DIR}/RandomForest.pkl",
    "DecisionTree": f"{SAVED_MODELS_DIR}/DecisionTree.pkl",
    "GradientBoost": f"{SAVED_MODELS_DIR}/GradientBoost.pkl"
}


def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    NOTE: pickle.load can execute arbitrary code embedded in the file; only
    load artifacts that come from a trusted source.
    """
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Each file handle is closed as soon as its model is read, rather than being
# left open until garbage collection.
models = {name: _load_pickle(path) for name, path in model_paths.items()}


# -----------------------------------------------------
# 5. PREDICT WITH ALL MODELS
# -----------------------------------------------------
# Both dictionaries are keyed by model name: the raw predictions on the
# prepared test frame, and the RMSE of those predictions against actual_cnt.
pred_results, rmse_scores = {}, {}

for model_name, estimator in models.items():
    model_preds = estimator.predict(df)
    # RMSE = square root of the mean squared error.
    model_rmse = np.sqrt(mean_squared_error(actual_cnt, model_preds))

    pred_results[model_name] = model_preds
    rmse_scores[model_name] = model_rmse

    print(f"{model_name} → RMSE: {model_rmse:.4f}")


# -----------------------------------------------------
# 6. SELECT BEST MODEL
# -----------------------------------------------------
# The winner is the entry with the smallest RMSE; on a tie the model that
# appears first in rmse_scores is kept.
best_model_name = min(rmse_scores.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]

print(f"\n Best Model = {best_model_name}")


# -----------------------------------------------------
# 7. SAVE BEST MODEL
# -----------------------------------------------------
best_model_path = f"{BEST_MODEL_DIR}/{best_model_name}.pkl"

# Write through a context manager so the file is flushed and closed before
# the path is reported as saved.
with open(best_model_path, "wb") as best_model_file:
    pickle.dump(best_model, best_model_file)

print(f"Best Model Saved at: {best_model_path}")

# -----------------------------------------------------
# 8. SAVE & SHOW PREDICTIONS
# -----------------------------------------------------
# Actual target next to the winning model's prediction, one row per test row.
results = pd.DataFrame({
    "Actual_cnt": actual_cnt,
    "Predicted_cnt": pred_results[best_model_name]
})

print("\n Showing first 20 prediction rows:\n")
print(results.head(20))

# Built from BEST_MODEL_DIR so the CSV always lands next to the saved model.
predictions_path = f"{BEST_MODEL_DIR}/Predictions.csv"
results.to_csv(predictions_path, index=False)

# Random spot-check of the results. The sample size is capped at the number
# of rows because DataFrame.sample raises when asked for more rows than
# exist, and the fixed seed makes the displayed rows reproducible on re-run.
sample_size = min(10, len(results))
print(f"\n Sample Predictions ({sample_size} random rows):\n")
print(results.sample(n=sample_size, random_state=42).sort_index())



RandomForest → RMSE: 1731.5625
DecisionTree → RMSE: 1813.6385
GradientBoost → RMSE: 3020.0204

 Best Model = RandomForest
Best Model Saved at: ./saved_models/bestModel/RandomForest.pkl

 Showing first 20 prediction rows:

    Actual_cnt  Predicted_cnt
0          985        2247.61
1          801        2322.08
2         1349        2514.96
3         1562        2532.45
4         1600        2533.60
5         1606        2533.66
6         1510        2494.39
7          959        2247.61
8          822        2428.40
9         1321        2514.96
10        1263        2473.31
11        1162        2533.60
12        1406        2533.66
13        1421        2543.16
14        1248        2247.61
15        1204        2428.40
16        1000        2290.94
17         683        2473.31
18        1650        2476.08
19        1927        2483.87

 Sample Predictions (first 10 rows):

     Actual_cnt  Predicted_cnt
26          431        2533.66
27         1167        2494.39
38         1530 