In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from codecarbon import EmissionsTracker

# --- Load and filter dataset ---
df = pd.read_csv("Camera_Traffic_Counts2_20250918.csv")

# Keep only 2019
df = df[df["Year"] == 2019].copy()

# Sort chronologically to prevent leakage
df = df.sort_values(["Month", "Day", "Hour"], kind="stable")

# Select relevant columns
BASE_FEATURES = ["Day of Week", "Hour"]
TARGET_COL = "Volume"

X_raw = df[BASE_FEATURES].copy()
y = df[TARGET_COL].astype(float)

print(f"Rows: {len(df):,}")
print("Raw features used:", BASE_FEATURES)

Rows: 14,717,624
Raw features used: ['Day of Week', 'Hour']


In [2]:
# 1. Temporal split: Jan–Oct = train, Nov–Dec = test
train_mask = df["Month"] <= 10
test_mask  = df["Month"] > 10

X_train_raw, X_test_raw = X_raw[train_mask], X_raw[test_mask]
y_train, y_test = y[train_mask], y[test_mask]

print(f"Train size: {len(X_train_raw):,} | Test size: {len(X_test_raw):,}")

# 2. One-hot encode Day of Week
dow_cats = list(range(7))  # 0–6
train_dow = pd.get_dummies(
    pd.Categorical(X_train_raw["Day of Week"], categories=dow_cats),
    prefix="DOW", drop_first=True
)
test_dow = pd.get_dummies(
    pd.Categorical(X_test_raw["Day of Week"], categories=dow_cats),
    prefix="DOW", drop_first=True
)
test_dow = test_dow.reindex(columns=train_dow.columns, fill_value=0)

# Combine Hour + DOW one-hot
X_train = pd.concat([X_train_raw[["Hour"]].reset_index(drop=True), train_dow.reset_index(drop=True)], axis=1)
X_test  = pd.concat([X_test_raw[["Hour"]].reset_index(drop=True), test_dow.reset_index(drop=True)], axis=1)

FEATURES = X_train.columns.tolist()
print("\nFinal feature set:", FEATURES)

# 3. Train Linear Regression
tracker = EmissionsTracker(measure_power_secs=1, save_to_file=False, log_level="error")
tracker.start()

model = LinearRegression().fit(X_train, y_train)

_ = tracker.stop()
train_data = tracker.final_emissions_data

print("\n=== Training Energy ===")
print(f"Energy consumed: {train_data.energy_consumed:.6f} kWh")
print(f"CO₂ emissions:  {train_data.emissions:.6f} kg CO₂eq")

# 4. Evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # <-- compatible for all sklearn versions
r2 = r2_score(y_test, y_pred)

print("\n=== Test Results ===")
print(f"MAE:  {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²:   {r2:.4f}")

# 5. Coefficients
coef = pd.Series(model.coef_, index=FEATURES)
print("\n=== Linear Regression Coefficients ===")
print(coef)
print(f"Intercept: {model.intercept_:.4f}")

Train size: 11,706,114 | Test size: 3,011,510





Final feature set: ['Hour', 'DOW_1', 'DOW_2', 'DOW_3', 'DOW_4', 'DOW_5', 'DOW_6']

=== Training Energy ===
Energy consumed: 0.000119 kWh
CO₂ emissions:  0.000056 kg CO₂eq

=== Test Results ===
MAE:  30.31
RMSE: 59.12
R²:   0.0013

=== Linear Regression Coefficients ===
Hour     0.474548
DOW_1    5.216859
DOW_2    5.951562
DOW_3    6.090246
DOW_4    6.136225
DOW_5    6.656329
DOW_6    2.313294
dtype: float64
Intercept: 15.2421


In [3]:
import time

# 3. Train Linear Regression (with training time)
tracker = EmissionsTracker(measure_power_secs=1, save_to_file=False, log_level="error")

start_time = time.time()             # <-- start timing
tracker.start()

model = LinearRegression().fit(X_train, y_train)

_ = tracker.stop()
end_time = time.time()               # <-- end timing

train_data = tracker.final_emissions_data
training_time = end_time - start_time

print("\n=== Training Energy & Time ===")
print(f"Training time: {training_time:.2f} seconds")
print(f"Energy consumed: {train_data.energy_consumed:.6f} kWh")
print(f"CO₂ emissions:  {train_data.emissions:.6f} kg CO₂eq")



=== Training Energy & Time ===
Training time: 3.91 seconds
Energy consumed: 0.000108 kWh
CO₂ emissions:  0.000052 kg CO₂eq
