In [12]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

print("Loading cumulative dataset...")
df = pd.read_csv('daily_MQ-7_data.csv')

# Define inputs and target
features = ['Day_Of_Week', 'Click_Hour', 'Click_Minute', 'Current_Cumulative_Sum']
X = df[features]
y = df['Target_Final_Daily_Sum']

# Because this is randomized snapshot data (not a continuous time-series wave), 
# we CAN safely use a standard random train/test split here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Daily Prediction Model...")
model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42)
model.fit(X_train, y_train)

# Evaluate
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

# We can actually use Percentage Error here because the Target is a massive cumulative sum
mape = mean_absolute_percentage_error(y_test, predictions) * 100

print(f"\n--- Model Performance ---")
print(f"Mean Absolute Error: {mae:.0f} ADC points (out of ~100,000+ daily average)")
print(f"Average Error Percentage: {mape:.2f}%")

# Save the new model
joblib.dump(model, 'daily_emission_model.joblib')
print("\nSuccess! Daily Extrapolation Model saved as 'daily_emission_model.joblib'")

Loading cumulative dataset...
Training Daily Prediction Model...

--- Model Performance ---
Mean Absolute Error: 4923 ADC points (out of ~100,000+ daily average)
Average Error Percentage: 4.46%

Success! Daily Extrapolation Model saved as 'daily_emission_model.joblib'
