# Final Model: Smart Factory Energy Prediction

This notebook contains the cleaned-up final version of the model-training pipeline, which uses a Random Forest Regressor to predict `equipment_energy_consumption`. It includes only the code required to:
- Load and preprocess the data
- Extract relevant features
- Train and evaluate the model

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [14]:
# Load data
# NOTE(review): this is an absolute, machine-specific path. Its value is kept
# unchanged so the cell still runs where it was authored; point DATA_PATH at
# your own copy of data.csv when running elsewhere.
DATA_PATH = r"C:\Users\Sagar\Documents\DS-Intern-Assignment--SagarGaur-\data\data.csv"
df = pd.read_csv(DATA_PATH)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Work on a copy so the parsed raw frame `df` stays available for inspection.
df_cleaned = df.copy()

# Coerce every column except the timestamp and the two random variables to
# numeric; entries that cannot be parsed become NaN.
cols_to_convert = df.columns.drop(['timestamp', 'random_variable1', 'random_variable2'])
df_cleaned[cols_to_convert] = df_cleaned[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Fill gaps from the previous row, then from the next row for leading gaps.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling, which
# emits a FutureWarning on recent pandas.
df_cleaned[cols_to_convert] = df_cleaned[cols_to_convert].ffill().bfill()

# Mask negative values as missing in every non-timestamp column, then re-fill.
# random_variable1/2 are masked as well but are not part of cols_to_convert,
# so any NaN introduced in them is left in place.
# NOTE(review): this also discards negative temperatures / dew points -
# confirm that negative readings are really invalid for this dataset.
col_temp = df_cleaned.columns.drop('timestamp')
df_cleaned[col_temp] = df_cleaned[col_temp].where(df_cleaned[col_temp] >= 0, np.nan)
df_cleaned[cols_to_convert] = df_cleaned[cols_to_convert].ffill().bfill()

# Calendar features derived from the timestamp.
df_cleaned['hour'] = df_cleaned['timestamp'].dt.hour
df_cleaned['day_of_week'] = df_cleaned['timestamp'].dt.dayofweek
df_cleaned['month'] = df_cleaned['timestamp'].dt.month
df_cleaned['is_weekend'] = df_cleaned['day_of_week'].isin([5, 6]).astype(int)

# Cyclical encoding of the hour so that 23:00 and 00:00 end up adjacent.
df_cleaned['hour_sin'] = np.sin(2 * np.pi * df_cleaned['hour'] / 24)
df_cleaned['hour_cos'] = np.cos(2 * np.pi * df_cleaned['hour'] / 24)

# The raw timestamp is no longer needed once the features above exist.
df_cleaned = df_cleaned.drop(columns=['timestamp'])
# Define function to remove outliers using IQR
def remove_outliers_iqr(df, columns, factor=10):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed in the order given, and each column's quartiles are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    columns : iterable of str
        Column labels to screen. Columns that are not integer or float typed
        (for example object or boolean columns) are skipped.
    factor : float, default 10
        Multiplier applied to the IQR to build the fences
        ``[Q1 - factor * IQR, Q3 + factor * IQR]``. The default is much wider
        than the conventional 1.5, so only extreme values are removed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie inside the fences of every screened
        column. Rows holding NaN in a screened column are dropped as well,
        because NaN fails the range comparison.
    """
    df_clean = df.copy()
    for col in columns:
        values = df_clean[col]
        # Accept every integer/float dtype (float32, int32, nullable Int64, ...),
        # not just the literal 'float64'/'int64' names; bool and object columns
        # are still skipped.
        is_screenable = (
            pd.api.types.is_integer_dtype(values)
            or pd.api.types.is_float_dtype(values)
        )
        if not is_screenable:
            continue
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # between() is inclusive on both ends, matching the >= / <= comparison.
        df_clean = df_clean[values.between(lower_bound, upper_bound)]
    return df_clean

# Columns screened for extreme values (extend this list as needed)
columns_to_check = [
    'equipment_energy_consumption',
    'lighting_energy',
    'zone1_temperature', 'zone2_temperature',
    'outdoor_temperature',
]

# Keep only the rows that fall inside the IQR fences of the columns above
df_no_outliers = remove_outliers_iqr(df=df_cleaned, columns=columns_to_check)

# Report the frame dimensions on either side of the filter
for label, frame in (("Before:", df_cleaned), ("After :", df_no_outliers)):
    print(label, frame.shape)


Before: (16857, 34)
After : (16539, 34)


  df_cleaned[cols_to_convert] = df_cleaned[cols_to_convert].fillna(method='ffill').fillna(method='bfill')
  df_cleaned[cols_to_convert] = df_cleaned[cols_to_convert].fillna(method='ffill').fillna(method='bfill')


In [15]:
# List the columns of the outlier-filtered frame to pick model features from.
df_no_outliers.columns

Index(['equipment_energy_consumption', 'lighting_energy', 'zone1_temperature',
       'zone1_humidity', 'zone2_temperature', 'zone2_humidity',
       'zone3_temperature', 'zone3_humidity', 'zone4_temperature',
       'zone4_humidity', 'zone5_temperature', 'zone5_humidity',
       'zone6_temperature', 'zone6_humidity', 'zone7_temperature',
       'zone7_humidity', 'zone8_temperature', 'zone8_humidity',
       'zone9_temperature', 'zone9_humidity', 'outdoor_temperature',
       'atmospheric_pressure', 'outdoor_humidity', 'wind_speed',
       'visibility_index', 'dew_point', 'random_variable1', 'random_variable2',
       'hour', 'day_of_week', 'month', 'is_weekend', 'hour_sin', 'hour_cos'],
      dtype='object')

In [16]:
# Target column and the columns deliberately left out of the feature matrix
target_column = 'equipment_energy_consumption'
excluded_columns = [
    target_column,
    'hour',
    'random_variable2',
    'random_variable1',
    'month',
    'is_weekend',
]

# Feature matrix and target vector
X = df_no_outliers.drop(columns=excluded_columns)
y = df_no_outliers[target_column]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest (fit() returns the fitted estimator itself)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the model on the held-out rows
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.2f}")

RMSE: 59.66
MAE: 30.30
R² Score: 0.56
