In [6]:
!conda install --yes --file requirements.txt

# Smart Factory Energy Prediction Challenge

## Goals:
- Analyze the provided sensor data to identify patterns and relationships between environmental factors and equipment energy consumption
- Build a robust regression model to predict equipment energy consumption
- Evaluate the model's performance using appropriate metrics
- Provide actionable insights and recommendations for reducing energy consumption

In [None]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error

## Data Preprocessing
### Importing the Datasets

In [None]:
# Load the sensor data; the 'timestamp' column is parsed into datetimes on read.
df = pd.read_csv(
    "./data/data.csv",
    parse_dates=["timestamp"],
)

### Handling missing data

In [None]:
# Show how many values are missing in each column of df
print(df.isna().sum())

#### Data Types

**Time-Series**
- timestamp

**Numeric**
- equipment_energy_consumption
- lighting_energy
- zone1_temperature
- zone1_humidity
- zone2_temperature
- zone2_humidity
- zone3_temperature
- zone3_humidity
- zone4_temperature
- zone4_humidity
- zone5_temperature
- zone5_humidity
- zone6_temperature
- zone6_humidity
- zone7_temperature
- zone7_humidity
- zone8_temperature
- zone8_humidity
- zone9_temperature
- zone9_humidity
- outdoor_temperature
- atmospheric_pressure
- outdoor_humidity
- wind_speed
- visibility_index
- dew_point
- random_variable1
- random_variable2

Since `timestamp` has no missing values, only the other columns need to be filled. Missing values in the numeric columns are replaced with the column mean.

In [None]:
def verify_df_missing(df, msg):
    """Print `msg`, then the number of missing values in each column of `df`.

    Parameters
    ----------
    df : DataFrame to inspect (not modified).
    msg : Heading printed on its own line before the counts.
    """
    print(msg)
    missing_per_column = df.isna().sum()
    print(missing_per_column)

In [None]:
def fillna(df):
    """Order `df` by timestamp and mean-impute its numeric columns, in place.

    The frame is sorted on its 'timestamp' column, then every NaN in a numeric
    column is replaced by that column's mean. Non-numeric columns are left
    untouched, so any gaps in them remain. Nothing is returned; `df` itself
    is modified.
    """
    # Chronological ordering only; no forward-fill is applied afterwards.
    df.sort_values(by='timestamp', inplace=True)
    numeric_cols = df.select_dtypes(include='number').columns
    # mean() skips NaN, so each mean is taken over the observed values only.
    column_means = df[numeric_cols].mean()
    df[numeric_cols] = df[numeric_cols].fillna(column_means)

In [None]:
# Sort by timestamp and mean-impute the numeric columns in place,
# then print how many missing values remain in each column.
fillna(df)
verify_df_missing(df, "Missing values after treatment:")

Missing values after treatment:
timestamp                         0
equipment_energy_consumption    844
lighting_energy                 809
zone1_temperature               867
zone1_humidity                  801
zone2_temperature               853
zone2_humidity                    0
zone3_temperature                 0
zone3_humidity                    0
zone4_temperature                 0
zone4_humidity                    0
zone5_temperature                 0
zone5_humidity                    0
zone6_temperature                 0
zone6_humidity                    0
zone7_temperature                 0
zone7_humidity                    0
zone8_temperature                 0
zone8_humidity                    0
zone9_temperature                 0
zone9_humidity                    0
outdoor_temperature               0
atmospheric_pressure              0
outdoor_humidity                  0
wind_speed                        0
visibility_index                  0
dew_point                       

In [None]:
# Columns that should hold numbers but need explicit conversion to a numeric dtype.
num_cols = [
    'equipment_energy_consumption',
    'lighting_energy',
    'zone1_temperature',
    'zone1_humidity',
    'zone2_temperature',
]

def column_converter(df: pd.DataFrame, columns=None) -> None:
    """Coerce the given columns of `df` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are converted; it is modified in place.
    columns : iterable of str or str, optional
        Column name(s) to convert. Defaults to the module-level ``num_cols``.
        Names that are not present in ``df`` are reported with a printed
        warning and skipped.

    Returns
    -------
    None
    """
    if columns is None:
        columns = num_cols
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    for col in columns:
        if col not in df.columns:
            print(f"Warning: Column {col} not found in DataFrame")
            continue
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Coerce the columns listed in num_cols to numeric (unparseable entries become NaN),
# then repeat the mean imputation and print the remaining per-column missing counts.
# NOTE(review): this call passes num_cols as a second positional argument, so
# column_converter must accept a column list for the cell to run.
column_converter(df, num_cols)
fillna(df)
verify_df_missing(df, "Missing values after treatment:")