# AI-Powered Energy Demand Forecasting

**Notebook:** `energy_demand_forecasting_notebook.ipynb`

Author: Your Name  
Internship ID: INTERNSHIP_17513641056863b20937d78  
Theme: Sustainable Energy & Efficiency

----
**Notes before running:**
- This notebook first tries to load a CSV file `./data/energy_consumption.csv` (you can put your dataset there).  
- If that file is not found, the notebook generates a synthetic hourly dataset (1 year), saves it to that same path, and uses it so you can run end-to-end immediately. Later runs will load the saved file.  
- Replace the synthetic data with your real dataset and adjust column names if needed.  
- Keep the final `.ipynb` size < 10MB — remove large saved models or heavy logs before submission.


## 1) Imports & Utility Functions

In [ ]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

def mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Exact zeros in ``y_true`` are replaced by 1e-6 in the denominator so the
    division never hits a true zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # Swap exact zeros for a tiny constant to keep the ratio finite.
    safe_actual = np.where(actual == 0, 1e-6, actual)
    relative_error = np.abs((actual - predicted) / safe_actual)
    return np.mean(relative_error) * 100

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


## 2) Load dataset or generate synthetic data

Place your CSV at `./data/energy_consumption.csv`. Expected columns (if you use your own data):
- `timestamp` (datetime-like)  
- `energy` or `load` (target variable: energy demand)  
- Optional weather features: `temperature`, `humidity`, `wind_speed`, `solar_radiation`

If the file is not found, synthetic hourly data for 1 year will be generated.

In [ ]:
# Load the local dataset if present; otherwise generate (and save) synthetic data.
DATA_PATH = './data/energy_consumption.csv'


def _generate_synthetic_data(periods=24 * 365, seed=42):
    """Build a synthetic hourly energy dataset starting at 2023-01-01.

    Args:
        periods: Number of hourly rows to generate (default: one year).
        seed: Seed passed to ``np.random.seed`` so the data is reproducible.

    Returns:
        DataFrame with ``timestamp``, ``energy``, ``temperature`` and
        ``humidity`` columns.
    """
    # A Timedelta frequency avoids the 'H' alias, which recent pandas deprecates.
    rng = pd.date_range(start='2023-01-01', periods=periods, freq=pd.Timedelta(hours=1))
    np.random.seed(seed)
    base = 200 + 40 * np.sin(2 * np.pi * rng.hour / 24)  # daily seasonality
    seasonal = 20 * np.sin(2 * np.pi * rng.dayofyear / 365)  # yearly seasonality
    # The draw order (noise, temperature, humidity) fixes the values for a given seed.
    noise = np.random.normal(0, 8, len(rng))
    temperature = 20 + 10 * np.sin(2 * np.pi * rng.dayofyear / 365) + np.random.normal(0, 2, len(rng))
    humidity = 60 + 10 * np.cos(2 * np.pi * rng.hour / 24) + np.random.normal(0, 3, len(rng))
    energy = base + seasonal - 0.8 * temperature + 0.3 * humidity + noise
    return pd.DataFrame({
        'timestamp': rng,
        'energy': energy,
        'temperature': temperature,
        'humidity': humidity,
    })


def load_energy_data(path=DATA_PATH):
    """Load the energy dataset from ``path``, generating it first if missing.

    When ``path`` does not exist, a synthetic dataset is generated and written
    to ``path`` so later runs load the same file. A target column named
    ``load`` is accepted as an alias and renamed to ``energy``.

    Args:
        path: CSV location to read from (and to write synthetic data to).

    Returns:
        DataFrame sorted by a parsed ``timestamp`` column, with a fresh index.

    Raises:
        ValueError: If the data lacks a ``timestamp`` or ``energy`` column.
    """
    if os.path.exists(path):
        data = pd.read_csv(path)
        print('Loaded dataset from', path)
    else:
        print('No local dataset found. Generating synthetic hourly data for 1 year...')
        data = _generate_synthetic_data()
        # dirname is '' for a bare filename; fall back to the current directory.
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        data.to_csv(path, index=False)
        print('Synthetic dataset saved to', path)

    # The rest of the notebook expects the target to be called 'energy'.
    if 'energy' not in data.columns and 'load' in data.columns:
        data = data.rename(columns={'load': 'energy'})

    missing = [col for col in ('timestamp', 'energy') if col not in data.columns]
    if missing:
        raise ValueError(
            f'Dataset at {path} is missing required column(s): {missing}. '
            "Expected 'timestamp' and 'energy' (or 'load')."
        )

    data['timestamp'] = pd.to_datetime(data['timestamp'])
    return data.sort_values('timestamp').reset_index(drop=True)


df = load_energy_data(DATA_PATH)

# Quick check
df.head()


## 3) Exploratory Data Analysis (EDA)
Plot the time series and look at statistics.

In [ ]:
print('Dataset length:', len(df))
# Weather columns are optional in user-supplied data, so summarise only those present.
summary_cols = ['energy'] + [col for col in ('temperature', 'humidity') if col in df.columns]
print(df[summary_cols].describe().T)

# Hourly view: the first 14 days (24 rows per day for hourly data).
plt.figure(figsize=(12,4))
plt.plot(df['timestamp'][:24*14], df['energy'][:24*14])
plt.title('Energy demand (first 14 days)')
plt.xlabel('Time')
plt.ylabel('Energy')
plt.tight_layout()
plt.show()

# Daily aggregation sample
# numeric_only=True keeps the daily mean from failing on text columns in user data.
df_daily = df.set_index('timestamp').resample('D').mean(numeric_only=True).reset_index()
plt.figure(figsize=(12,4))
plt.plot(df_daily['timestamp'][:90], df_daily['energy'][:90])
plt.title('Daily average energy (first 90 days)')
plt.xlabel('Date')
plt.ylabel('Energy')
plt.tight_layout()
plt.show()


## 4) Feature Engineering
- Create lag features (previous 1, 24, 168 hours)
- Time features: hour, dayofweek, month
- Rolling means over the previous 24 and 168 hours (shifted by one row so the current value is never included)

In [ ]:
# Work on a copy so the raw frame stays untouched.
df_fe = df.copy()

# Calendar features taken from the timestamp.
ts_parts = df_fe['timestamp'].dt
for part in ('hour', 'dayofweek', 'month'):
    df_fe[part] = getattr(ts_parts, part)

# Lagged copies of the target: 1, 24 and 168 rows back.
energy_series = df_fe['energy']
for n_back in (1, 24, 168):
    df_fe['lag_' + str(n_back)] = energy_series.shift(n_back)

# Rolling means, shifted by one row so each value only uses earlier rows.
for window in (24, 168):
    rolled = energy_series.rolling(window=window).mean()
    df_fe[f'rmean_{window}'] = rolled.shift(1)

# The lag/rolling columns are NaN at the start of the series; drop those rows.
df_fe = df_fe.dropna()
df_fe = df_fe.reset_index(drop=True)
print('After feature engineering length:', len(df_fe))
df_fe.head()


## 5) Train/test split
We will do a time-based split (no shuffling). Use the last 20% as test.

In [ ]:
# Chronological split: the first 80% of rows train the models, the rest is held out.
train_size = int(0.8 * len(df_fe))
train = df_fe.iloc[:train_size].copy()
test = df_fe.iloc[train_size:].copy()
if train.empty or test.empty:
    raise ValueError('Not enough rows after feature engineering to build a train/test split.')

target = 'energy'
# Engineered features are always present; weather columns are optional in
# user-supplied data, so include only the ones the frame actually has.
features = ['lag_1','lag_24','lag_168','rmean_24','rmean_168','hour','dayofweek','month']
features += [col for col in ('temperature', 'humidity') if col in df_fe.columns]

X_train = train[features].values
y_train = train[target].values
X_test = test[features].values
y_test = test[target].values

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


## 6) Baseline Models: Linear Regression and Random Forest

In [ ]:
# Scale features for Linear Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
pred_lr = lr.predict(X_test_scaled)
print('Linear Regression RMSE:', rmse(y_test, pred_lr))
print('Linear Regression MAPE:', mape(y_test, pred_lr))

# Random Forest (no scaling needed)
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
print('Random Forest RMSE:', rmse(y_test, pred_rf))
print('Random Forest MAPE:', mape(y_test, pred_rf))

plt.figure(figsize=(12,4))
plt.plot(test['timestamp'].values, y_test, label='Actual')
plt.plot(test['timestamp'].values, pred_rf, label='RF Predicted')
plt.legend()
plt.xlabel('Time')
plt.ylabel('Energy')
plt.title('Actual vs RF Predicted (Test set)')
plt.tight_layout()
plt.show()


## 7) Advanced Model: LSTM (sequence-to-one forecasting)

We will create sequences using previous `seq_hours` hours to predict the next hour.
This section requires `tensorflow`/`keras`. If your environment doesn't have tensorflow, you can skip the LSTM and rely on the Linear Regression / Random Forest baselines from Section 6.
If tensorflow is not installed, run `pip install tensorflow` in your environment.

In [ ]:
# LSTM modeling (optional: only runs when TensorFlow can be imported)
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    print('TensorFlow version:', tf.__version__)
    TF_AVAILABLE = True
except Exception as e:
    # Deliberately broad: any import-time failure just disables this section.
    print('TensorFlow not available in this environment. Skipping LSTM. Error:', e)
    TF_AVAILABLE = False

if TF_AVAILABLE:
    seq_hours = 48
    def create_sequences(X, y, seq_len=seq_hours):
        """Build sliding windows of ``seq_len`` rows of ``X`` paired with the next ``y``.

        Window ``i`` holds rows ``X[i-seq_len:i]`` and its target is ``y[i]``,
        so the row at the target position is never part of its own window.

        Returns:
            Tuple ``(Xs, ys)`` of arrays with ``len(X) - seq_len`` entries each.
        """
        Xs, ys = [], []
        for i in range(seq_len, len(X)):
            Xs.append(X[i-seq_len:i])
            ys.append(y[i])
        return np.array(Xs), np.array(ys)

    # We'll use scaled features for LSTM
    X_all = np.vstack([X_train_scaled, X_test_scaled])
    y_all = np.hstack([y_train, y_test])
    # Create sequences on the full, time-ordered data, then split by index.
    X_seq, y_seq = create_sequences(X_all, y_all, seq_len=seq_hours)
    # Sequence j targets row j + seq_hours of the stacked data, so cutting at
    # len(train) - seq_hours keeps every training target inside the training
    # split and makes the LSTM test targets exactly y_test. Cutting at 80% of
    # the sequence count instead would train on the first few test-period rows
    # and score the LSTM on a different window than the baselines.
    split_index = len(X_train_scaled) - seq_hours
    if split_index <= 0:
        raise ValueError(
            f'Training split has {len(X_train_scaled)} rows; need more than '
            f'{seq_hours} to build LSTM sequences.'
        )
    X_seq_train, X_seq_test = X_seq[:split_index], X_seq[split_index:]
    y_seq_train, y_seq_test = y_seq[:split_index], y_seq[split_index:]
    print('LSTM input shapes:', X_seq_train.shape, X_seq_test.shape)

    model = Sequential()
    model.add(LSTM(64, input_shape=(X_seq_train.shape[1], X_seq_train.shape[2]), return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # Train. The test window is passed only to monitor loss per epoch; no
    # callback or model selection in this cell depends on it.
    history = model.fit(X_seq_train, y_seq_train, epochs=10, batch_size=64, validation_data=(X_seq_test, y_seq_test))

    # Predict
    pred_lstm = model.predict(X_seq_test).flatten()
    print('LSTM RMSE:', rmse(y_seq_test, pred_lstm))
    print('LSTM MAPE:', mape(y_seq_test, pred_lstm))

    # Plot training loss
    plt.figure(figsize=(8,4))
    plt.plot(history.history['loss'], label='train_loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.title('LSTM Training Loss')
    plt.legend()
    plt.show()
else:
    print('LSTM section skipped. Install tensorflow to run this cell.')


## 8) Save best model & artifacts
Save scaler and Random Forest (or best model). For submission keep model files out of the notebook (or compress) to stay under 10 MB.


In [ ]:
# Persist the fitted scaler and Random Forest so they can be reloaded later.
artifact_dir = './artifacts'
os.makedirs(artifact_dir, exist_ok=True)
for filename, fitted in (('scaler.joblib', scaler), ('random_forest.joblib', rf)):
    joblib.dump(fitted, f'{artifact_dir}/{filename}')
print('Saved scaler and Random Forest to ./artifacts')


## 9) Conclusion & How to Replace with Your Real Dataset

1. Replace `./data/energy_consumption.csv` with your file and rerun the notebook.  
2. Ensure your datetime column is called `timestamp` (or adjust the code).  
3. Make sure `energy` is the target column name or change `target = 'energy'`.  
4. If using a large dataset, consider downsampling or using a sample for the notebook to keep <10MB.


----
End of notebook. Replace author details and add your README in the repo.
