<a href="https://colab.research.google.com/github/sana-f-shah/Smart-Meter-Analysis/blob/main/notebooks/time_series_xgb_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the data and model paths used
# later in this notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error

# Locations on the mounted Drive: input parquet files and saved models.
data_dir = '/content/drive/MyDrive/Portfolio/Smart Meter Consumption/data_versions/time series'
model_dir = '/content/drive/MyDrive/Portfolio/Smart Meter Consumption/models/time series'

# One parquet file per preprocessing variant, keyed by the variant name.
dataset_files = {
    variant: f'{data_dir}/{variant}.parquet'
    for variant in ('original', 'no_outliers', 'winsorized', 'imputed')
}

# Model inputs and the column being predicted.
features = [
    'lag_1', 'lag_2', 'rolling_3h',
    'hour', 'dayofweek', 'month', 'is_weekend',
]
target = 'target'

# Column-oriented accumulator: one (initially empty) list per summary column.
results = {
    column: []
    for column in ('dataset', 'rmse', 'mae', 'mape (%)', 'median_ae', 'r2', 'mbe (bias)')
}

# Create the model folder up front so save_model() cannot fail on a missing directory.
os.makedirs(model_dir, exist_ok=True)

# Train one model per preprocessing variant and score it on that variant's own
# held-out tail; one row per variant is appended to `results`.
for name, path in dataset_files.items():
    print(f'Processing dataset: {name}')
    df = pd.read_parquet(path)

    # Keep only rows where every feature and the target are present.
    df = df.dropna(subset=features + [target])

    # Positional 80/20 hold-out: the first 80% of rows train, the last 20% validate.
    # NOTE(review): this is a chronological split only if the parquet rows are
    # already sorted by time — confirm upstream.
    split_idx = int(len(df) * 0.8)
    train = df.iloc[:split_idx]
    test = df.iloc[split_idx:]

    # `device='cuda'` alone selects the GPU for training and prediction. The old
    # `predictor='gpu_predictor'` argument is omitted: XGBoost releases that accept
    # `device` no longer recognise it, and verbosity=0 was hiding the resulting
    # "parameter not used" warning.
    model = XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist',
        device='cuda',
        n_estimators=100,
        verbosity=0,
        random_state=42
    )
    model.fit(train[features], train[target])

    # Persist the fitted model so the test-set cell can reload it by dataset name.
    model.save_model(os.path.join(model_dir, f'{name}_ts_model.json'))

    preds = model.predict(test[features])

    mse = mean_squared_error(test[target], preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test[target], preds)
    medae = median_absolute_error(test[target], preds)
    r2 = r2_score(test[target], preds)
    # Mean bias error: positive means the model over-predicts on average.
    mbe = np.mean(preds - test[target])

    # MAPE is undefined where the actual value is 0, so those rows are excluded;
    # if every actual value is 0 the metric is reported as NaN.
    non_zero_mask = test[target] != 0
    if non_zero_mask.any():
        mape = np.mean(np.abs((test[target][non_zero_mask] - preds[non_zero_mask]) / test[target][non_zero_mask])) * 100
    else:
        mape = np.nan

    results['dataset'].append(name)
    results['rmse'].append(rmse)
    results['mae'].append(mae)
    results['mape (%)'].append(mape)
    results['median_ae'].append(medae)
    results['r2'].append(r2)
    results['mbe (bias)'].append(mbe)

# One row per dataset variant, columns in the order declared in `results`.
val_results_df = pd.DataFrame(results)
print('\nTime Series Model Performance Summary:')
print(val_results_df)

Processing dataset: original
Processing dataset: no_outliers
Processing dataset: winsorized
Processing dataset: imputed

Time Series Model Performance Summary:
       dataset         rmse         mae   mape (%)   median_ae        r2  \
0     original   765.826104  445.863390   9.869562  309.312500  0.990863   
1  no_outliers   407.715835  311.293510   8.514958  244.443359  0.993147   
2   winsorized  1114.524191  508.569788  10.419950  311.439941  0.979326   
3      imputed   777.075882  451.546982  10.114254  314.677002  0.990655   

   mbe (bias)  
0   -4.152445  
1   -0.114327  
2    3.088484  
3   -2.307804  


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
import pandas as pd
import numpy as np
import os

# Held-out test set and the folder holding the models saved by the training cell.
test_path = '/content/drive/MyDrive/Portfolio/Smart Meter Consumption/data_versions/time series/test.parquet'
model_dir = '/content/drive/MyDrive/Portfolio/Smart Meter Consumption/models/time series/'

# Each name maps to a saved model file '<name>_ts_model.json' in model_dir.
dataset_names = ['original', 'no_outliers', 'winsorized', 'imputed']

features = [
    'lag_1', 'lag_2', 'rolling_3h',
    'hour', 'dayofweek', 'month', 'is_weekend',
]
target = 'target'

# Load the test set once and drop rows missing any feature or the target;
# every model is scored on this same frame.
test_df = pd.read_parquet(test_path).dropna(subset=features + [target])
X_test, y_test = test_df[features], test_df[target]

# Column-oriented accumulator: one (initially empty) list per summary column.
test_results = {
    column: []
    for column in ('dataset', 'rmse', 'mae', 'mape (%)', 'median_ae', 'r2', 'mbe (bias)')
}

# Score every saved model on the shared test set, one summary row per model.
for name in dataset_names:
    print(f'Evaluating model: {name}')

    # Reload the model that was saved under this dataset's name.
    model_path = os.path.join(model_dir, f'{name}_ts_model.json')
    model = XGBRegressor()
    model.load_model(model_path)
    y_pred = model.predict(X_test)

    # MAPE skips rows whose actual value is 0 (division by zero); it stays NaN
    # when no non-zero actuals exist.
    non_zero_mask = y_test != 0
    mape = np.nan
    if non_zero_mask.any():
        mape = np.mean(np.abs((y_test[non_zero_mask] - y_pred[non_zero_mask]) / y_test[non_zero_mask])) * 100

    # Mean bias error: positive means the model over-predicts on average.
    mbe = np.mean(y_pred - y_test)
    r2 = r2_score(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Append this model's row, addressing each column by name.
    row = {
        'dataset': name,
        'rmse': rmse,
        'mae': mae,
        'mape (%)': mape,
        'median_ae': medae,
        'r2': r2,
        'mbe (bias)': mbe,
    }
    for column, value in row.items():
        test_results[column].append(value)

test_results_df = pd.DataFrame(test_results)
print('\nTime Series Test Set Performance Summary:')
print(test_results_df)

Evaluating model: original
Evaluating model: no_outliers
Evaluating model: winsorized
Evaluating model: imputed

Time Series Test Set Performance Summary:
       dataset         rmse          mae   mape (%)   median_ae        r2  \
0     original  1077.127488   588.804007   9.904761  311.351013  0.974340   
1  no_outliers  3927.021922  1242.285720  12.473152  320.259521  0.658930   
2   winsorized   868.674294   515.391521  10.240935  327.290527  0.983311   
3      imputed  1184.931151   593.049651  11.862866  380.849915  0.968947   

   mbe (bias)  
0  -65.972841  
1 -257.955448  
2   20.215134  
3   11.461013  


In [None]:
!pip install --quiet gspread gspread_dataframe

from google.colab import auth
from google.colab import drive
from gspread_dataframe import set_with_dataframe
import gspread
from google.auth import default
import pandas as pd

# Authenticate this Colab session and reuse its default Google credentials for
# the Sheets client. Drive is not re-mounted here: the first cell already mounts
# it, nothing in this cell reads from the mounted filesystem, and a second
# mount call only prints a "Drive already mounted" notice.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Results are appended to the first tab of this spreadsheet.
spreadsheet = gc.open("smart_meter_consumption_results")
worksheet = spreadsheet.sheet1

def insert_into_sheet(df: pd.DataFrame, model_type_prefix: str, data_split: str) -> None:
    """Append a labelled copy of a results table below the worksheet's existing rows.

    The block written consists of one blank spacer row, one row holding the
    column names, and then the result rows. It starts on the first row after
    the rows currently returned by the module-level ``worksheet``.

    Args:
        df: Results table. Must contain a ``dataset`` column (copied into
            ``preprocessing``) and an ``r2`` column (moved to the front).
            The caller's frame is left unmodified.
        model_type_prefix: Value written to the ``model_type`` column of every row.
        data_split: Value written to the ``data_split`` column of every row.

    Raises:
        KeyError: If ``df`` has no ``dataset`` or no ``r2`` column.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its original
    # columns (adding them in place leaked extra columns into the results tables).
    labelled = df.assign(
        model_type=model_type_prefix,
        data_split=data_split,
        preprocessing=df['dataset'].astype(str),
    )

    # 'dataset' is now carried by 'preprocessing'; 'model' is dropped when present.
    labelled = labelled.drop(columns=['dataset', 'model'], errors='ignore')

    # Identifying columns and r2 first, every other metric in its existing order.
    front_cols = ['model_type', 'data_split', 'preprocessing', 'r2']
    remaining_cols = [col for col in labelled.columns if col not in front_cols]
    labelled = labelled[front_cols + remaining_cols]

    # The header is written as an ordinary data row (include_column_header=False
    # below) so that a blank spacer row can precede it.
    blank_row = pd.DataFrame([[''] * len(labelled.columns)], columns=labelled.columns)
    header_row = pd.DataFrame([labelled.columns.tolist()], columns=labelled.columns)
    block = pd.concat([blank_row, header_row, labelled], ignore_index=True)

    next_row = len(worksheet.get_all_values()) + 1
    set_with_dataframe(worksheet, block, row=next_row, col=1, include_column_header=False)

# Push both summaries to the sheet: validation first, then test.
for split_label, split_results in (('val', val_results_df), ('test', test_results_df)):
    insert_into_sheet(split_results, model_type_prefix='time_series', data_split=split_label)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
