In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [15]:
# Preprocessed train/test splits without the lag feature.
df_train, df_test = (
    pd.read_csv("./../data/preprocessed/train.csv"),
    pd.read_csv("./../data/preprocessed/test.csv"),
)

# Same splits in their "lag" variant (the column listing printed further down
# shows an additional `Energy_Lag1` column for these frames).
df_train_lag, df_test_lag = (
    pd.read_csv("./../data/preprocessed/train_lag.csv"),
    pd.read_csv("./../data/preprocessed/test_lag.csv"),
)

In [16]:
# Name of the column every benchmark model is fitted to predict.
target = "Energy_ktoe"

# Columns that must never be used as model inputs.
cols_to_drop = [
    "Year",
    "End_Use",
    "Energy_ktoe",  # the target itself; keeping it would leak the label
]

In [17]:
# Feature lists: every column of the training frame that is not listed in
# `cols_to_drop`, kept in the frame's original column order.
feature_cols = [name for name in df_train.columns if name not in cols_to_drop]
feature_cols_lag = [name for name in df_train_lag.columns if name not in cols_to_drop]

# Show both lists, one per line, so the two feature sets can be compared.
print(feature_cols, feature_cols_lag, sep="\n")

['Population', 'Households', 'Inhabitants per household', 'HDD', 'CDD', 'GDP', 'Expenditure', 'Tech_Advanced electric heating', 'Tech_Air conditioning', 'Tech_Biomass', 'Tech_Conventional electric heating', 'Tech_Diesel oil', 'Tech_Distributed heat', 'Tech_Electricity', 'Tech_Electricity in circulation', 'Tech_Geothermal', 'Tech_Liquified petroleum gas (LPG)', 'Tech_Natural gas', 'Tech_Solar', 'Tech_Solids']
['Population', 'Households', 'Inhabitants per household', 'HDD', 'CDD', 'GDP', 'Expenditure', 'Energy_Lag1', 'Tech_Advanced electric heating', 'Tech_Air conditioning', 'Tech_Biomass', 'Tech_Conventional electric heating', 'Tech_Diesel oil', 'Tech_Distributed heat', 'Tech_Electricity', 'Tech_Electricity in circulation', 'Tech_Geothermal', 'Tech_Liquified petroleum gas (LPG)', 'Tech_Natural gas', 'Tech_Solar', 'Tech_Solids']


In [18]:
# Benchmark 1: linear regression on the base feature set (no lag data).
lr = LinearRegression().fit(df_train[feature_cols], df_train[target])
lr_preds = lr.predict(df_test[feature_cols])

# NOTE: scikit-learn returns MAPE as a ratio (1.0 == 100 %), not as a
# percentage, so the value displayed below must be read on that scale.
mape_lr = mean_absolute_percentage_error(y_true=df_test[target], y_pred=lr_preds)
mape_lr

9.143041857654001

In [19]:
# Benchmark 2: linear regression on the lag-augmented feature set.
lr_lag = LinearRegression().fit(df_train_lag[feature_cols_lag], df_train_lag[target])
lr_preds_lag = lr_lag.predict(df_test_lag[feature_cols_lag])

# NOTE: scikit-learn returns MAPE as a ratio (1.0 == 100 %), not as a
# percentage, so the value displayed below must be read on that scale.
mape_lr_lag = mean_absolute_percentage_error(
    y_true=df_test_lag[target],
    y_pred=lr_preds_lag,
)
mape_lr_lag

4.640848296665382

In [20]:
# Benchmark 3: depth-limited decision tree on the base feature set (no lag data).
# `random_state` is fixed so that re-running the cell reproduces the same tree.
dt = DecisionTreeRegressor(random_state=42, max_depth=5).fit(
    df_train[feature_cols],
    df_train[target],
)
pred_dt = dt.predict(df_test[feature_cols])

# NOTE: scikit-learn returns MAPE as a ratio (1.0 == 100 %), not as a percentage.
mape_dt = mean_absolute_percentage_error(y_true=df_test[target], y_pred=pred_dt)
mape_dt

9.594744416867794

In [21]:
# Benchmark 4: depth-limited decision tree on the lag-augmented feature set.
# `random_state` is fixed so that re-running the cell reproduces the same tree.
dt_lag = DecisionTreeRegressor(random_state=42, max_depth=5).fit(
    df_train_lag[feature_cols_lag],
    df_train_lag[target],
)
pred_dt_lag = dt_lag.predict(df_test_lag[feature_cols_lag])

# NOTE: scikit-learn returns MAPE as a ratio (1.0 == 100 %), not as a percentage.
mape_dt_lag = mean_absolute_percentage_error(
    y_true=df_test_lag[target],
    y_pred=pred_dt_lag,
)
mape_dt_lag

1.1960000623753406

In [22]:
# Benchmark 5: gradient-boosted trees (XGBoost) on the base feature set (no lag data).
# Shallow trees with a small learning rate; the seed is fixed for
# reproducibility and n_jobs=-1 lets XGBoost use every available core.
xgb = XGBRegressor(
    max_depth=3,
    learning_rate=0.05,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(df_train[feature_cols], df_train[target])

pred_xgb = xgb.predict(df_test[feature_cols])

# NOTE: scikit-learn returns MAPE as a ratio (1.0 == 100 %), not as a percentage.
mape_xgb = mean_absolute_percentage_error(y_true=df_test[target], y_pred=pred_xgb)
mape_xgb

7.792444034156451

In [23]:
# Benchmark 6: gradient-boosted trees (XGBoost) on the lag-augmented feature set.
# Same hyper-parameters as the no-lag XGBoost benchmark so the two runs
# differ only in their input features.
xgb_lag = XGBRegressor(
    max_depth=3,
    learning_rate=0.05,
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(df_train_lag[feature_cols_lag], df_train_lag[target])

pred_xgb_lag = xgb_lag.predict(df_test_lag[feature_cols_lag])

# NOTE: scikit-learn returns MAPE as a ratio (1.0 == 100 %), not as a percentage.
mape_xgb_lag = mean_absolute_percentage_error(
    y_true=df_test_lag[target],
    y_pred=pred_xgb_lag,
)
mape_xgb_lag

1.1021809044802162