In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


In [3]:
# Read the CSV at ../data/final_cleaned_dataset.csv into `df`.
print("Loading dataset...")
df = pd.read_csv(filepath_or_buffer="../data/final_cleaned_dataset.csv")

# Report (rows, columns), then let the notebook render the first five rows.
print(df.shape)
df.head(n=5)

Loading dataset...
(4145150, 25)


Unnamed: 0,lclid,tstp,energy,hour,day,month,weekday,date,is_weekend,visibility,...,windspeed,preciptype,icon,humidity,summary,is_holiday,stdortou,acorn,acorn_grouped,file
0,MAC000640,2012-12-13 08:00:00,0.065,8,13,12,3,2012-12-13,0,6.26,...,1.48,snow,partly-cloudy-night,0.9,Partly Cloudy,0,ToU,ACORN-E,Affluent,block_16
1,MAC000320,2012-10-19 10:00:00,0.133,10,19,10,4,2012-10-19,0,11.78,...,1.2,rain,partly-cloudy-day,0.93,Mostly Cloudy,0,Std,ACORN-E,Affluent,block_16
2,MAC000525,2012-12-08 23:00:00,0.174,23,8,12,5,2012-12-08,1,13.84,...,4.3,rain,partly-cloudy-night,0.85,Partly Cloudy,0,ToU,ACORN-E,Affluent,block_16
3,MAC000544,2013-07-17 21:00:00,0.177,21,17,7,2,2013-07-17,0,14.39,...,1.69,rain,clear-night,0.59,Clear,0,ToU,ACORN-E,Affluent,block_16
4,MAC002202,2013-08-10 12:00:00,0.076,12,10,8,5,2013-08-10,1,12.47,...,3.98,rain,partly-cloudy-day,0.48,Partly Cloudy,0,Std,ACORN-E,Affluent,block_16


In [4]:
# Columns removed from `df` before modelling.
drop_cols = [
    'lclid','tstp','date','icon','summary','file'
]

# Build the modelling frame without touching `df`:
#   * drop() takes the whole list at once; errors="ignore" skips any listed
#     column that is not present, matching the previous per-column check.
#   * drop() and dropna() each return a new DataFrame, so neither an up-front
#     df.copy() nor inplace edits are needed.
df_model = (
    df
    .drop(columns=drop_cols, errors="ignore")
    .dropna()  # discard every row that still has a missing value
)

print(df_model.columns)

Index(['energy', 'hour', 'day', 'month', 'weekday', 'is_weekend', 'visibility',
       'windbearing', 'temperature', 'dewpoint', 'pressure',
       'apparenttemperature', 'windspeed', 'preciptype', 'humidity',
       'is_holiday', 'stdortou', 'acorn', 'acorn_grouped'],
      dtype='str')


In [5]:
# One-hot encode the non-numeric columns of `df_model`; drop_first=True omits
# the first level of each encoded column.
df_model = pd.get_dummies(data=df_model, drop_first=True)
print("After encoding:", df_model.shape)


After encoding: (4145150, 39)


In [6]:
# Separate the regression target ("energy") from the predictor columns.
y = df_model["energy"]
X = df_model.drop(columns="energy")


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each feature split.
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (3316120, 38)
Test size: (829030, 38)


In [8]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
print("\nTraining Linear Regression...")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed durations; time.time() follows the system wall clock and can jump
# if that clock is adjusted while the cell runs.
start = time.perf_counter()

# Fit on the standardized training features and predict the held-out split.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
pred_lr = lr.predict(X_test_scaled)

end = time.perf_counter()
lr_time = end - start  # seconds spent on fit + predict combined

print("Linear Regression trained")



Training Linear Regression...
Linear Regression trained


In [10]:
print("\nTraining Random Forest...")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed durations.
start = time.perf_counter()

# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so a
# full fit on X_train was not completed in that run. If the runtime is the
# problem, consider limiting the rows drawn per tree via `max_samples` --
# left unchanged here so the model configuration stays as authored.
rf = RandomForestRegressor(
    n_estimators=80,
    max_depth=15,
    n_jobs=-1,        # use all available CPU cores
    random_state=42,  # seed the forest, same value as the train/test split
)

# The forest is trained on the unscaled feature splits.
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

end = time.perf_counter()
rf_time = end - start  # seconds spent on fit + predict combined

print("Random Forest trained")



Training Random Forest...


KeyboardInterrupt: 