In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor

In [4]:
# Load the prepared dataset. The columns listed in `targets` are the three
# regression targets; every other remaining column is used as a feature.
df = pd.read_csv('data/data-ready.csv')

targets = ["z1", "z2", "z3"]

# Remove the timestamp column so it does not end up in the feature matrix.
# (Reassignment instead of inplace=True keeps the step chainable and explicit.)
df = df.drop(columns="DateTime")

# Feature matrix shared by all three targets.
X = df.drop(columns=targets)
y1 = df['z1']
y2 = df['z2']
y3 = df['z3']

# Split once for all targets. train_test_split applies the same row
# partition to every array it is given, so this yields the same 80/20 split
# that three separate calls with random_state=42 would produce.
# NOTE(review): rows are shuffled before splitting (the default). The data
# carries a DateTime column, so if rows are time-ordered a chronological
# split may give a more honest estimate -- confirm before relying on scores.
(
    X1_train, X1_test,
    y1_train, y1_test,
    y2_train, y2_test,
    y3_train, y3_test,
) = train_test_split(X, y1, y2, y3, test_size=0.2, random_state=42)

# The features are identical for every zone; keep the per-zone names that
# later cells expect.
X2_train = X3_train = X1_train
X2_test = X3_test = X1_test

In [5]:
from sklearn.model_selection import RepeatedKFold, cross_val_score

# Baseline: XGBoost regressor with library-default hyperparameters,
# evaluated on the zone-1 training split only.
model = XGBRegressor()

# 10-fold cross-validation repeated 3 times (30 fits in total); the fold
# assignment is seeded so the evaluation protocol itself is repeatable.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)

# The 'neg_mean_absolute_error' scorer returns MAE negated, hence the
# absolute value below to report it as a positive error.
scores = cross_val_score(
    estimator=model,
    X=X1_train,
    y=y1_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)
scores = np.absolute(scores)

mae_mean = scores.mean()
mae_std = scores.std()
print('Mean MAE: %.3f (%.3f)' % (mae_mean, mae_std))

Mean MAE: 835.390 (12.348)


In [6]:
# Summary statistics of the training features from the zone-1 split.
# Left as the last expression so the notebook renders it as the cell output.
X1_train.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,general diffuse flows,diffuse flows,Year,Month,Day,Hour,IsWeekend,IsNight,HourCos,HourSin,MonthCos,MonthSin,DayCos,DaySin
count,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0,41932.0
mean,18.813952,68.257074,1.957745,183.606466,75.203026,2017.0,6.512496,15.674068,11.49337,0.285725,0.457574,-0.003080794,0.00235228,-0.006059016,-0.005154925,-0.021557,0.001627529
std,5.794997,15.522747,2.348395,264.690512,124.193888,0.0,3.438807,8.776602,6.907445,0.451764,0.498203,0.7070757,0.7071441,0.7078056,0.7063793,0.699937,0.7138938
min,3.247,11.34,0.05,0.004,0.011,2017.0,1.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-0.994869,-0.9987165
25%,14.43,58.29,0.078,0.062,0.122,2017.0,4.0,8.0,6.0,0.0,0.0,-0.7071068,-0.7071068,-0.8660254,-0.8660254,-0.758758,-0.7247928
50%,18.81,69.85,0.086,5.762,5.0645,2017.0,7.0,16.0,11.0,0.0,0.0,-1.83697e-16,1.224647e-16,-1.83697e-16,-2.449294e-16,-0.050649,-2.449294e-16
75%,22.88,81.4,4.915,323.0,101.3,2017.0,10.0,23.0,17.0,1.0,1.0,0.7071068,0.7071068,0.5,0.5,0.688967,0.7247928
max,39.78,94.8,6.483,1163.0,936.0,2017.0,12.0,31.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9987165


### Testing Decision Trees

In [7]:
# Bundle the per-zone splits into parallel lists so the zones can be looped
# over: list index 0, 1, 2 holds the split made for z1, z2, z3 respectively.
X_train, X_test = [X1_train, X2_train, X3_train], [X1_test, X2_test, X3_test]
y_train, y_test = [y1_train, y2_train, y3_train], [y1_test, y2_test, y3_test]

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [9]:
# Fit one decision tree per zone and report R^2 on that zone's held-out
# test split. Fitted models and their test predictions are kept in
# `models` / `predictions`, in the same order as the input lists.
# Note: the printed zone index is the list index, so "zone 0" is target z1,
# "zone 1" is z2 and "zone 2" is z3.
models = []
predictions = []
for i, (X_tr, X_te, y_tr, y_te) in enumerate(zip(X_train, X_test, y_train, y_test)):
    # Fixed seed: scikit-learn permutes features at every split, so ties
    # between equally good splits can otherwise resolve differently per run.
    model = DecisionTreeRegressor(random_state=42)
    model.fit(X_tr, y_tr)

    prediction = model.predict(X_te)
    score = r2_score(y_te, prediction)

    print(f"------ zone {i}: r2 score: {score}")

    models.append(model)
    predictions.append(prediction)



------ zone 0: r2 score: 0.9662874671628542
------ zone 1: r2 score: 0.9713828947307376
------ zone 2: r2 score: 0.9858790991472575


In [10]:
# Combined target: element-wise sum of the three zone targets, stored as a
# new "total" column on df.
df["total"] = df["z1"] + df["z2"] + df["z3"]

targetT = df["total"]
# Features exclude the per-zone targets and the derived total, otherwise the
# model could reconstruct the target directly from its inputs.
XT = df.drop(columns=["z1", "z2", "z3", "total"])

# Same 80/20 protocol and seed as the per-zone splits.
XT_train, XT_test, yT_train, yT_test = train_test_split(XT, targetT, test_size=0.2, random_state=42)

# Fixed seed so the fitted tree, and therefore the reported score, is
# reproducible across runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(XT_train, yT_train)

prediction = model.predict(XT_test)
score = r2_score(yT_test, prediction)

print(f"------ total: r2 score: {score}")

------ total: r2 score: 0.974651392726841


### Testing Random Forests

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Fit one random forest per zone and report R^2 on that zone's held-out
# test split. Fitted models and their test predictions are kept in
# `models_rf` / `predictions_rf`, in the same order as the input lists.
# Note: the printed zone index is the list index, so "zone 0" is target z1,
# "zone 1" is z2 and "zone 2" is z3.
models_rf = []
predictions_rf = []
for i, (X_tr, X_te, y_tr, y_te) in enumerate(zip(X_train, X_test, y_train, y_test)):
    # Fixed seed: the forest draws bootstrap samples at random, so an
    # unseeded model gives slightly different scores on every run.
    model = RandomForestRegressor(random_state=42)
    model.fit(X_tr, y_tr)

    prediction = model.predict(X_te)
    score = r2_score(y_te, prediction)

    print(f"------ zone {i}: r2 score: {score}")

    models_rf.append(model)
    predictions_rf.append(prediction)

------ zone 0: r2 score: 0.98298048315401
------ zone 1: r2 score: 0.9851269727525681
------ zone 2: r2 score: 0.9925045274490086


In [14]:
# Random forest on the combined ("total") target, scored with R^2 on the
# held-out test split.
# Fixed seed: the forest draws bootstrap samples at random, so an unseeded
# model gives slightly different scores on every run.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(XT_train, yT_train)

prediction_rf = model_rf.predict(XT_test)
score = r2_score(yT_test, prediction_rf)

print(f"------ total: r2 score: {score}")

------ total: r2 score: 0.9869350129047934


# Next:
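- Cross-validate the decision-tree and random-forest models (so far only the XGBoost baseline is cross-validated; the tree models are scored on a single 80/20 split)
- Optimize and fine-tune the models further (none of the models above has been tuned yet)
- Group the data by minute, hour, and day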