In [2]:
import pandas as pd
df = pd.read_csv("wind_farm")
df.head()

Unnamed: 0,Wind speed (m/s),Wind direction (°),Nacelle position (°),Energy Export (kWh),Ambient temperature (°C),Production Factor,Month,Quarter,wind_cat,Blade angle (pitch) (°),Power (kW),Nacelle position in relation to wind direction (°),wind_direction_cat
0,7.343056,192.267168,192.236379,275904.0,4.609568,1.043885,January,1,high wind,357.740358,1277.312961,-0.030789,SW
1,3.92392,248.018972,259.760802,46150.0,2.771605,1.022813,January,1,low wind,2.144051,213.654321,11.74183,SW
2,8.693133,106.338927,106.561752,387574.0,3.12037,0.961292,January,1,very high wind,358.503509,1794.331558,0.222825,SE
3,8.710494,58.830084,58.915881,406504.0,3.043981,1.006229,January,1,very high wind,357.841671,1881.943594,0.085797,NE
4,7.415895,30.593636,30.651821,263468.0,-0.339506,1.007293,January,1,high wind,357.656021,1219.766822,0.058185,NE


# X and y

In [3]:
df.columns

Index(['Wind speed (m/s)', 'Wind direction (°)', 'Nacelle position (°)',
       'Energy Export (kWh)', 'Ambient temperature (°C)', 'Production Factor',
       'Month', 'Quarter', 'wind_cat', 'Blade angle (pitch) (°)', 'Power (kW)',
       'Nacelle position in relation to wind direction (°)',
       'wind_direction_cat'],
      dtype='object')

In [4]:
target_var = 'Energy Export (kWh)'
X = df.drop(columns=[target_var, 'Wind direction (°)','Nacelle position (°)', 'Production Factor', 
                     'Quarter', 'Blade angle (pitch) (°)', 'Power (kW)',
                    'Nacelle position in relation to wind direction (°)',
                    'wind_cat'])
y = df[target_var]

In [5]:
X

Unnamed: 0,Wind speed (m/s),Ambient temperature (°C),Month,wind_direction_cat
0,7.343056,4.609568,January,SW
1,3.923920,2.771605,January,SW
2,8.693133,3.120370,January,SE
3,8.710494,3.043981,January,NE
4,7.415895,-0.339506,January,NE
...,...,...,...,...
1091,12.215123,4.521605,December,SE
1092,8.858356,3.274319,December,SE
1093,7.583410,3.888889,December,NE
1094,6.797454,2.436728,December,NE


# Preprocessing | get dummies

In [6]:
X = pd.get_dummies(X, columns=["Month","wind_direction_cat"])

# Train test split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train with different regressors

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [9]:
regressors = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    SVR(),
    KNeighborsRegressor()]

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [11]:
from sklearn.metrics import r2_score

In [12]:
for algo in regressors:
    
    pipe = make_pipeline(StandardScaler(), algo)
    
    grid = GridSearchCV(pipe, param_grid={}, cv=10, scoring="r2")
    
    grid.fit(X_train, y_train)
    
    y_train_pred = grid.predict(X_train)
    
    print(f"Algo: {algo} | R2-score: {r2_score(y_train, y_train_pred)}")

Algo: LinearRegression() | R2-score: 0.9378755653052471
Algo: DecisionTreeRegressor() | R2-score: 1.0
Algo: RandomForestRegressor() | R2-score: 0.9942225227544579
Algo: SVR() | R2-score: -0.04455748937845749
Algo: KNeighborsRegressor() | R2-score: 0.9066060571538616


# Hyperparameter tuning

- RandomForest
- KNN

In [13]:
# RandomForest
param_grid_rf = {"randomforestregressor__min_samples_leaf":[3],
                "randomforestregressor__max_depth":[25],
                    "randomforestregressor__n_estimators":[500]}

pipe = make_pipeline(StandardScaler(), RandomForestRegressor())
grid_rf = GridSearchCV(pipe, param_grid=param_grid_rf, cv=5, scoring="r2")
grid_rf.fit(X_train, y_train)
y_train_pred = grid_rf.predict(X_train)
print(f"R2 score (train):  {r2_score(y_train, y_train_pred)} | Algo: RandomForest")
print(f"Best params: {grid_rf.best_params_} | Algo: RandomForest")

R2 score (train):  0.9835758483170363 | Algo: RandomForest
Best params: {'randomforestregressor__max_depth': 25, 'randomforestregressor__min_samples_leaf': 3, 'randomforestregressor__n_estimators': 500} | Algo: RandomForest


In [14]:
# KNN

param_grid_knn = {"kneighborsregressor__n_neighbors":[1,2,3, 4, 5, 6, 11],
                "kneighborsregressor__metric":["euclidean", "manhattan"]}

pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
grid_knn = GridSearchCV(pipe, param_grid=param_grid_knn, cv=5, scoring="r2")
grid_knn.fit(X_train, y_train)
y_train_pred_knn = grid_knn.predict(X_train)
print(f"R2 score (train):  {r2_score(y_train, y_train_pred_knn)} | Algo: KNeighborsRegressor")
print(f"Best params: {grid_knn.best_params_} | Algo: KNeighborsRegressor")

R2 score (train):  0.9671341935613972 | Algo: KNeighborsRegressor
Best params: {'kneighborsregressor__metric': 'euclidean', 'kneighborsregressor__n_neighbors': 2} | Algo: KNeighborsRegressor


# Evaluate on test data

In [15]:
from sklearn.metrics import r2_score

In [16]:
# RandomForest

r2_score(y_test, grid_rf.predict(X_test))

0.9582006310819738

In [17]:
# KNN

r2_score(y_test, grid_knn.predict(X_test))

0.8964886830082892

In [18]:
# LinearRegression

pipe = make_pipeline(StandardScaler(), LinearRegression())
    
grid_linreg = GridSearchCV(pipe, param_grid={}, cv=10, scoring="r2")
grid_linreg.fit(X_train, y_train)

r2_score(y_test, grid_linreg.predict(X_test))

0.9412535457157336