In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import statistics
from sklearn.metrics import mean_absolute_error

In [None]:
# Setup rainfall
#
# Builds `rainDf`: one row per day with the mean rainfall over five stations.
# The Kemijärvi file supplies the date columns; the other four stations only
# contribute their rainfall column.

RAIN_COLUMN = 'Sademäärä (mm)'


def load_station_rain(station, extra_columns=(), years=(2020, 2021)):
    """Read one station's yearly rain CSVs and stack them in the given year order.

    Parameters
    ----------
    station : str
        Station part of the file name, e.g. 'Rovaniemi-lentoasema'.
    extra_columns : sequence of str, optional
        Columns to load in addition to the rainfall column.
    years : sequence of int, optional
        One './<year>/rain/<station>-rain-<year>.csv' file is read per year.

    Returns
    -------
    pandas.DataFrame
        The requested columns with a fresh 0..n-1 index.
    """
    wanted = [*extra_columns, RAIN_COLUMN]
    yearly = [
        pd.read_csv(f'./{year}/rain/{station}-rain-{year}.csv', usecols=wanted)
        for year in years
    ]
    return pd.concat(yearly, ignore_index=True)


# Stations averaged together with the base (Kemijärvi) station; the names double
# as file-name stems and as column names in the combined frame.
OTHER_RAIN_STATIONS = [
    'Kokemäki-Rausenkulma',
    'Oulu-Oulunsalo-Pellonpää',
    'Rovaniemi-lentoasema',
    'Ruokolahti-Kotaniemi',
]

rainDf = load_station_rain('Kemijärvi-lentokenttä', extra_columns=['Vuosi', 'Kk', 'Pv'])
# Rename by name rather than by position so the labels cannot end up on the
# wrong columns if the CSV column order ever differs from the expected one.
rainDf = rainDf.rename(columns={
    'Vuosi': 'Year',
    'Kk': 'Month',
    'Pv': 'Date',
    RAIN_COLUMN: 'Rainfall (mm)',
})
# Readings of -1.0 are mapped to 0.0 (presumably the source's marker for a dry
# day -- confirm against the data provider's documentation).
rainDf = rainDf.replace({'Rainfall (mm)': -1.0}, 0.0)

for station in OTHER_RAIN_STATIONS:
    station_rain = load_station_rain(station).replace(-1.0, 0.0)
    station_rain.columns = [station]
    # NOTE(review): stations are aligned purely by row position; this assumes
    # every file holds exactly one row per day with no gaps -- confirm.
    rainDf = rainDf.join(station_rain)

rainDf['Rainfall avg (mm)'] = rainDf[['Rainfall (mm)', *OTHER_RAIN_STATIONS]].mean(axis=1)

rainDf = rainDf[['Year', 'Month', 'Date', 'Rainfall avg (mm)']]
rainDf


In [None]:
# Setup windspeed
#
# Builds `windDf`: one row per day with the mean wind speed over five stations.
# Each station's readings are first averaged per calendar day, then the
# stations are averaged with each other.

WIND_COLUMN = 'Tuulen nopeus (m/s)'
WIND_DATE_KEYS = ['Vuosi', 'Kk', 'Pv']


def load_daily_wind(station, column_name, years=(2020, 2021)):
    """Return one station's wind speed averaged per calendar day.

    Parameters
    ----------
    station : str
        Station part of the file name, e.g. 'Oulu-lentoasema'.
    column_name : str
        Name given to the daily-mean wind speed column in the result.
    years : sequence of int, optional
        One './<year>/wind/<station>-wind-<year>.csv' file is read per year.

    Returns
    -------
    pandas.DataFrame
        Columns 'Vuosi', 'Kk', 'Pv' and `column_name`, one row per day,
        sorted by date (groupby sorts its keys).
    """
    raw = pd.concat(
        [pd.read_csv(f'./{year}/wind/{station}-wind-{year}.csv') for year in years],
        ignore_index=True,
    )
    # Average only the wind column: calling .mean() on the whole grouped frame
    # raises TypeError on pandas >= 2.0 if the CSV carries non-numeric columns
    # (e.g. a time-of-day or time-zone column).
    # NOTE(review): assumes every wind CSV has the 'Vuosi' column, as the
    # Kalajoki file does -- confirm for the other stations.
    daily = raw.groupby(WIND_DATE_KEYS, as_index=False)[WIND_COLUMN].mean()
    return daily.rename(columns={WIND_COLUMN: column_name})


# Stations averaged together with the base (Kalajoki) station; the names double
# as file-name stems and as column names in the combined frame.
OTHER_WIND_STATIONS = [
    'Kemi-Tornio-lentoasema',
    'Oulu-lentoasema',
    'Pori-lentoasema',
    'Vaasa-lentoasema',
]

windDf = load_daily_wind('Kalajoki-Ulkokalla', 'Windspeed (m/s)')
for station in OTHER_WIND_STATIONS:
    # Align on the date keys instead of row position so a station with a
    # missing day cannot shift every later value onto the wrong date.
    windDf = windDf.merge(load_daily_wind(station, station), on=WIND_DATE_KEYS, how='left')

windDf = windDf.rename(columns={'Vuosi': 'Year', 'Kk': 'Month', 'Pv': 'Date'})

# Row-wise mean skips NaN, so a station missing on a given day is simply left
# out of that day's average.
windDf['Windspeed avg (m/s)'] = windDf[['Windspeed (m/s)', *OTHER_WIND_STATIONS]].mean(axis=1)

windDf = windDf[['Year', 'Month', 'Date', 'Windspeed avg (m/s)']]
windDf


In [None]:
# Setup temp
# Stack the 2020 and 2021 Helsinki-Kaisaniemi air temperature readings into a
# single-column frame named 'Temp avg (degC)'.
temp_files = [
    './2020/temp/Helsinki-Kaisaniemi-temp-2020.csv',
    './2021/temp/Helsinki-Kaisaniemi-temp-2021.csv',
]
tempDf1, tempDf2 = (
    pd.read_csv(temp_file, usecols=['Ilman lämpötila (degC)'])
    for temp_file in temp_files
)
tempDf = pd.concat([tempDf1, tempDf2], ignore_index=True).rename(
    columns={'Ilman lämpötila (degC)': 'Temp avg (degC)'}
)
tempDf

In [None]:
# Setup price
#
# Builds `priceDf`: the FI Elspot price of the *following* day for every day of
# 2020 and 2021, kept as strings with '.' as the decimal separator (conversion
# to numbers happens where the frames are combined).
priceDf1 = pd.read_csv('./2020/price/elspot-prices_2020_daily_eur.csv', usecols=['FI'])
priceDf2 = pd.read_csv('./2021/price/elspot-prices_2021_daily_eur.csv', usecols=['FI'])


def shift_to_following_day(prices, next_year_first_price):
    """Drop the first row and append `next_year_first_price` after the last one.

    Row i of the result therefore holds the price of day i + 1. The new value
    is appended explicitly instead of being written to a hard-coded index
    label, so it cannot overwrite an existing row when a file has a different
    number of rows than expected.
    """
    appended = pd.DataFrame({'FI': [next_year_first_price]})
    return pd.concat([prices.iloc[1:], appended], ignore_index=True)


# Manually supplied prices for the day after each file's last row
# (presumably 1 Jan 2021 and 1 Jan 2022 respectively -- confirm the values).
priceDf1 = shift_to_following_day(priceDf1, '26,25')
priceDf2 = shift_to_following_day(priceDf2, '82,02')

priceDf = pd.concat([priceDf1, priceDf2], ignore_index=True)

# The source uses a decimal comma; swap it for a dot as a literal replacement
# (no regex) so the strings can later be parsed with pd.to_numeric.
priceDf = priceDf.assign(FI=priceDf['FI'].str.replace(',', '.', regex=False))
priceDf = priceDf.rename(columns={'FI': 'Following day Elspot price'})
priceDf

In [None]:
# The code below is adapted from the course exercises and examples.

In [None]:
# Combine the weather features and the target into one frame, aligned on the
# row index (left joins onto the rainfall frame), then parse the price strings.
price_column = 'Following day Elspot price'

df = rainDf
for source_frame, source_column in (
    (windDf, 'Windspeed avg (m/s)'),
    (tempDf, 'Temp avg (degC)'),
    (priceDf, price_column),
):
    df = df.join(source_frame[source_column])

df[price_column] = pd.to_numeric(df[price_column])
df

In [None]:
# Feature matrix (three weather columns) and target vector (next-day price).
feature_columns = ['Rainfall avg (mm)', 'Windspeed avg (m/s)', 'Temp avg (degC)']
X = df[feature_columns].to_numpy().reshape(-1, 3)
y = df['Following day Elspot price'].to_numpy()

# First split: `train_size=0.1` makes the *first* returned part the 10 % test
# set; the remaining 90 % is kept for training and validation.
X_test, X_rem, y_test, y_rem = train_test_split(
    X, y, train_size=0.1, random_state=44
)

# Second split: 80 % of the remainder for training, 20 % for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_rem, y_rem, train_size=0.8, random_state=44
)

In [None]:
# Huber
# Fit a Huber regressor on the training split and report the mean absolute
# error on the training and validation splits (one value per line).

huber_regr = HuberRegressor(epsilon=1.0).fit(X_train, y_train)

y_pred_train, y_pred_val = (
    huber_regr.predict(features) for features in (X_train, X_val)
)
tr_error = mean_absolute_error(y_train, y_pred_train)
val_error = mean_absolute_error(y_val, y_pred_val)

print(tr_error, val_error, sep='\n')

In [None]:
# Huber
# NOTE(review): this cell repeats the preceding Huber cell; it refits the same
# model on the same split and prints the same two errors.

huber_regr = HuberRegressor(epsilon=1.0)
huber_regr.fit(X_train, y_train)

# Training-split error
y_pred_train = huber_regr.predict(X_train)
tr_error = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

# Validation-split error
y_pred_val = huber_regr.predict(X_val)
val_error = mean_absolute_error(y_true=y_val, y_pred=y_pred_val)

for reported_error in (tr_error, val_error):
    print(reported_error)

In [None]:
# Cross-validation splitter: 3 shuffled folds with a fixed seed so the fold
# assignment is reproducible between runs.
k = 3
shuffle = True
seed = 44
kfold = KFold(n_splits=k, shuffle=shuffle, random_state=seed)

In [None]:
# Polynomial regression model selection with k-fold cross-validation.
#
# For every polynomial degree, a linear model is fitted on each fold of the
# non-test data (`X_rem`, `y_rem`). `tr_errors[degree]` / `val_errors[degree]`
# hold the per-fold mean absolute errors followed by their mean as the LAST
# element.
degrees = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

tr_errors = {}
val_errors = {}

for degree in degrees:
    fold_tr_errors = []
    fold_val_errors = []

    for fold_train_idx, fold_val_idx in kfold.split(X_rem):
        # Fold-local names on purpose: reusing X_train / y_train / X_val / y_val
        # (and tr_error / val_error) here would overwrite the hold-out split and
        # the Huber metrics from the earlier cells with the last fold's values.
        X_fold_train, y_fold_train = X_rem[fold_train_idx], y_rem[fold_train_idx]
        X_fold_val, y_fold_val = X_rem[fold_val_idx], y_rem[fold_val_idx]

        poly = PolynomialFeatures(degree=degree)
        lin_regr = LinearRegression()

        # The feature expansion is fitted on the fold's training part only and
        # then re-applied to the fold's validation part.
        X_fold_train_poly = poly.fit_transform(X_fold_train)
        lin_regr.fit(X_fold_train_poly, y_fold_train)
        X_fold_val_poly = poly.transform(X_fold_val)

        fold_tr_errors.append(
            mean_absolute_error(y_fold_train, lin_regr.predict(X_fold_train_poly))
        )
        fold_val_errors.append(
            mean_absolute_error(y_fold_val, lin_regr.predict(X_fold_val_poly))
        )

    # Same layout as before: k per-fold errors, then their mean appended.
    tr_errors[degree] = fold_tr_errors + [statistics.mean(fold_tr_errors)]
    val_errors[degree] = fold_val_errors + [statistics.mean(fold_val_errors)]
val_errors

In [None]:
# Huber testing
# Mean absolute error of the already-fitted Huber model on the held-out test set.

y_pred_test = huber_regr.predict(X_test)
test_error = mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

print(test_error)