In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
def training_data_split(X_observed_clean, X_estimated_clean_mean):
    """Split the estimated data 75% / 15% / 10% and prepend the observed data to the training part.

    Args:
        X_observed_clean: observed rows; all of them go into the training set.
        X_estimated_clean_mean: estimated rows; sliced by position into
            train (first 3/4), validation (3/4 up to 9/10) and test (last 1/10).

    Returns:
        Tuple ``(X_train, X_valid, X_test)`` where ``X_train`` is the observed
        rows followed by the training slice of the estimated rows.
    """
    n_estimated = X_estimated_clean_mean.shape[0]
    train_end = int(n_estimated * 3 / 4)
    valid_end = int(n_estimated * 9 / 10)

    estimated_train_part = X_estimated_clean_mean[:train_end]
    X_valid = X_estimated_clean_mean[train_end:valid_end]
    X_test = X_estimated_clean_mean[valid_end:]

    X_train = pd.concat([X_observed_clean, estimated_train_part])
    return X_train, X_valid, X_test

# A function which averages every group of four consecutive rows into one row and stamps it with the
# date_forecast of the first row of the group, so that there is one row per hour (assuming four input rows per hour).
# TODO: Should be swapped for Gustav's code!
def mean_df(df):
    """Average every block of four consecutive rows of ``df`` into one row.

    Rows are grouped purely by position (rows 0-3, 4-7, ...), not by the value
    of the date column. A trailing block with fewer than four rows is averaged
    over the rows it has.

    Args:
        df: DataFrame containing a ``'date_forecast'`` column plus the columns
            to average. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the averaged columns first
        and ``'date_forecast'`` last, holding the date of the first row of
        each block.
    """
    # One integer label per block of four rows: 0, 0, 0, 0, 1, 1, 1, 1, ...
    block_ids = np.arange(len(df)) // 4

    value_columns = df.drop(columns=['date_forecast'])
    block_means = value_columns.groupby(block_ids).mean().reset_index(drop=True)

    # Every block is stamped with the date of its first row.
    block_means['date_forecast'] = df['date_forecast'].iloc[::4].values
    return block_means

#Removes all features from a df except selected_features
def clean_df(df, selected_features):
    """Return ``df`` restricted to the columns named in ``selected_features``.

    The columns come back in the order given by ``selected_features``; the
    input frame is left untouched.
    """
    kept_columns = df[selected_features]
    return kept_columns

# Scales all the feature values so that they take a similar range
def scale_df(df):
    """Standardise ``df`` with a freshly fitted ``StandardScaler``.

    Returns whatever ``StandardScaler.fit_transform`` returns for ``df``.
    The fitted scaler is discarded, so the same scaling cannot be re-applied
    to other data afterwards.
    """
    return StandardScaler().fit_transform(df)

# Function which resizes the training data so that only the rows whose weather date and time also appear in the targets are kept.
# X_train is either observed or forecasted weather and y_train is how much energy is produced.
# y_features is the list of the column names of y_train (derived inside the function).
# X_date_feature is the name of the column in which the date and time of the weather is saved. This will probably always be "date_forecast", but it may be changed.
def resize_training_data(X_train, y_train):
    """Keep only the rows whose timestamp occurs in both the features and the targets.

    The two frames are inner-joined on ``X_train['date_forecast']`` ==
    ``y_train['time']``.

    Args:
        X_train: feature frame with a ``'date_forecast'`` column.
        y_train: target frame with a ``'time'`` column.

    Returns:
        Tuple ``(X_train_resized, y_train_resized)`` with aligned rows:
        the joined frame without the ``y_train`` columns and without
        ``'date_forecast'``, and the joined frame restricted to the
        ``y_train`` columns.
    """
    date_column = "date_forecast"
    target_columns = list(y_train.columns)

    joined = X_train.merge(y_train, left_on=date_column, right_on='time', how='inner')

    y_train_resized = joined[target_columns]
    X_train_resized = joined.drop(columns=target_columns + [date_column])
    return X_train_resized, y_train_resized

# Creating sequences
def create_sequences(data, target_column, window_size):
    """Build sliding feature windows and the target that follows each window.

    Window ``i`` holds the feature rows ``i .. i + window_size - 1`` (every
    column except ``target_column``); its target is the value of
    ``target_column`` in row ``i + window_size``, i.e. the row immediately
    after the window.

    Args:
        data: DataFrame holding the feature columns and ``target_column``.
            It is not modified.
        target_column: name of the column to predict.
        window_size: number of consecutive rows per window.

    Returns:
        Tuple ``(sequences, targets)`` of NumPy arrays. ``sequences`` has
        shape ``(len(data) - window_size, window_size, n_features)`` and
        ``targets`` has one entry per sequence. Both are empty when ``data``
        has no more than ``window_size`` rows.
    """
    # Drop the target column once up front instead of once per window.
    features = data.drop(columns=[target_column]).values
    target_values = data[target_column].values

    n_sequences = max(len(data) - window_size, 0)
    sequences = [features[start:start + window_size] for start in range(n_sequences)]
    targets = target_values[window_size:window_size + n_sequences]
    return np.array(sequences), np.array(targets)

def fillna_mean_above_below(series):
    """
    Fill isolated NaNs in a pandas Series with the mean of their two neighbours.

    Every NaN that is neither the first nor the last element is replaced by
    the mean of the values immediately above and below it. If either
    neighbour is itself NaN the element stays NaN, so runs of consecutive
    NaNs are left unfilled.

    Neighbours are found by position, so the function works for any index
    (for example a DatetimeIndex or an index with gaps), not only 0..n-1.

    Args:
        series: the Series to fill. It is modified in place.

    Returns:
        The same ``series`` object, after filling.
    """
    # Mean of previous and next value at every position. The ends have only
    # one neighbour, so shift() yields NaN there and they are never filled.
    neighbour_mean = (series.shift(1) + series.shift(-1)) / 2

    gaps = series.isna().to_numpy()
    if gaps.any():
        series.iloc[gaps] = neighbour_mean.to_numpy()[gaps]
    return series

In [None]:
# Load the targets and the three weather-feature tables (observed, estimated and the
# estimated test set) from the "C" folder on the mounted Google Drive.
y = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/ML/C/train_targets.parquet')
X_estimated= pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/ML/C/X_train_estimated.parquet')
X_observed = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/ML/C/X_train_observed.parquet')
X_test_real = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/ML/C/X_test_estimated.parquet')
# NOTE(review): this list is replaced in the next cell by a version that also contains "date_forecast".
selected_features = ["direct_rad:W", "clear_sky_rad:W" , "diffuse_rad:W","is_in_shadow:idx", "relative_humidity_1000hPa:p"]

In [None]:
# Columns kept from the weather tables; "date_forecast" is needed by mean_df and by the
# join with the targets in resize_training_data.
selected_features = ["date_forecast", "direct_rad:W", "clear_sky_rad:W" , "diffuse_rad:W","is_in_shadow:idx", "relative_humidity_1000hPa:p"]

# Drop every target row that contains a missing value.
y = y.dropna()

# Reduce each weather table to the selected columns, then average every four rows.
X_estimated_clean = clean_df(X_estimated, selected_features)
X_observed_clean = clean_df(X_observed, selected_features)
X_test_real_clean = clean_df(X_test_real, selected_features)
X_observed_clean_mean = mean_df(X_observed_clean)
X_estimated_clean_mean = mean_df(X_estimated_clean)
X_test_real_clean_mean = mean_df(X_test_real_clean)

# Training features: observed rows followed by estimated rows.
X_train = pd.concat([X_observed_clean_mean, X_estimated_clean_mean])

# Keep only the rows whose timestamp exists in both the features and the targets.
X_train, y_train = resize_training_data(X_train, y)

In [None]:
# Merging features and targets into a new frame. assign() returns a copy, so the
# target column is not silently added to X_train itself (df = X_train only aliased it).
df = X_train.assign(pv_measurement=y_train['pv_measurement'].values)

# Disabled experiments, kept for reference:
#df = df.apply(fillna_mean_above_below)
# Normalizing
#scaler = MinMaxScaler()
#df[df.columns] = scaler.fit_transform(df[df.columns])

In [None]:
window_size = 24  # for example, one day if data is hourly
# Turn the merged frame into (window, next-row target) pairs.
# NOTE(review): this rebinds `y` (previously the targets DataFrame) to a NumPy array,
# so the earlier preprocessing cell cannot be re-run without reloading the data.
X, y = create_sequences(df, 'pv_measurement', window_size)

# Splitting the data
# shuffle=False keeps the row order: the first 80% of the windows are used for
# training and the last 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

# Shape of one input sample: (number_of_timesteps, number_of_features).
number_of_timesteps = 24
number_of_features = 5

# Three stacked SimpleRNN layers of 50 units each, followed by a single-unit
# output layer. return_sequences=True is set on every recurrent layer that
# feeds another recurrent layer; the last recurrent layer returns only its
# final state.
model = Sequential([
    SimpleRNN(units=50, return_sequences=True, input_shape=(number_of_timesteps, number_of_features)),
    SimpleRNN(units=50, return_sequences=True),
    SimpleRNN(units=50),
    Dense(units=1),
])


In [None]:
# Train with the Adam optimizer and mean absolute error as the loss.
model.compile(optimizer='adam', loss='mean_absolute_error')

In [None]:
# Fit on the training split for 20 epochs with mini-batches of 32 windows.
model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7b42b0c3c430>

In [None]:
# Loss on the held-out split; the model was compiled with mean absolute error as its loss.
model.evaluate(X_test, y_test)



29.04537582397461

In [None]:
# Predictions for the held-out split; they are thresholded and scored in a later cell.
pred = model.predict(X_test)



In [None]:
# Persist the trained model as "RNN_C"; it is loaded again further down with load_model("RNN_C").
model.save("RNN_C")

In [None]:
from sklearn.metrics import mean_absolute_error

# Predictions below 5 are set to zero before scoring (modifies `pred` in place).
# NOTE(review): presumably this suppresses small spurious outputs when there is no
# production - confirm the threshold.
pred[pred < 5] = 0
mae = mean_absolute_error(y_test, pred)
# This notebook trains and evaluates the model for location C (data under .../ML/C/,
# saved as "RNN_C"), so the result is labelled "C" rather than "A".
print("C: Mean Absolute Error:", mae)

A: Mean Absolute Error: 29.051217973712774


In [171]:
from tensorflow.keras.models import load_model
# Load the saved models. Only "RNN_C" is written by this notebook; "RNN" and "RNN_B"
# are presumably produced by sibling notebooks - confirm they exist before running.
# NOTE(review): in the cells visible here only model_c is used afterwards.
model_a = load_model("RNN")
model_b = load_model("RNN_B")
model_c = load_model("RNN_C")

In [175]:
# The test features were already loaded, reduced to the selected columns and averaged
# earlier as X_test_real_clean_mean. Reuse that frame instead of re-reading the same
# parquet file into X_test: the reloaded frame was never used, and it overwrote the
# X_test array produced by the train/test split.
# The date column is dropped so that only the model's input features remain.
X_to_predict = X_test_real_clean_mean.drop(columns=["date_forecast"])

In [176]:
def create_sequences(df, sequence_length=24):
    """Build one fixed-length input window per row of ``df`` for inference.

    The window for row ``r`` ends at row ``r`` and contains the
    ``sequence_length - 1`` rows before it. Rows near the start do not have
    enough history, so the frame is padded at the front with copies of its
    first row.

    NOTE: this redefines ``create_sequences`` from the earlier training cell
    (which takes ``data, target_column, window_size``); after this cell has
    run, the training cell can no longer call the original version.

    Args:
        df: DataFrame whose columns are exactly the model's input features.
        sequence_length: number of rows per window (default 24).

    Returns:
        NumPy array of shape ``(len(df), sequence_length, n_columns)``;
        it has zero windows when ``df`` has no rows.

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    values = df.values
    n_rows, n_columns = values.shape
    if n_rows == 0:
        # Nothing to pad from; return an empty batch with the right trailing shape.
        return np.empty((0, sequence_length, n_columns), dtype=values.dtype)

    # Pad the start with (sequence_length - 1) copies of the first row so that
    # every original row can be the end of a full-length window.
    padding = np.repeat(values[:1], sequence_length - 1, axis=0)
    padded = np.concatenate([padding, values], axis=0)

    # Row r of the index matrix is [r, r + 1, ..., r + sequence_length - 1],
    # i.e. the padded positions of the window that ends at original row r.
    window_index = np.arange(n_rows)[:, None] + np.arange(sequence_length)[None, :]
    return padded[window_index]

# Build one input window per row of X_to_predict.
sequences = create_sequences(X_to_predict)


In [177]:
# Predict all windows in one batched call instead of one predict() call per window.
# The extra middle axis keeps the (n_sequences, 1, 1) layout that the per-window
# loop produced, so the array saved below has the same shape as before.
predictions = model_c.predict(sequences)[:, np.newaxis, :]



In [178]:
# Write the model C predictions to X_test_C.npy in the current working directory.
np.save("X_test_C.npy", predictions)