In [164]:
import pandas as pd
import numpy as np
from keras.src.layers import GRU, Dropout, Dense
from keras.src.optimizers import Adam
from keras import Sequential, Input
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
from matplotlib import pyplot as plt

In [165]:
# Read the four processed production CSVs. Each frame stays reachable both by
# its own name (df_1 .. df_4) and through the `dfs` list iterated further down.
csv_paths = (
    "../data/processed/production_cross_data.csv",
    "../data/processed/production_fossil_data.csv",
    "../data/processed/production_hydro_data.csv",
    "../data/processed/production_nuclear_data.csv",
)

dfs = [pd.read_csv(csv_path) for csv_path in csv_paths]
df_1, df_2, df_3, df_4 = dfs

In [166]:
# Preview the first rows of the frame read from production_cross_data.csv.
df_1.head()

In [167]:
# Preview the first rows of the frame read from production_fossil_data.csv.
df_2.head()

In [168]:
# Preview the first rows of the frame read from production_hydro_data.csv.
df_3.head()

In [169]:
# Preview the first rows of the frame read from production_nuclear_data.csv.
df_4.head()

In [170]:
# Count missing values per column in df_1 (checked before the gap-filling cell below).
df_1.isnull().sum()

In [171]:
# Count missing values per column in df_2 (checked before the gap-filling cell below).
df_2.isnull().sum()

In [172]:
# Count missing values per column in df_3 (checked before the gap-filling cell below).
df_3.isnull().sum()

In [173]:
# Count missing values per column in df_4 (checked before the gap-filling cell below).
df_4.isnull().sum()

In [174]:
# Fill gaps in every non-date column of each frame, in place.
# bfill() copies the next valid observation backwards, so it cannot fill NaNs
# that sit after the last valid row of a column. ffill() is chained as a
# fallback so no NaN reaches mutual_info_regression or the scaler downstream.
for df in dfs:
    cols = df.columns.tolist()
    cols.remove("date")  # raises ValueError if a frame has no 'date' column
    df[cols] = df[cols].bfill().ffill()

In [175]:
def create_time_series(data, n_past):
    """Turn a 2-D array into sliding-window samples for sequence models.

    Args:
        data: 2-D NumPy array (rows are time steps); column 0 is used as the target.
        n_past: number of consecutive rows in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X[k]`` holds rows ``k .. k + n_past - 1``
        with all columns, and ``y[k]`` is column 0 of row ``k + n_past``.
        Both arrays are empty when ``len(data) <= n_past``.
    """
    target_rows = range(n_past, len(data))
    windows = [data[row - n_past:row, :] for row in target_rows]
    targets = [data[row, 0] for row in target_rows]
    return np.array(windows), np.array(targets)

In [176]:
def create_model(train_input_shape):
    """Build and compile the stacked GRU regressor.

    Args:
        train_input_shape: despite the name, this is the training input array
            itself; its ``.shape[1]`` and ``.shape[2]`` define the model input shape.

    Returns:
        A compiled ``Sequential`` model named 'GRU' with a single output unit,
        using Adam (learning rate 0.01) and mean squared error loss.
    """
    timesteps = train_input_shape.shape[1]
    n_features = train_input_shape.shape[2]

    # Layers are listed in the order they are stacked.
    layer_stack = [
        Input(shape=(timesteps, n_features)),
        GRU(128, return_sequences=True),
        Dropout(0.2),
        GRU(64, return_sequences=True),
        Dropout(0.2),
        GRU(32),  # last recurrent layer returns only its final state
        Dense(units=32, activation="relu"),
        Dense(1),
    ]

    model = Sequential(name='GRU')
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer=Adam(learning_rate=0.01), loss="mean_squared_error")
    return model

In [177]:
# Number of consecutive past rows in each model input window; passed as
# `n_past` to create_time_series() inside execute_training().
window_size = 24

In [178]:
def select_input_data(df):
    """Split a frame's column names into candidate inputs and the target.

    Args:
        df: DataFrame that contains both a 'production' and a 'date' column.

    Returns:
        Tuple ``(input_data, output_data)``: the list of all column names except
        'production' and 'date' (original order kept), and the target name.

    Raises:
        ValueError: if 'production' or 'date' is not among the columns.
    """
    output_data = 'production'
    input_data = df.columns.tolist()

    # list.remove is used on purpose: it fails loudly when a column is missing.
    for excluded in (output_data, 'date'):
        input_data.remove(excluded)

    return input_data, output_data

In [179]:
def mutual_regression_score(df, input_data, output_data, random_state=42):
    """Rank candidate features by mutual information with the target.

    Args:
        df: DataFrame holding the feature and target columns.
        input_data: list of feature column names to score.
        output_data: name of the target column.
        random_state: seed forwarded to ``mutual_info_regression``. The
            estimator is randomised, so without a fixed seed the ranking (and
            therefore the features picked downstream) can change between runs.

    Returns:
        DataFrame with columns 'Feature' and 'Information_Gain', sorted by
        score in descending order with a fresh 0..n-1 index.
    """
    ig_scores = mutual_info_regression(
        df[input_data], df[output_data], random_state=random_state
    )

    feature_scores = (
        pd.DataFrame({'Feature': df[input_data].columns, 'Information_Gain': ig_scores})
        .sort_values(by='Information_Gain', ascending=False)
        .reset_index(drop=True)
    )
    print(feature_scores.head())

    return feature_scores

In [180]:
def select_features(output_data, feature_scores, data_frame=None, n_features=3):
    """Build the modelling matrix from the target plus the top-ranked features.

    Args:
        output_data: name of the target column; it becomes column 0 of the result.
        feature_scores: DataFrame with a 'Feature' column ordered from most to
            least informative.
        data_frame: DataFrame to take the columns from. When omitted, the
            module-level ``df`` is used so existing two-argument calls keep
            working; passing the frame explicitly avoids that hidden dependency.
        n_features: how many of the top-ranked features to keep (default 3).

    Returns:
        Tuple ``(dataset, input_columns)``: the selected columns as a NumPy
        array and the list of their names, target first.
    """
    if data_frame is None:
        # Backward-compatible fallback to the notebook's global loop variable.
        data_frame = df

    top_features = feature_scores['Feature'].head(n_features).tolist()
    input_columns = [output_data, *top_features]

    filtered_df = data_frame[input_columns]
    print(filtered_df.head())

    dataset = filtered_df.values
    return dataset, input_columns

In [181]:
def shape_data(dataset, test_fraction=0.2):
    """Split a dataset chronologically into a leading train part and a trailing test part.

    Args:
        dataset: array with a ``.shape`` attribute and slice support (rows are
            samples in time order).
        test_fraction: share of rows placed in the test part (default 0.2).

    Returns:
        Tuple ``(train_data, test_data, test_data_size)``.
    """
    n_rows = len(dataset)
    test_data_size = round(test_fraction * n_rows)

    # Split on an explicit index. Slicing with -test_data_size breaks when the
    # size rounds to 0: dataset[:-0] is empty and dataset[-0:] is everything,
    # which would silently swap the train and test parts.
    split_index = n_rows - test_data_size
    train_data = dataset[:split_index]
    test_data = dataset[split_index:]

    print("Train data shape:", train_data.shape)
    print("Test data shape:", test_data.shape)

    return train_data, test_data, test_data_size

In [182]:
def execute_training(train_data, test_data, input_columns):
    """Scale the data, train the GRU on sliding windows and predict the test part.

    Args:
        train_data: 2-D array of training rows; column 0 is the target.
        test_data: 2-D array of test rows with the same columns.
        input_columns: names of the columns in the arrays; only its length is used.

    Returns:
        Tuple ``(pred, actual, hist)``: predictions and true targets for the
        test windows, both mapped back to the original scale, and the Keras
        training history.
    """
    n_columns = len(input_columns)

    # The scaler is fitted on the training rows only and reused for the test rows.
    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(train_data)
    scaled_test = scaler.transform(test_data)

    X_train, y_train = create_time_series(scaled_train, window_size)
    X_test, y_test = create_time_series(scaled_test, window_size)

    print("Shape of X_train:", X_train.shape)
    print("Shape of X_test:", X_test.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of y_test:", y_test.shape)

    model = create_model(X_train)

    # NOTE(review): the test windows double as validation data during fitting.
    hist = model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=64,
        validation_data=(X_test, y_test),
        verbose=1,
    )

    predicted = model.predict(X_test, verbose=0)

    # The scaler was fitted on n_columns columns, so inverse_transform expects
    # that width: repeat the single target value across all columns, invert,
    # and keep column 0 (the target).
    predicted_wide = np.repeat(predicted, n_columns, axis=-1)
    print(predicted_wide.shape)
    pred = scaler.inverse_transform(predicted_wide.reshape(len(predicted), n_columns))[:, 0]

    actual_wide = np.repeat(y_test, n_columns, axis=-1).reshape(len(y_test), n_columns)
    actual = scaler.inverse_transform(actual_wide)[:, 0]

    return pred, actual, hist

In [183]:
def evaluate_model_performance(actual, pred, test_data_size, hist, dates=None):
    """Print error metrics, tabulate predictions against dates and draw the plots.

    Args:
        actual: true target values for the evaluated samples.
        pred: predicted values, same length as ``actual``.
        test_data_size: number of rows in the test split. Kept for backward
            compatibility; the date slice is derived from ``len(actual)``,
            which gives the same rows whenever the test split is longer than
            the window and cannot go wrong on a zero-length negative slice.
        hist: Keras training history passed on to ``loss_line_plot``.
        dates: full sequence of dates for the frame being evaluated; its last
            ``len(actual)`` entries label the samples. When omitted, the
            module-level ``df['date']`` is used so existing calls keep working.
    """
    mse = mean_squared_error(actual, pred)
    mae = mean_absolute_error(actual, pred)
    evs = explained_variance_score(actual, pred)

    print(f'MSE: {mse:.2f}')
    print(f'MAE: {mae:.2f}')
    print(f'EVS: {evs:.2f}')

    if dates is None:
        # Backward-compatible fallback to the notebook's global loop variable.
        dates = df['date'].values

    # The evaluated samples are the trailing rows of the frame, one per target.
    n_evaluated = len(actual)
    output_df = pd.DataFrame({
        'Date': dates[len(dates) - n_evaluated:],
        'True': actual,
        'Predicted': pred
    })
    print(output_df.head(10))

    line_plot(output_df.head(50))
    loss_line_plot(hist)

In [184]:
def line_plot(plot_out_df, ylabel='Production'):
    """Plot true against predicted values over time.

    Args:
        plot_out_df: DataFrame with 'Date', 'True' and 'Predicted' columns.
        ylabel: label for the y axis. Defaults to 'Production' because the
            plotted series is the 'production' target; the former hard-coded
            'Price' label did not match the data.
    """
    plt.figure(figsize=(12, 8))
    plt.plot(plot_out_df['Date'], plot_out_df['True'], label='True')
    plt.plot(plot_out_df['Date'], plot_out_df['Predicted'], label='Predicted')
    plt.xlabel('Time')
    plt.ylabel(ylabel)
    plt.title('Actual vs Predicted values')
    plt.legend()
    plt.xticks(rotation=90)  # vertical tick labels so the dates do not overlap
    plt.tight_layout()
    plt.show()

In [185]:
def loss_line_plot(hist):
    """Plot training and validation loss per epoch.

    Args:
        hist: object whose ``history`` mapping holds 'loss' and 'val_loss'
            sequences (as returned by ``model.fit`` with validation data).
    """
    history = hist.history

    plt.figure(figsize=(16, 6))
    # Plot order matters: the legend entries below are matched by position.
    for metric in ('loss', 'val_loss'):
        plt.plot(history[metric])
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [186]:
# Run the full pipeline once per dataset: rank features, keep the target plus
# the top-ranked ones, split chronologically, train, then report and plot.
# NOTE: the loop variable must stay named `df`. select_features() and
# evaluate_model_performance() are called here without a frame and look up
# the module-level `df` to get the current dataset.
for df in dfs:
    input_data, output_data = select_input_data(df)
    feature_scores = mutual_regression_score(df, input_data, output_data)
    dataset, input_columns = select_features(output_data, feature_scores)
    train_data, test_data, test_data_size = shape_data(dataset)
    pred, actual, hist = execute_training(train_data, test_data, input_columns)
    evaluate_model_performance(actual, pred, test_data_size, hist)