In [31]:
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [32]:
def training_data_split(X_observed_clean, X_estimated_clean_mean):
    """Build the train / validation / test feature frames.

    ``X_estimated_clean_mean`` is cut by row position: the first 3/4 of its
    rows are appended to ``X_observed_clean`` to form the training frame,
    the rows from 3/4 up to 9/10 become the validation frame, and the
    remaining rows become the test frame.

    Returns:
        tuple: ``(X_train, X_valid, X_test)``.
    """
    n_estimated = X_estimated_clean_mean.shape[0]
    train_stop = int(n_estimated * 3 / 4)
    valid_stop = int(n_estimated * 9 / 10)

    estimated_train_part = X_estimated_clean_mean[:train_stop]
    validation_part = X_estimated_clean_mean[train_stop:valid_stop]
    test_part = X_estimated_clean_mean[valid_stop:]

    # Observed rows come first, followed by the leading estimated rows.
    training_part = pd.concat([X_observed_clean, estimated_train_part])
    return training_part, validation_part, test_part

#Averages every group of 4 consecutive rows and stamps the result with the date_forecast of the first row in the group, so 4 rows per hour become one row per hour.
#TODO: Should be swapped for Gustav's code!
def mean_df(df, date_column="date_forecast", group_size=4):
    """Average consecutive blocks of rows and keep each block's first timestamp.

    Rows are grouped purely by position: rows ``0..group_size-1`` form the
    first block, the next ``group_size`` rows the second, and so on (a final
    shorter block is averaged over the rows it has). Every column except
    ``date_column`` is replaced by its block mean; ``date_column`` is taken
    from the first row of each block and appended as the last column.

    Args:
        df: Input frame. It is not modified.
        date_column: Name of the timestamp column that is carried over
            instead of being averaged.
        group_size: Number of consecutive rows per block (must be >= 1).

    Returns:
        pandas.DataFrame: One row per block, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    # Positional block id for every row: 0,0,0,0,1,1,1,1,... for group_size=4.
    block_ids = np.arange(len(df)) // group_size

    # Timestamp of the first row in each block.
    block_dates = df[date_column].iloc[::group_size]

    # drop() already returns a new frame, so the caller's data is left untouched.
    averaged_data = (
        df.drop(columns=[date_column])
        .groupby(block_ids)
        .mean()
        .reset_index(drop=True)
    )

    # .values sidesteps index alignment: the dates keep their original row labels.
    averaged_data[date_column] = block_dates.values
    return averaged_data

#Removes all features from a df except selected_features
def clean_df(df, selected_features):
    """Return ``df`` restricted to the columns named in ``selected_features``.

    The columns appear in the order given by ``selected_features``.
    """
    kept_columns = df[selected_features]
    return kept_columns

#Scales all the feature values so that they take a similar range
def scale_df(df, scaler=None):
    """Standardize the features in ``df``.

    Args:
        df: 2-D feature data accepted by the scaler.
        scaler: Optional, already fitted scaler exposing ``transform``.
            Pass the scaler that was fitted on the training features when
            scaling validation / test / submission features, so every split
            is scaled with the same statistics. When omitted, a new
            ``StandardScaler`` is fitted on ``df`` itself.

    Returns:
        The transformed data as returned by the scaler (with a default
        ``StandardScaler`` this is a NumPy array, not a DataFrame).
    """
    if scaler is None:
        # No shared scaler supplied: fit on this data and transform it in one go.
        return StandardScaler().fit_transform(df)
    # Reuse the caller's statistics instead of re-fitting on this split.
    return scaler.transform(df)

#Function which resizes the training data such that only the rows whose weather timestamp matches a target timestamp are kept.
#X_train is either observed or forecasted weather and y_train is how much energy is produced.
#Every column of y_train is returned in the resized target frame and removed from the resized feature frame.
#The weather timestamp is read from the "date_forecast" column and matched against the "time" column of y_train; "date_forecast" is dropped from the returned features.
def resize_training_data(X_train, y_train, X_date_feature="date_forecast", y_date_feature="time"):
    """Keep only the feature rows and target rows that share a timestamp.

    The two frames are inner-joined on ``X_train[X_date_feature]`` ==
    ``y_train[y_date_feature]``, so rows without a partner on the other side
    are discarded and both outputs have the same number of rows in the same
    order.

    Args:
        X_train: Weather features including the ``X_date_feature`` column.
        y_train: Targets including the ``y_date_feature`` column.
        X_date_feature: Timestamp column name in ``X_train``.
        y_date_feature: Timestamp column name in ``y_train``.

    Returns:
        tuple: ``(X_train_resized, y_train_resized)`` where the first holds
        the merged rows without any ``y_train`` column and without
        ``X_date_feature``, and the second holds exactly the ``y_train``
        columns.
    """
    y_features = y_train.columns.tolist()

    merged = pd.merge(
        X_train,
        y_train,
        left_on=X_date_feature,
        right_on=y_date_feature,
        how='inner',
    )

    y_train_resized = merged[y_features]
    # Remove the targets and the timestamp so only model inputs remain.
    X_train_resized = merged.drop(columns=y_features + [X_date_feature])
    return X_train_resized, y_train_resized

In [33]:
# Timestamp column plus the weather columns kept by clean_df.
selected_features = [
    "date_forecast",
    "direct_rad:W",
    "clear_sky_rad:W",
    "diffuse_rad:W",
    "is_in_shadow:idx",
    "relative_humidity_1000hPa:p",
]

In [34]:
# The parquet files live on Google Drive, which has to be mounted before
# anything under /content/drive is readable on a fresh runtime. Calling mount
# again on an already mounted drive just reports that and returns.
drive.mount('/content/drive')

# Single place to change when the data for another location / folder is used.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/ML/C'

y = pd.read_parquet(f'{DATA_DIR}/train_targets.parquet')
X_estimated = pd.read_parquet(f'{DATA_DIR}/X_train_estimated.parquet')
X_observed = pd.read_parquet(f'{DATA_DIR}/X_train_observed.parquet')
X_test_real = pd.read_parquet(f'{DATA_DIR}/X_test_estimated.parquet')

In [35]:
# Restrict every raw weather frame to the selected columns.
X_estimated_clean, X_observed_clean, X_test_real_clean = (
    clean_df(raw_frame, selected_features)
    for raw_frame in (X_estimated, X_observed, X_test_real)
)

# Collapse the estimated and submission frames with mean_df (4 rows -> 1 row).
X_estimated_clean_mean, X_test_real_clean_mean = map(
    mean_df, (X_estimated_clean, X_test_real_clean)
)

In [36]:
# The estimated and submission features are both passed through mean_df, so
# the observed weather must be averaged the same way; otherwise the observed
# part of the training set holds raw samples while every other split holds
# 4-row means.
# NOTE(review): assumes X_observed has the same 4-rows-per-block layout as the
# estimated data - confirm against the raw file.
X_observed_clean_mean = mean_df(X_observed_clean)

X_train, X_valid, X_test = training_data_split(X_observed_clean_mean, X_estimated_clean_mean)

In [37]:
# Keep only the weather rows that have a target at the same timestamp, and
# separate features from targets for each split.
X_train, y_train = resize_training_data(X_train, y)
X_valid, y_valid = resize_training_data(X_valid, y)
X_test, y_test = resize_training_data(X_test, y)

# Strip the characters \ " : , { } from the feature names (the raw names look
# like "direct_rad:W"; LightGBM is understood to reject such special JSON
# characters in feature names). The same cleaning is applied to every split so
# the frames used for validation / testing carry the names the model was
# fitted with.
for split_features in (X_train, X_valid, X_test):
    split_features.columns = [
        re.sub(r'[\\":,{}]', '', col) for col in split_features.columns
    ]

# **Scaling made it worse**

In [38]:
#X_train = scale_df(X_train)
#X_valid = scale_df(X_valid)
#X_test = scale_df(X_test)
#X_test_real = scale_df(X_test_real_clean.drop(columns = ["date_forecast"]))

In [39]:
# Disabled experiment: the whole cell body is wrapped in a string literal, so
# none of it executes - the cell merely evaluates (and echoes) the string.
# The wrapped code would train a booster with lgb.train on X_train /
# y_train["pv_measurement"] for 100 boosting rounds using the params below.
"""
# Create the dataset object for LightGBM
d_train = lgb.Dataset(X_train, label=y_train["pv_measurement"])

# Define the parameters. You might want to tune these
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 10,
    'learning_rate': 0.1,
    'feature_fraction': 0.9
}

model = lgb.train(params, d_train, 100)  # 100 is the number of boosting rounds"""

'\n# Create the dataset object for LightGBM\nd_train = lgb.Dataset(X_train, label=y_train["pv_measurement"])\n\n# Define the parameters. You might want to tune these\nparams = {\n    \'objective\': \'regression\',\n    \'metric\': \'mae\',\n    \'boosting_type\': \'gbdt\',\n    \'num_leaves\': 10,\n    \'learning_rate\': 0.1,\n    \'feature_fraction\': 0.9\n}\n\nmodel = lgb.train(params, d_train, 100)  # 100 is the number of boosting rounds'

In [40]:
# Disabled evaluation: like the training cell it depends on, this body is a
# string literal and does not run. The wrapped code would predict on X_test
# with `model` (only defined inside the disabled training cell), set
# predictions below 6.2 to 0, and print the MAE against
# y_test["pv_measurement"].
"""from sklearn.metrics import mean_absolute_error

# Predicting on the test set
y_pred = model.predict(X_test)
y_pred[y_pred < 6.2] = 0
# Calculating the MAE
mae = mean_absolute_error(y_test["pv_measurement"], y_pred)
#print(y_pred)
print(f"Mean Absolute Error (MAE) on the predictions: {mae}")"""

'from sklearn.metrics import mean_absolute_error\n\n# Predicting on the test set\ny_pred = model.predict(X_test)\ny_pred[y_pred < 6.2] = 0\n# Calculating the MAE\nmae = mean_absolute_error(y_test["pv_measurement"], y_pred)\n#print(y_pred)\nprint(f"Mean Absolute Error (MAE) on the predictions: {mae}")'

In [41]:
# Hyper-parameter grid, written with the LGBMRegressor (scikit-learn API)
# parameter names.
# The previous grid searched 'reg_alpha' AND 'lambda_l1'. LightGBM documents
# these as aliases of the same L1 penalty, so one of the two is ignored on
# every fit and the grid did twice the work for no extra coverage. The L1
# values kept here are the ones that were listed under 'lambda_l1'.
param_grid = {
    'num_leaves': [31, 127],
    'min_child_samples': [30, 50, 100, 300, 400],  # LightGBM: min_data_in_leaf
    'reg_alpha': [0, 1, 1.5],                      # LightGBM: lambda_l1
    'reg_lambda': [0, 1],                          # LightGBM: lambda_l2
}

# Base model; every grid candidate overrides the parameters listed above.
estimator = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse', learning_rate=0.05)

# 3-fold cross-validated exhaustive search. No `scoring` is given, so
# candidates are ranked by the estimator's own score method.
gsearch = GridSearchCV(estimator, param_grid, cv=3)

# Fit one model per candidate and fold, then refit the best on all of X_train.
gsearch.fit(X_train, y_train["pv_measurement"])

# Print the best parameters found
print(gsearch.best_params_)

# Best result recorded from an earlier run with the old parameter names:
#{'lambda_l1': 1.5, 'lambda_l2': 0, 'min_data_in_leaf': 400, 'num_leaves': 31, 'reg_alpha': 0.1}

Output hidden; open in https://colab.research.google.com to view.

In [42]:
best_model = gsearch.best_estimator_

# Build the submission features exactly like the training features: drop the
# timestamp and strip the same characters from the column names that were
# stripped from X_train before fitting, so the names seen at predict time
# match the names seen at fit time.
X_submission = X_test_real_clean_mean.drop(columns=["date_forecast"])
X_submission.columns = [re.sub(r'[\\":,{}]', '', col) for col in X_submission.columns]

predictions = best_model.predict(X_submission)



In [43]:
#predictions.shape

In [44]:
# Write the submission predictions to a .npy file in the working directory.
np.save(file='X_test_c.npy', arr=predictions)

In [45]:
#predictions = predictions.clip(min = 0, max = None)
#mae = mean_absolute_error(y_test["pv_measurement"], predictions)

#print(f"Mean Absolute Error (MAE) on the predictions: {mae}")