# Import Libraries

In [70]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import random
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import tensorflow as tf

%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# Read Dataset

In [71]:
#Read datasets
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')
X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')
#Read test datasets
X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

# Find intervals with broken data in B

In [72]:
# Calculate the difference between consecutive measurements
train_b['diff'] = train_b['pv_measurement'].diff()

# Identify intervals where the difference is zero and the measurement is not 0.0
constant_intervals_B = train_b[(train_b['diff'] == 0) & (train_b['pv_measurement'] != 0) & (train_b['pv_measurement'].notna())]

# Group these intervals and count their lengths
grouped_intervals_B = (constant_intervals_B
                       .assign(group=(constant_intervals_B['time'].diff() != pd.Timedelta(hours=1)).cumsum())
                       .groupby('group')
                       .agg(start_time=('time', 'first'), end_time=('time', 'last'), count=('time', 'count'), constant_value=('pv_measurement', 'first'))
                       .reset_index(drop=True))

# Filter out short intervals (e.g., less than 3 hours) as they might not be considered "broken"
long_constant_intervals_B = grouped_intervals_B[grouped_intervals_B['count'] >= 3]

#long_constant_intervals_B.head(200)  # Display the first few rows of long constant intervals in B


In [73]:
def replace_with_values_from_C(train_b, train_c, constant_intervals):
    train_b_copy = train_b.copy()
    """
    Replace values in dataset B's constant intervals with corresponding values from dataset C.

    :param train_b: DataFrame for dataset B.
    :param train_c: DataFrame for dataset C.
    :param constant_intervals: DataFrame with constant intervals in B to be replaced.
    :return: Modified dataset B with values replaced from C.
    """
    for index, row in constant_intervals.iterrows():
        start_time, end_time = row['start_time'], row['end_time']

        # Iterate through each timestamp within the interval
        for time in pd.date_range(start=start_time, end=end_time, freq='H'):
            # Check if there's a corresponding timestamp in C
            if time in train_c['time'].values:
                # Replace the value in B with the value from C
                value_C = train_c.loc[train_c['time'] == time, 'pv_measurement'].values[0]
                train_b_copy.loc[train_b_copy['time'] == time, 'pv_measurement'] = value_C


    # Replace the constant 0.0 values from line 19140 to 20143 in dataset B with the values from the same timestamp in dataset C.

    # Iterate through each timestamp within the interval
    for time in pd.date_range(start='2021-03-08 15:00:00', end='2021-04-19 11:00:00', freq='H'):
        # Check if there's a corresponding timestamp in C
        if time in train_c['time'].values:
            # Replace the value in B with the value from C
            value_C = train_c.loc[train_c['time'] == time, 'pv_measurement'].values[0]
            train_b_copy.loc[train_b_copy['time'] == time, 'pv_measurement'] = value_C

    return train_b_copy

# Replace values in dataset B using dataset C for the identified constant intervals
train_b_modified = replace_with_values_from_C(train_b, train_c, long_constant_intervals_B)

display(train_b)
display(train_b_modified)

train_b = train_b_modified.drop(columns=['diff'])

# fig, axs = plt.subplots(3, 1, figsize=(20, 10), sharex=True)
# axs[0].plot(train_b_modified['time'], train_b_modified['pv_measurement'])
# axs[0].set_title('Dataset B (Modified)')
# axs[1].plot(train_c['time'], train_c['pv_measurement'])
# axs[1].set_title('Dataset C')
# axs[2].plot(train_b['time'], train_b['pv_measurement'])
# axs[2].set_title('Dataset B')
# plt.show()

Unnamed: 0,time,pv_measurement,diff
0,2018-12-31 23:00:00,0.000000,
1,2019-01-01 00:00:00,0.000000,0.000000
2,2019-01-01 01:00:00,0.000000,0.000000
3,2019-01-01 02:00:00,0.000000,0.000000
4,2019-01-01 03:00:00,0.000000,0.000000
...,...,...,...
32843,2023-04-30 19:00:00,0.828587,-44.434448
32844,2023-04-30 20:00:00,-0.000000,-0.828587
32845,2023-04-30 21:00:00,-0.000000,0.000000
32846,2023-04-30 22:00:00,-0.000000,0.000000


Unnamed: 0,time,pv_measurement,diff
0,2018-12-31 23:00:00,0.000000,
1,2019-01-01 00:00:00,0.000000,0.000000
2,2019-01-01 01:00:00,0.000000,0.000000
3,2019-01-01 02:00:00,0.000000,0.000000
4,2019-01-01 03:00:00,0.000000,0.000000
...,...,...,...
32843,2023-04-30 19:00:00,0.828587,-44.434448
32844,2023-04-30 20:00:00,-0.000000,-0.828587
32845,2023-04-30 21:00:00,-0.000000,0.000000
32846,2023-04-30 22:00:00,-0.000000,0.000000


In [74]:
train_b.head()

Unnamed: 0,time,pv_measurement
0,2018-12-31 23:00:00,0.0
1,2019-01-01 00:00:00,0.0
2,2019-01-01 01:00:00,0.0
3,2019-01-01 02:00:00,0.0
4,2019-01-01 03:00:00,0.0


# Fix metric on ceiling_height_agl:m and cloud_base_agl:m

Found that most likely the **ceiling_height_agl:m** and **cloud_base_agl:m** starts with values in meters, and then suddenly switches into using feet at 26.03.2020. Therefore I transform the values before 26.03.2020 into feet. 

In [75]:
mask_a = X_train_observed_a['date_forecast'] < '2020-03-26'
mask_b = X_train_observed_b['date_forecast'] < '2020-03-26'
mask_c = X_train_observed_c['date_forecast'] < '2020-03-26'

# Apply the conversion from meters to feet (1 meter = 3.28084 feet)
conversion_factor = 3.28084

X_train_observed_a.loc[mask_a, 'ceiling_height_agl:m'] *= conversion_factor
X_train_observed_b.loc[mask_b, 'ceiling_height_agl:m'] *= conversion_factor
X_train_observed_c.loc[mask_c, 'ceiling_height_agl:m'] *= conversion_factor

X_train_observed_a.loc[mask_a, 'cloud_base_agl:m'] *= conversion_factor
X_train_observed_b.loc[mask_b, 'cloud_base_agl:m'] *= conversion_factor
X_train_observed_c.loc[mask_c, 'cloud_base_agl:m'] *= conversion_factor

As the features have a 0.83 correlation, I choose to go forward with ceiling height

# Concat data

In [76]:

#add location to each sample
train_a["location"] = "A"
train_b["location"] = "B"
train_c["location"] = "C"
X_train_estimated_a["location"] = "A"
X_train_estimated_b["location"] = "B"
X_train_estimated_c["location"] = "C"
X_train_observed_a["location"] = "A"
X_train_observed_b["location"] = "B"
X_train_observed_c["location"] = "C"
X_test_estimated_a["location"] = "A"
X_test_estimated_b["location"] = "B"
X_test_estimated_c["location"] = "C"

#copy 23:45 value to 00:00 as it isn't there
def fill_last(frame):
    copy = frame.copy()

    copy['date_forecast'] = pd.to_datetime(copy['date_forecast'])
    
    copy["date_forecast"] = copy["date_forecast"] + pd.Timedelta(minutes=15)
    
    copy = copy[copy["date_forecast"].apply(lambda time : time.hour == 00 and time.minute == 00)]

    copy = copy[frame.columns]
    
    frame = pd.concat([
        frame,
        copy.astype(frame.dtypes)
    ],)

    frame = frame.drop_duplicates(subset="date_forecast", keep="first")

    return frame

#fill last
X_test_estimated_a = fill_last(X_test_estimated_a.copy())
X_test_estimated_b = fill_last(X_test_estimated_b.copy())
X_test_estimated_c = fill_last(X_test_estimated_c.copy())

#remove extra minute 00 sample
X_train_observed_a = X_train_observed_a.iloc[:-1,:]
X_train_observed_b = X_train_observed_b.iloc[:-1,:]
X_train_observed_c = X_train_observed_c.iloc[:-1,:]

#add date_calc column same as date_forecast column to observed data
X_train_observed_a.insert(0, "date_calc", X_train_observed_a["date_forecast"])
X_train_observed_b.insert(0, "date_calc", X_train_observed_b["date_forecast"])
X_train_observed_c.insert(0, "date_calc", X_train_observed_c["date_forecast"])

#concat all the samples and remove date_calc column
X_train_raw = pd.concat([X_train_observed_a,
                     X_train_observed_b,
                     X_train_observed_c,
                     X_train_estimated_a,
                     X_train_estimated_b,
                     X_train_estimated_c,
                     X_test_estimated_a,
                     X_test_estimated_b,
                     X_test_estimated_c])


## Clean and preprocess data

In [77]:

#map snow density to one and zero
X_train_raw["snow_density:kgm3"] = X_train_raw["snow_density:kgm3"].apply(lambda a : np.isnan(a)).map({True: 0, False: 1})

#fix ceiling_height NaN values to -666 because the docs hints to it
#also rename the features with their proper metric and remove the old ones
X_train_raw["ceiling_height_agl:ft"] = X_train_raw["ceiling_height_agl:m"].fillna(-666)
X_train_raw["cloud_base_agl:ft"] = X_train_raw["cloud_base_agl:m"].fillna(-666)
X_train_raw.drop(columns=["ceiling_height_agl:m", "cloud_base_agl:m"], inplace=True)

# #categorizing ceiling_height_agl:ft
# #found categories on google
# X_train_raw['ceiling_height_agl:ft'] = pd.cut(X_train_raw['ceiling_height_agl:ft'], bins=[float('-inf'), 0, 500, 1000, 3000, 5000, 12000, float('inf')], labels=[-666, 1, 2, 3, 4, 5, 6])

#decided to drop cloud_base_agl:ft because of high correlation to ceiling_height_agl:ft
X_train_raw.drop(columns=["cloud_base_agl:ft"], inplace=True)

# #categorizing dew_or_rime:idx
# dew_or_rime_categories = {
#     -1.0: "Rime",
#     0.0: "None",
#     1.0: "Dew"
# }
# X_train_raw['dew_or_rime:idx'] = X_train_raw['dew_or_rime:idx'].map(dew_or_rime_categories)

#casting floats to int for categorical features
X_train_raw['is_day:idx'] = X_train_raw['is_day:idx'].astype(int)
X_train_raw['dew_or:rime:idx'] = X_train_raw['dew_or_rime:idx'].astype(int)

# #remove some weird artifacts from train_b target values
# train_b = pd.concat([train_b[:18690], train_b[20142:]])
# train_b["rolling"] = train_b["pv_measurement"].rolling(4).mean()
# train_b["keep"] = train_b["pv_measurement"] - train_b["rolling"] != 0 + train_b["pv_measurement"].apply(lambda a: a==0)
# train_b = train_b[train_b["keep"]]
# train_b = train_b.iloc[:,:3]

parse_dates = ['time']
X_test_targets = pd.read_csv("test.csv", parse_dates=parse_dates)

train_a["id"] = -10
train_b["id"] = -10
train_c["id"] = -10

X_test_targets = X_test_targets.rename(columns = {"prediction" : "pv_measurement"})

targets = pd.concat([train_a,
                     train_b.astype(train_a.dtypes),
                     train_c.astype(train_a.dtypes),
                     X_test_targets.astype(train_a.dtypes)]).dropna()


## Split into timeframes

In [78]:

features00 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 0)].copy()
features00["merge_time"] = features00["date_forecast"]


features15 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 15)].copy()
features15["merge_time"] = features15["date_forecast"] + pd.Timedelta(minutes=-15)


features30 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 30)].copy()
features30["merge_time"] = features30["date_forecast"] + pd.Timedelta(minutes=-30)


features45 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 45)].copy()
features45["merge_time"] = features45["date_forecast"] + pd.Timedelta(minutes=-45)


X_train_raw["date_forecast"] = X_train_raw["date_forecast"] + pd.Timedelta(minutes = -60)
features60 = X_train_raw[X_train_raw["date_forecast"].apply(lambda time: time.minute == 00)].copy()
features60["merge_time"] = features60["date_forecast"]

dataset = targets
dataset = dataset.rename(columns={"time": "merge_time"})

display(X_train_raw)

Unnamed: 0,date_calc,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location,ceiling_height_agl:ft,dew_or:rime:idx
0,2019-06-02 22:00:00,2019-06-02 21:00:00,7.7,1.230,0.0,0.0,0.0,280.299988,0.0,0.0,0.0,0.0,98.699997,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1006.799988,0.0,0.0,994.200012,1000.299988,0.0,0.0,73.099998,1006.299988,0,0.0,0.0,-0.0,0.1,342.834015,-3.202,0.0,285.899994,100.000000,39640.101562,3.7,-3.6,-0.8,-0.0,A,5724.737793,0
1,2019-06-02 22:15:00,2019-06-02 21:15:00,7.7,1.229,0.0,0.0,0.0,280.299988,0.0,0.0,0.0,0.0,99.000000,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1006.500000,0.0,0.0,993.900024,999.900024,0.0,0.0,72.199997,1006.000000,0,0.0,0.0,-0.0,0.2,346.294006,-3.650,0.0,286.100006,100.000000,40123.898438,3.6,-3.6,-0.6,-0.0,A,5688.976562,0
2,2019-06-02 22:30:00,2019-06-02 21:30:00,7.7,1.228,0.0,0.0,0.0,280.299988,0.0,0.0,0.0,0.0,99.199997,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1006.099976,0.0,0.0,993.599976,999.599976,0.0,0.0,71.199997,1005.599976,0,0.0,0.0,-0.0,0.2,349.768005,-3.998,0.0,286.299988,100.000000,40628.300781,3.6,-3.6,-0.4,-0.0,A,5654.527832,0
3,2019-06-02 22:45:00,2019-06-02 21:45:00,7.7,1.226,0.0,0.0,0.0,280.299988,0.0,0.0,0.0,0.0,99.400002,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1005.799988,0.0,0.0,993.299988,999.299988,0.0,0.0,70.199997,1005.299988,0,0.0,0.0,-0.0,0.2,353.251007,-4.247,0.0,286.600006,100.000000,41153.601562,3.5,-3.5,-0.2,-0.0,A,5621.391113,0
4,2019-06-02 23:00:00,2019-06-02 22:00:00,7.7,1.225,0.0,0.0,0.0,280.299988,0.0,0.0,0.0,0.0,99.599998,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1005.500000,0.0,0.0,993.000000,999.000000,0.0,0.0,69.199997,1005.000000,0,0.0,0.0,-0.0,0.2,356.742004,-4.393,0.0,286.799988,100.000000,41699.898438,3.5,-3.5,0.0,-0.0,A,5589.238770,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207,2023-06-18 07:00:05,2023-06-19 23:00:00,10.7,1.193,0.0,0.0,0.0,285.600006,0.0,0.0,0.0,0.0,27.799999,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1011.000000,0.0,0.0,995.299988,1001.200012,0.0,0.0,59.200001,1007.000000,0,0.0,0.0,-0.0,0.0,6.356000,-3.011,0.0,293.700012,43.500000,46996.800781,4.0,-2.0,3.5,-0.0,C,11394.299805,0
2303,2023-06-21 07:00:30,2023-06-22 23:00:00,9.4,1.228,0.0,0.0,0.0,283.299988,0.0,0.0,0.0,0.0,95.599998,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1013.900024,0.0,0.0,997.799988,1003.799988,0.0,0.0,83.199997,1009.900024,0,0.0,0.0,-0.0,0.0,6.206000,-2.996,0.0,285.399994,96.800003,33542.898438,1.3,-0.6,1.1,-0.0,C,1541.400024,0
2399,2023-06-25 07:01:23,2023-06-26 23:00:00,9.8,1.187,0.0,0.0,0.0,284.200012,0.0,0.0,0.0,0.0,2.000000,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1004.799988,0.0,0.0,989.200012,995.000000,0.0,0.0,53.099998,1000.799988,0,0.0,0.0,-0.0,0.0,6.015000,-3.071,0.0,295.500000,2.000000,48980.699219,3.1,-2.0,2.5,-0.0,C,-666.000000,0
2687,2023-06-29 07:00:05,2023-06-30 23:00:00,9.3,1.220,0.0,0.0,0.0,283.100006,0.0,0.0,0.0,0.0,100.000000,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1004.000000,0.0,0.0,989.200012,995.200012,0.0,0.1,88.800003,1001.200012,0,0.0,0.0,-0.0,0.2,5.837000,-3.254,0.1,284.399994,100.000000,9935.700195,1.8,1.6,-0.9,0.0,C,2006.300049,0


# Define helper functions

In [79]:
#averages the features meassured at target time +00, +15, +30, +45 and +60
def add_feature_average_00_60(dataset, f00, f15, f30, f45, f60, column_name, categorical=False):
    dataset = pd.merge(
        left=dataset,
        right = f00[["location", "merge_time", column_name]],
        on=["location", "merge_time"],
        how="inner")
    dataset = pd.merge(
        left=dataset,
        right = f15[["location", "merge_time", column_name]],
        on=["location", "merge_time"],
        how="inner",
        suffixes=["", "_15"])
    dataset = pd.merge(
        left=dataset,
        right = f30[["location", "merge_time", column_name]],
        on=["location", "merge_time"],
        how="inner",
        suffixes=["", "_30"])
    dataset = pd.merge(
        left=dataset,
        right = f45[["location", "merge_time", column_name]],
        on=["location", "merge_time"],
        how="inner",
        suffixes=["", "_45"])
    dataset = pd.merge(
        left=dataset,
        right = f60[["location", "merge_time", column_name]],
        on=["location", "merge_time"],
        how="inner",
        suffixes=["", "_60"])


    dataset[column_name] = (dataset[column_name] +
                            dataset[column_name + "_15"] +
                            dataset[column_name + "_30"] +
                            dataset[column_name + "_45"] +
                            dataset[column_name + "_60"])/5
    dataset = dataset.drop([column_name + "_15",
                            column_name + "_30",
                            column_name + "_45",
                            column_name + "_60"],
                           axis=1)
    if categorical:
        add_to_categorical_features(column_name)
    else:
        add_to_numerical_features(column_name)

    return dataset

#adds a single feature from one observation
def add_feature(dataset, f, column_name, count=True, categorical=False):
    if count:
        if categorical:
            add_to_categorical_features(column_name)
        else:
            add_to_numerical_features(column_name)
    return(pd.merge(
            left=dataset,
            right=f[["location", "merge_time", column_name]],
            on=["location", "merge_time"],
            how="inner")
  )

#adds an One Hot Encoding of the column to the dataset
def OHE(dataset, f, column_name, suffix=""):

    dataset = pd.merge(
        left=dataset,
        right = f[["location", "merge_time", column_name]],
        on=["location", "merge_time"],
        how="inner")

    values = dataset[column_name].unique()

    for value in values:
        dataset[column_name + "_" + suffix + str(value)] = dataset[column_name].apply(lambda a : a == value).map({True: 1, False: 0})

    dataset = dataset.drop([column_name], axis=1)
    return dataset

def OHE_all(dataset, f00, f15, f30, f45, f60, column_name):
    dataset = OHE(dataset, f00, column_name, suffix="00_")
    dataset = OHE(dataset, f15, column_name, suffix="15_")
    dataset = OHE(dataset, f30, column_name, suffix="30_")
    dataset = OHE(dataset, f45, column_name, suffix="45_")
    dataset = OHE(dataset, f60, column_name, suffix="60_")

    return dataset

#adds all observations
def add_all(dataset, f00, f15, f30, f45, f60, column_name, categorical=False):
    dataset[column_name + "_00"] = add_feature(dataset, f00, column_name, count=False,categorical=categorical)[column_name]
    dataset[column_name + "_15"] = add_feature(dataset, f15, column_name, count=False, categorical=categorical)[column_name]
    dataset[column_name + "_30"] = add_feature(dataset, f30, column_name, count=False, categorical=categorical)[column_name]
    dataset[column_name + "_45"] = add_feature(dataset, f45, column_name, count=False, categorical=categorical)[column_name]
    dataset[column_name + "_60"] = add_feature(dataset, f60, column_name, count=False, categorical=categorical)[column_name]

    feature_names = [column_name + "_00", column_name + "_15", column_name + "_30", column_name + "_45", column_name + "_60"]
    for feat in feature_names:
        if categorical:
            add_to_categorical_features(feat)
        else:
            add_to_numerical_features(feat)

    return dataset

#finds mode (typetall) for a row
def find_mode_with_priority(row, priority_list, max=False):
    # Check for prioritized values in the row and set it as the mode if it is
    for value in priority_list:
        if value in row.values:
            return value

    # Calculate the mode for the row
    if max:
        mode_value = row.mode().max()
    else:
        mode_value = row.mode().min()

    return mode_value

def add_most_frequent_feature(dataset, f15, f30, f45, f60, column_name, priority_list, max=False, categorical=True):
    dataset[column_name + "_15"] = add_feature(dataset, f15, column_name, count=False)[column_name]
    dataset[column_name + "_30"] = add_feature(dataset, f30, column_name, count=False)[column_name]
    dataset[column_name + "_45"] = add_feature(dataset, f45, column_name, count=False)[column_name]
    dataset[column_name + "_60"] = add_feature(dataset, f60, column_name, count=False)[column_name]

    feature_names = [column_name + "_15", column_name + "_30", column_name + "_45", column_name + "_60"]

    dataset[column_name] = dataset[feature_names].apply(find_mode_with_priority, args=(priority_list, max), axis=1)

    dataset = dataset.drop([column_name + "_15",
                            column_name + "_30",
                            column_name + "_45",
                            column_name + "_60"],
                           axis=1)

    if categorical:
        add_to_categorical_features(column_name)
    else:
        add_to_numerical_features(column_name)

    return dataset

def add_accumulated(dataset, f15, f30, f45, f60, column_name, time_interval, categorical=False):
    dataset[column_name + "_15"] = add_feature(dataset, f15, column_name, count=False)[column_name]
    dataset[column_name + "_30"] = add_feature(dataset, f30, column_name, count=False)[column_name]
    dataset[column_name + "_45"] = add_feature(dataset, f45, column_name, count=False)[column_name]
    dataset[column_name + "_60"] = add_feature(dataset, f60, column_name, count=False)[column_name]

    time_multiplier = 15/time_interval

    feature_names = [column_name + "_15", column_name + "_30", column_name + "_45", column_name + "_60"]

    dataset[column_name] = dataset[feature_names].sum(axis=1)*time_multiplier

    dataset = dataset.drop([column_name + "_15",
                            column_name + "_30",
                            column_name + "_45",
                            column_name + "_60"],
                           axis=1)
    if categorical:
        add_to_categorical_features(column_name)
    else:
        add_to_numerical_features(column_name)

    return dataset

def add_accumulated_all(dataset, f15, f30, f45, f60, column_name, time_interval, categorical=False):
    dataset[column_name + "_15"] = add_feature(dataset, f15, column_name)[column_name]
    dataset[column_name + "_30"] = add_feature(dataset, f30, column_name)[column_name]
    dataset[column_name + "_45"] = add_feature(dataset, f45, column_name)[column_name]
    dataset[column_name + "_60"] = add_feature(dataset, f60, column_name)[column_name]

    time_multiplier = 15/time_interval

    feature_names = [column_name + "_15", column_name + "_30", column_name + "_45", column_name + "_60"]

    for feat in feature_names:
        dataset[feat] = dataset[feat]*time_multiplier
    if categorical:
        add_to_categorical_features(column_name)    
    else:
        add_to_numerical_features(column_name)

    return dataset

numerical_feature_names = []
categorical_feature_names = []

def add_to_numerical_features(feature_name):
    numerical_feature_names.append(feature_name)

def add_to_categorical_features(feature_name):
    categorical_feature_names.append(feature_name)


# Feature info

### Features removed because of correlations:

- **fresh_snow_12h:cm**: fresh_snow_24h:cm = **0.82**

- **fresh_snow_3h:cm**: fresh_snow_1h:cm = **0.81**

- **fresh_snow_6h:cm**: fresh_snow_24h:cm = **0.83**

- **diffuse_rad:W**: diffuse_rad_1h:J = **0.99**

- **direct_rad:W**: direct_rad_1h:J = **0.99**

- **pressure_100m:hPa**: msl_pressure:hPa = **1.00**

- **pressure_50m:hPa**: msl_pressure:hPa = **1.00**

- **sfc_pressure:hPa**: msl_pressure:hPa = **1.00**

- **absolute_humidity_2m:gm3**: t1000:hPa = **0.90**

- **air_density_2m:kgm3**: t1000:hPa = **0.90**

- **dew_point_2m:K**: t1000:hPa = **0.91**

- **clear_sky_rad:W**: sun_elevation = **0.83**

- **clear_sky_energy_1h:J**: sun_elevation = **0.82**

- **total_cloud_cover:p**: effective_cloud_cover:p = **0.94**

- **cloud_base_agl:m**: ceiling_height_agl:m = **0.83** (after recalculating to feet)

- **elevation:m**: highly correlated to location

- **is_in_shadow:idx**: highly correlated to is_day:idx

### High correlations still in the set:

- **is_day:idx**: sun_elevation = **0.81**

- **diffuse_rad_1h:J**: sun_elevation = **0.80**

### Other features removed:

- **snow_drift:idx**: Almost exclusively 0, so doesn't add much data

- **wind_speed_w_1000hPa:ms**: In relation to the dummy data found in the docs this data is really weird (binary as opposed to continous values)


# Add numerical features

In [80]:
# SNOW AND PRECIPITATION

# tar verdi fra +60 siden den viser måling mellom 00 og 60, #!kan det være gunstig å ha med 3h, 6h, 12h????
# dataset = add_feature(dataset, features60, "fresh_snow_1h:cm")
# dataset = add_feature(dataset, features60, "fresh_snow_24h:cm")

# bruker bare 24h fordi den har høyest korrelasjon med pv_measurement
dataset = add_feature(dataset, features60, "fresh_snow_24h:cm")

# tar alle akkumulerte verdier og ganger med 3 for å få en bedre verdi (ikke helt etter boka menmen...)
dataset = add_all(dataset, features00, features15, features30, features45, features60, "precip_5min:mm")
# disse tar jeg bare gjennomsnittet av
#dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "snow_depth:cm")
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "snow_water:kgm2")
# tar akkumulert verdi og ganger med 3/2 for å få en bedre verdi (ikke helt etter boka menmen...)
#dataset = add_accumulated(dataset, features15, features30, features45, features60, "snow_melt_10min:mm", 10)


# ACCUMULATIVE FEATURES

# tar verdi fra +60 siden den viser måling mellom 00 og 60
dataset = add_feature(dataset, features60, "diffuse_rad_1h:J")
# tar verdi fra +60 siden den viser måling mellom 00 og 60
dataset = add_feature(dataset, features60, "direct_rad_1h:J")#!Try without

# PRESSURE

# tar gjennomsnittet da dette er punktmålinger ##kan hende denne burde kjøres per kvarter
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "msl_pressure:hPa")


# TEMPERATURE

# gjennomsnitt siden variasjonen hvert kvarter sannsynligvis er lav? ##kan hende denne burde kjøres per kvarter
# update: kjører hvert kvarter
dataset = add_all(dataset, features00, features15, features30, features45, features60, "t_1000hPa:K")


# SUN

#legger til alle siden har testing har vist at disse er svært viktige
dataset = add_all(dataset, features00, features15, features30, features45, features60, "sun_azimuth:d")
dataset = add_all(dataset, features00, features15, features30, features45, features60, "sun_elevation:d")

#tar verdien fra +60 siden den viser måling mellom 00 og 60
dataset = add_feature(dataset, features60, "clear_sky_energy_1h:J")


# DAY AND SHADOW

#gjennomsnitt fordi jeg vet ikke
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "visibility:m")


# CLOUDS

#gjennomsnitt fordi verdien er trolig momentan
#SANNSYNLIGVIS VELDIG VIKTIG
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "effective_cloud_cover:p")
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "ceiling_height_agl:ft")


# HUMIDITY AND RIME

#tar gjennomsnitt fordi jeg vet ikke #!diskuter
#dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "prob_rime:p")
#tar gjennomsnitt fordi jeg vet ikke #!diskuter
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "relative_humidity_1000hPa:p")


# WIND

# Gjennomsnitt fordi lite variabel #! try without
# dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "wind_speed_u_10m:ms")
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "wind_speed_v_10m:ms")
# dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "wind_speed_10m:ms")


# OTHERS (Up for discussion)
# Gjennomsnitt fordi?
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "super_cooled_liquid_water:kgm2")

display(dataset)

Unnamed: 0,merge_time,pv_measurement,location,id,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2
0,2019-06-02 22:00:00,0.00,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.000000,0.000000,1006.140015,285.899994,286.100006,286.299988,286.600006,286.799988,342.834015,346.294006,349.768005,353.251007,356.742004,-3.202,-3.650,-3.998,-4.247,-4.393,0.000000,40649.164062,99.180000,5655.774414,71.179993,-0.40,0.00
1,2019-06-02 23:00:00,0.00,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.20,0.000000,0.000000,1005.079956,286.799988,286.899994,286.899994,287.000000,287.000000,356.742004,0.235000,3.728000,7.218000,9.741000,-4.393,-4.438,-4.379,-4.219,-3.910,0.000000,31111.119141,99.799995,5529.790039,67.599998,0.36,0.00
2,2019-06-03 00:00:00,0.00,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.000000,0.000000,1004.500000,287.000000,287.000000,286.899994,286.899994,286.899994,9.741000,13.212000,16.671000,20.115000,23.541000,-3.910,-3.575,-3.142,-2.611,-1.986,0.000000,11297.320312,100.000000,5013.976562,68.580002,0.76,0.00
3,2019-06-03 01:00:00,0.00,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.50,7743.299805,0.000000,1003.900024,286.899994,286.799988,286.700012,286.600006,286.500000,23.541000,26.948000,30.334000,33.698002,37.040001,-1.986,-1.269,-0.463,0.428,1.401,6546.899902,2393.800049,100.000000,4095.931641,74.800003,0.90,0.00
4,2019-06-03 02:00:00,19.36,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.22,60137.601562,3158.300049,1003.000000,286.500000,286.500000,286.399994,286.399994,286.399994,37.040001,40.359001,43.657001,46.933998,50.193001,1.401,2.453,3.578,4.773,6.033,102225.898438,14631.379883,79.659996,3177.559082,80.419998,0.92,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94981,2023-07-03 19:00:00,0.00,C,2155,0.0,0.0,0.0,0.0,0.0,0.0,0.00,109878.000000,39874.800781,992.220032,287.000000,286.899994,286.799988,286.700012,286.600006,301.864990,305.072998,308.291992,311.526001,314.777008,10.182,8.778,7.428,6.136,4.908,269582.406250,40833.617188,86.860001,3480.719971,71.999992,-1.04,0.08
94982,2023-07-03 20:00:00,0.00,C,2156,0.0,,,,,0.0,0.00,44498.898438,10678.299805,992.619995,286.600006,286.500000,286.399994,286.299988,286.100006,314.777008,318.046997,321.338013,324.649994,327.984009,4.908,3.747,2.658,1.645,0.711,71999.601562,41705.980469,76.220009,2962.000000,75.959999,-0.94,0.00
94983,2023-07-03 21:00:00,0.00,C,2157,0.0,,,,,0.0,0.00,8968.599609,0.000000,993.000000,286.100006,286.000000,285.899994,285.799988,285.600006,327.984009,331.338989,334.714996,338.110992,341.524994,0.711,-0.139,-0.903,-1.577,-2.157,1378.300049,41136.300781,83.639999,2246.199951,81.060005,-0.92,0.00
94984,2023-07-03 22:00:00,0.00,C,2158,0.0,,,,,0.0,0.00,0.000000,0.000000,993.320007,285.600006,285.500000,285.399994,285.299988,285.200012,341.524994,344.954987,348.398010,351.852997,355.315002,-2.157,-2.643,-3.031,-3.320,-3.508,0.000000,39011.281250,100.000000,2001.479858,83.860001,-0.76,0.08


# Standardize data

In [81]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# make trig transform on solar angles
def apply_trig(dataset, feature_name, suffixes):
    for suffix in suffixes:
        column_name = f"{feature_name}_{suffix}"
        if column_name in dataset:
            dataset[column_name] = dataset[column_name].apply(lambda d: np.cos((d * np.pi) / 180))

# apply trig transform on solar angles
suffixes = ['00', '15', '30', '45', '60']
apply_trig(dataset, 'sun_azimuth:d', suffixes)
apply_trig(dataset, 'sun_elevation:d', suffixes)

# # all features except pv_measurement, merge_time, location and id
# columns_to_exclude = ['merge_time', 'pv_measurement', 'location', 'id']
# # standardize the features
# scaler = StandardScaler()
# scaled_columns = scaler.fit_transform(dataset.drop(columns=columns_to_exclude))
# scaled_dataset = pd.DataFrame(scaled_columns, columns=dataset.drop(columns=columns_to_exclude).columns)

# # add the excluded columns back to the dataset
# dataset = pd.concat([dataset[columns_to_exclude], scaled_dataset], axis=1)


## Trying first without standardization


# def standardize(dataset, feature_name, minus_min=False):
#     if minus_min:
#         dataset[feature_name] = (dataset[feature_name] - dataset[feature_name].min())/dataset[feature_name].std()
#     else:
#         dataset[feature_name] = dataset[feature_name]/dataset[feature_name].std()
    
#     return dataset

# def standardize_all(dataset, feature_name, accumulated=False, minus_min=False):
#     if not accumulated:
#         dataset[feature_name + "_00"] = standardize(dataset, feature_name + "_00")[feature_name + "_00"]

#     dataset[feature_name + "_15"] = standardize(dataset, feature_name + "_15", minus_min=minus_min)[feature_name + "_15"]
#     dataset[feature_name + "_30"] = standardize(dataset, feature_name + "_30", minus_min=minus_min)[feature_name + "_30"]
#     dataset[feature_name + "_45"] = standardize(dataset, feature_name + "_45", minus_min=minus_min)[feature_name + "_45"]
#     dataset[feature_name + "_60"] = standardize(dataset, feature_name + "_60", minus_min=minus_min)[feature_name + "_60"]

#     return dataset

# # standardize the features
# dataset = standardize(dataset, 'fresh_snow_24h:cm')
# dataset = standardize_all(dataset, 'precip_5min:mm')
# #dataset = standardize(dataset, 'snow_depth:cm')
# dataset = standardize(dataset, 'snow_water:kgm2')
# #dataset = standardize(dataset, 'snow_melt_10min:mm')
# dataset = standardize(dataset, 'diffuse_rad_1h:J')
# dataset = standardize(dataset, 'direct_rad_1h:J')
# dataset = standardize(dataset, 'msl_pressure:hPa', minus_min=True)
# dataset = standardize_all(dataset, 't_1000hPa:K', minus_min=True)
# dataset = standardize_all(dataset, 'sun_azimuth:d')
# dataset = standardize_all(dataset, 'sun_elevation:d')
# dataset = standardize(dataset, 'clear_sky_energy_1h:J')
# dataset = standardize(dataset, 'visibility:m')
# dataset = standardize(dataset, 'effective_cloud_cover:p')
# dataset = standardize(dataset, 'ceiling_height_agl:ft')
# # dataset = standardize(dataset, 'prob_rime:p')
# dataset = standardize(dataset, 'relative_humidity_1000hPa:p')
# # dataset = standardize(dataset, 'wind_speed_u_10m:ms')
# dataset = standardize(dataset, 'wind_speed_v_10m:ms')
# # dataset = standardize(dataset, 'wind_speed_10m:ms')
# dataset = standardize(dataset, 'super_cooled_liquid_water:kgm2')

# Standardization
columns_to_standardize = dataset.columns[4:]
scaler = StandardScaler()
dataset[columns_to_standardize] = scaler.fit_transform(dataset[columns_to_standardize])


# # Normalization
# scaler_minmax = MinMaxScaler()
# X_normalized = scaler_minmax.fit_transform(X)

display(dataset)


Unnamed: 0,merge_time,pv_measurement,location,id,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2
0,2019-06-02 22:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.382760,-0.666556,-0.454484,-0.270270,0.968462,0.999545,1.030285,1.075858,1.105748,1.389608,1.413021,1.431416,1.444645,1.451836,0.867755,0.862952,0.858795,0.855597,0.853531,-0.636308,0.427435,0.938694,0.914365,-0.170305,-0.568428,-0.542338
1,2019-06-02 23:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.467468,-0.666556,-0.454484,-0.351389,1.105788,1.121680,1.121903,1.136921,1.136265,1.451991,1.454389,1.451453,1.443199,1.433236,0.853548,0.852942,0.853782,0.855969,0.859829,-0.636308,-0.107564,0.956771,0.872043,-0.421305,-0.162485,-0.542338
2,2019-06-03 00:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.568678,-0.666556,-0.454484,-0.395770,1.136307,1.136948,1.121903,1.121654,1.121006,1.433389,1.415937,1.393444,1.366065,1.333254,0.859846,0.863803,0.868397,0.873226,0.877652,-0.636308,-1.218941,0.962603,0.698762,-0.352595,0.051169,-0.542338
3,2019-06-03 01:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.738095,-0.631140,-0.454484,-0.441682,1.121047,1.106412,1.091367,1.075858,1.059978,1.333402,1.296612,1.255570,1.210507,1.160981,0.877668,0.881356,0.883577,0.883655,0.880766,-0.628462,-1.718349,0.962603,0.390358,0.083501,0.125948,-0.542338
4,2019-06-03 02:00:00,19.36,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.552177,-0.391505,-0.446768,-0.510555,1.060014,1.060612,1.045556,1.045324,1.044720,1.161120,1.108548,1.052679,0.993756,0.931382,0.880782,0.874430,0.863794,0.848145,0.826692,-0.513787,-1.031930,0.369556,0.081844,0.477529,0.136631,-0.542338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94981,2023-07-03 19:00:00,0.00,C,2155,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.164007,-0.357061,-1.335478,1.136307,1.121680,1.106633,1.091125,1.075236,0.768346,0.836249,0.901806,0.964862,1.024589,0.721338,0.763001,0.797295,0.824802,0.846010,-0.313204,0.437782,0.579484,0.183687,-0.112813,-0.910275,0.218049
94982,2023-07-03 20:00:00,0.00,C,2156,-0.186662,,,,,-0.170456,-0.379616,-0.463031,-0.428395,-1.304871,1.075274,1.060612,1.045556,1.030057,0.998949,1.024721,1.081943,1.136037,1.186757,1.233176,0.846027,0.861825,0.872809,0.879689,0.883056,-0.550014,0.486713,0.269257,0.009430,0.164830,-0.856861,-0.542338
94983,2023-07-03 21:00:00,0.00,C,2157,-0.186662,,,,,-0.170456,-0.379616,-0.625536,-0.454484,-1.275792,0.998981,0.984277,0.969208,0.953727,0.922663,1.233319,1.276347,1.315297,1.349938,1.379297,0.883072,0.883857,0.882632,0.880033,0.876538,-0.634656,0.454759,0.485600,-0.231033,0.522401,-0.846179,-0.542338
94984,2023-07-03 22:00:00,0.00,C,2158,-0.186662,,,,,-0.170456,-0.379616,-0.666556,-0.454484,-1.251304,0.922689,0.907942,0.892861,0.877397,0.861634,1.379448,1.404589,1.424835,1.440048,1.449330,0.876554,0.872908,0.869474,0.866617,0.864513,-0.636308,0.335565,0.962603,-0.313243,0.718714,-0.760717,0.218049


# Add categorical features

In [82]:
# reason I'm adding it here is to prevent the massive -666 value getting in the way of the scaling
# add most frequent bc don't wanna have 100 000 features, also don't add priority list and pick the min value if conflict
# dataset = add_most_frequent_feature(dataset, features00, features15, features30, features45, "ceiling_height_agl:ft", priority_list=[], max=False)
#! test this aswell
# dataset = add_all(dataset, features00, features15, features30, features45, features60, "ceiling_height_agl:ft")

# add most frequent bc don't wanna have 100 000 features
dataset = add_most_frequent_feature(dataset, features15, features30, features45, features60, "precip_type_5min:idx", priority_list=[3,4,2], max=True)
dataset['precip_type_5min:idx'] = dataset['precip_type_5min:idx'].astype(int)
#! test this aswell
# dataset = add_all(dataset, features00, features15, features30, features45, features60, "precip_type_5min:idx")

# # categorizing ceiling_height_agl:ft
# # found categories on google
# labels=["VFR1", "LIFR", "IFR", "MVFR", "VFR4", "VFR3", "VFR2"]
# ceiling_height_agl_ft_categories = {
#     -666: "VFR1",
#     1: "LIFR",
#     2: "IFR",
#     3: "MVFR",
#     4: "VFR4",
#     5: "VFR3",
#     6: "VFR2",
# }
# # map the values to their labels
# dataset['ceiling_height_agl:ft'] = dataset['ceiling_height_agl:ft'].map(ceiling_height_agl_ft_categories)

# #categorizing precip_type_5min:idx
# #found this in the docs
# precip_types = {
#     0: "None",
#     1: "Rain",
#     2: "Rain_and_snow_mixed",
#     3: "Snow",
#     4: "Sleet",
#     5: "Freezing_rain",
#     6: "Hail",
# }
# #map the values to their labels
# dataset['precip_type_5min:idx'] = dataset['precip_type_5min:idx'].map(precip_types)
    
display(dataset)
# Add feature from 60 because pretty consistent
dataset = add_feature(dataset, features60, "snow_density:kgm3", categorical=True)

# OHE av kategorisk variabel #!Opp til diskusjon om man skal ta gjennomsnitt eller flere av målingene
#use the 30 value because it is in the middle of the timeframe
#dataset = OHE(dataset, features30, "dew_or_rime:idx")

# tar alle verdiene siden disse nok er ekstremt viktige for modellen og gir ikke mening å standardisere
dataset = add_all(dataset, features00, features15, features30, features45, features60, "is_day:idx", categorical=True)

#mapper location til tall og legger til som kategorisk variabel
location_mapping = {
    "A": 1,
    "B": 2,
    "C": 3
}
dataset["location"] = dataset["location"].map(location_mapping)
dataset = OHE(dataset, features00, "location")
dataset = dataset.drop(columns=["location"])

add_to_categorical_features("location")

#adding feature for summer months
dataset['merge_time'] = pd.to_datetime(dataset['merge_time'])
dataset['is_summer_month'] = dataset['merge_time'].dt.month.isin([5, 6, 7]).astype(int)

# 4 precip_5min:mm values are NaN, set them to 0 because precip_type is 0.0 (None)
# 3 precip_type:idx values are NaN, set them to 0.0 (None)
dataset = dataset.fillna(0.0)

Unnamed: 0,merge_time,pv_measurement,location,id,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx
0,2019-06-02 22:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.382760,-0.666556,-0.454484,-0.270270,0.968462,0.999545,1.030285,1.075858,1.105748,1.389608,1.413021,1.431416,1.444645,1.451836,0.867755,0.862952,0.858795,0.855597,0.853531,-0.636308,0.427435,0.938694,0.914365,-0.170305,-0.568428,-0.542338,0
1,2019-06-02 23:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.467468,-0.666556,-0.454484,-0.351389,1.105788,1.121680,1.121903,1.136921,1.136265,1.451991,1.454389,1.451453,1.443199,1.433236,0.853548,0.852942,0.853782,0.855969,0.859829,-0.636308,-0.107564,0.956771,0.872043,-0.421305,-0.162485,-0.542338,0
2,2019-06-03 00:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.568678,-0.666556,-0.454484,-0.395770,1.136307,1.136948,1.121903,1.121654,1.121006,1.433389,1.415937,1.393444,1.366065,1.333254,0.859846,0.863803,0.868397,0.873226,0.877652,-0.636308,-1.218941,0.962603,0.698762,-0.352595,0.051169,-0.542338,0
3,2019-06-03 01:00:00,0.00,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.738095,-0.631140,-0.454484,-0.441682,1.121047,1.106412,1.091367,1.075858,1.059978,1.333402,1.296612,1.255570,1.210507,1.160981,0.877668,0.881356,0.883577,0.883655,0.880766,-0.628462,-1.718349,0.962603,0.390358,0.083501,0.125948,-0.542338,0
4,2019-06-03 02:00:00,19.36,A,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.552177,-0.391505,-0.446768,-0.510555,1.060014,1.060612,1.045556,1.045324,1.044720,1.161120,1.108548,1.052679,0.993756,0.931382,0.880782,0.874430,0.863794,0.848145,0.826692,-0.513787,-1.031930,0.369556,0.081844,0.477529,0.136631,-0.542338,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94981,2023-07-03 19:00:00,0.00,C,2155,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.164007,-0.357061,-1.335478,1.136307,1.121680,1.106633,1.091125,1.075236,0.768346,0.836249,0.901806,0.964862,1.024589,0.721338,0.763001,0.797295,0.824802,0.846010,-0.313204,0.437782,0.579484,0.183687,-0.112813,-0.910275,0.218049,0
94982,2023-07-03 20:00:00,0.00,C,2156,-0.186662,,,,,-0.170456,-0.379616,-0.463031,-0.428395,-1.304871,1.075274,1.060612,1.045556,1.030057,0.998949,1.024721,1.081943,1.136037,1.186757,1.233176,0.846027,0.861825,0.872809,0.879689,0.883056,-0.550014,0.486713,0.269257,0.009430,0.164830,-0.856861,-0.542338,0
94983,2023-07-03 21:00:00,0.00,C,2157,-0.186662,,,,,-0.170456,-0.379616,-0.625536,-0.454484,-1.275792,0.998981,0.984277,0.969208,0.953727,0.922663,1.233319,1.276347,1.315297,1.349938,1.379297,0.883072,0.883857,0.882632,0.880033,0.876538,-0.634656,0.454759,0.485600,-0.231033,0.522401,-0.846179,-0.542338,0
94984,2023-07-03 22:00:00,0.00,C,2158,-0.186662,,,,,-0.170456,-0.379616,-0.666556,-0.454484,-1.251304,0.922689,0.907942,0.892861,0.877397,0.861634,1.379448,1.404589,1.424835,1.440048,1.449330,0.876554,0.872908,0.869474,0.866617,0.864513,-0.636308,0.335565,0.962603,-0.313243,0.718714,-0.760717,0.218049,0


In [83]:
display(dataset.head())
print(dataset.drop(columns=['merge_time', 'pv_measurement']).shape)
print(len(numerical_feature_names))
print(len(categorical_feature_names))
print(len(numerical_feature_names) + len(categorical_feature_names))

Unnamed: 0,merge_time,pv_measurement,location,id,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60,is_summer_month
0,2019-06-02 22:00:00,0.0,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.38276,-0.666556,-0.454484,-0.27027,0.968462,0.999545,1.030285,1.075858,1.105748,1.389608,1.413021,1.431416,1.444645,1.451836,0.867755,0.862952,0.858795,0.855597,0.853531,-0.636308,0.427435,0.938694,0.914365,-0.170305,-0.568428,-0.542338,0,0,0,0,0,0,0,1
1,2019-06-02 23:00:00,0.0,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.467468,-0.666556,-0.454484,-0.351389,1.105788,1.12168,1.121903,1.136921,1.136265,1.451991,1.454389,1.451453,1.443199,1.433236,0.853548,0.852942,0.853782,0.855969,0.859829,-0.636308,-0.107564,0.956771,0.872043,-0.421305,-0.162485,-0.542338,0,0,0,0,0,0,0,1
2,2019-06-03 00:00:00,0.0,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.568678,-0.666556,-0.454484,-0.39577,1.136307,1.136948,1.121903,1.121654,1.121006,1.433389,1.415937,1.393444,1.366065,1.333254,0.859846,0.863803,0.868397,0.873226,0.877652,-0.636308,-1.218941,0.962603,0.698762,-0.352595,0.051169,-0.542338,0,0,0,0,0,0,0,1
3,2019-06-03 01:00:00,0.0,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.738095,-0.63114,-0.454484,-0.441682,1.121047,1.106412,1.091367,1.075858,1.059978,1.333402,1.296612,1.25557,1.210507,1.160981,0.877668,0.881356,0.883577,0.883655,0.880766,-0.628462,-1.718349,0.962603,0.390358,0.083501,0.125948,-0.542338,0,0,0,0,0,1,1,1
4,2019-06-03 02:00:00,19.36,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.552177,-0.391505,-0.446768,-0.510555,1.060014,1.060612,1.045556,1.045324,1.04472,1.16112,1.108548,1.052679,0.993756,0.931382,0.880782,0.87443,0.863794,0.848145,0.826692,-0.513787,-1.03193,0.369556,0.081844,0.477529,0.136631,-0.542338,0,0,1,1,1,1,1,1


(94986, 42)
32
8
40


In [84]:
dataset.isna().sum()

merge_time                        0
pv_measurement                    0
location                          0
id                                0
fresh_snow_24h:cm                 0
precip_5min:mm_00                 0
precip_5min:mm_15                 0
precip_5min:mm_30                 0
precip_5min:mm_45                 0
precip_5min:mm_60                 0
snow_water:kgm2                   0
diffuse_rad_1h:J                  0
direct_rad_1h:J                   0
msl_pressure:hPa                  0
t_1000hPa:K_00                    0
t_1000hPa:K_15                    0
t_1000hPa:K_30                    0
t_1000hPa:K_45                    0
t_1000hPa:K_60                    0
sun_azimuth:d_00                  0
sun_azimuth:d_15                  0
sun_azimuth:d_30                  0
sun_azimuth:d_45                  0
sun_azimuth:d_60                  0
sun_elevation:d_00                0
sun_elevation:d_15                0
sun_elevation:d_30                0
sun_elevation:d_45          

# Split into training set and test set

In [85]:
testset = dataset[dataset["id"].apply(lambda id: id != -10)]
testset = testset.drop(columns=["id", 'merge_time', 'pv_measurement'])

In [86]:
from sklearn.model_selection import train_test_split


display(dataset)
# extract real dataset
dataset = dataset[dataset["id"].apply(lambda id: id == -10)]
dataset = dataset.drop(columns=["id"], axis=1)

dataset = dataset.sort_values(by="merge_time")

datasetX = dataset.drop(columns=["pv_measurement", "merge_time"], axis=1)
datasetY = dataset[['pv_measurement', 'is_summer_month']]

display(datasetX)
display(datasetY)
display(testset)

# # randomly split dataset into train and eval
# trainset, evalset = train_test_split(dataset, test_size=0.2, random_state=42)

# # split into X and Y
# trainsetX = trainset.drop(columns="pv_measurement", axis=1)
# trainsetY = trainset["pv_measurement"]
# evalsetX = evalset.drop(columns="pv_measurement", axis=1)
# evalsetY = evalset["pv_measurement"]

# display(trainsetX)
# display(evalsetX)
# display(trainsetY)
# display(evalsetY)



Unnamed: 0,merge_time,pv_measurement,location,id,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60,is_summer_month
0,2019-06-02 22:00:00,0.00,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.382760,-0.666556,-0.454484,-0.270270,0.968462,0.999545,1.030285,1.075858,1.105748,1.389608,1.413021,1.431416,1.444645,1.451836,0.867755,0.862952,0.858795,0.855597,0.853531,-0.636308,0.427435,0.938694,0.914365,-0.170305,-0.568428,-0.542338,0,0,0,0,0,0,0,1
1,2019-06-02 23:00:00,0.00,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.467468,-0.666556,-0.454484,-0.351389,1.105788,1.121680,1.121903,1.136921,1.136265,1.451991,1.454389,1.451453,1.443199,1.433236,0.853548,0.852942,0.853782,0.855969,0.859829,-0.636308,-0.107564,0.956771,0.872043,-0.421305,-0.162485,-0.542338,0,0,0,0,0,0,0,1
2,2019-06-03 00:00:00,0.00,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.568678,-0.666556,-0.454484,-0.395770,1.136307,1.136948,1.121903,1.121654,1.121006,1.433389,1.415937,1.393444,1.366065,1.333254,0.859846,0.863803,0.868397,0.873226,0.877652,-0.636308,-1.218941,0.962603,0.698762,-0.352595,0.051169,-0.542338,0,0,0,0,0,0,0,1
3,2019-06-03 01:00:00,0.00,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,1.738095,-0.631140,-0.454484,-0.441682,1.121047,1.106412,1.091367,1.075858,1.059978,1.333402,1.296612,1.255570,1.210507,1.160981,0.877668,0.881356,0.883577,0.883655,0.880766,-0.628462,-1.718349,0.962603,0.390358,0.083501,0.125948,-0.542338,0,0,0,0,0,1,1,1
4,2019-06-03 02:00:00,19.36,1,-10,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.552177,-0.391505,-0.446768,-0.510555,1.060014,1.060612,1.045556,1.045324,1.044720,1.161120,1.108548,1.052679,0.993756,0.931382,0.880782,0.874430,0.863794,0.848145,0.826692,-0.513787,-1.031930,0.369556,0.081844,0.477529,0.136631,-0.542338,0,0,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94981,2023-07-03 19:00:00,0.00,3,2155,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.164007,-0.357061,-1.335478,1.136307,1.121680,1.106633,1.091125,1.075236,0.768346,0.836249,0.901806,0.964862,1.024589,0.721338,0.763001,0.797295,0.824802,0.846010,-0.313204,0.437782,0.579484,0.183687,-0.112813,-0.910275,0.218049,0,0,1,1,1,1,1,1
94982,2023-07-03 20:00:00,0.00,3,2156,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.463031,-0.428395,-1.304871,1.075274,1.060612,1.045556,1.030057,0.998949,1.024721,1.081943,1.136037,1.186757,1.233176,0.846027,0.861825,0.872809,0.879689,0.883056,-0.550014,0.486713,0.269257,0.009430,0.164830,-0.856861,-0.542338,0,0,1,1,1,1,1,1
94983,2023-07-03 21:00:00,0.00,3,2157,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.625536,-0.454484,-1.275792,0.998981,0.984277,0.969208,0.953727,0.922663,1.233319,1.276347,1.315297,1.349938,1.379297,0.883072,0.883857,0.882632,0.880033,0.876538,-0.634656,0.454759,0.485600,-0.231033,0.522401,-0.846179,-0.542338,0,0,1,0,0,0,0,1
94984,2023-07-03 22:00:00,0.00,3,2158,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.666556,-0.454484,-1.251304,0.922689,0.907942,0.892861,0.877397,0.861634,1.379448,1.404589,1.424835,1.440048,1.449330,0.876554,0.872908,0.869474,0.866617,0.864513,-0.636308,0.335565,0.962603,-0.313243,0.718714,-0.760717,0.218049,0,0,0,0,0,0,0,1


Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60,is_summer_month
34059,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,1.322125,-0.125490,-0.666556,-0.454484,-1.745648,-0.175926,-0.191290,-0.206542,-0.221752,-0.236898,1.417181,1.381654,1.335617,1.280196,1.215952,-2.684689,-2.623520,-2.543428,-2.446129,-2.332063,-0.636308,-0.954170,0.527585,0.328219,1.083295,1.066026,-0.542338,1,0,0,0,0,0,0,0
34060,2,-0.186662,-0.170501,1.626018,1.679790,1.625946,-0.170456,0.043926,-0.666556,-0.454484,-1.803807,-0.236959,-0.221822,-0.221812,-0.221752,-0.206381,1.216094,1.145500,1.069340,0.988760,0.904230,-2.331952,-2.205003,-2.066631,-1.918751,-1.762539,-0.636308,-1.135927,0.882130,0.251730,1.027206,1.365142,-0.542338,0,0,0,0,0,0,0,0
34061,2,-0.186662,1.619840,-0.168937,-0.173587,-0.174353,-0.170456,0.043926,-0.666556,-0.454484,-1.812989,-0.206440,-0.221822,-0.237083,-0.252285,-0.267414,0.904355,0.817801,0.729607,0.640405,0.550273,-1.762446,-1.601547,-1.437231,-1.271386,-1.104887,-0.636308,-1.258482,0.862887,0.173940,0.952887,1.493334,0.598242,0,0,0,0,0,0,0,0
34062,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,1.620642,0.043926,-0.666556,-0.454484,-1.788501,-0.267478,-0.282889,-0.313430,-0.328615,-0.358955,0.550379,0.460569,0.371069,0.282173,0.193736,-1.104812,-0.940452,-0.779228,-0.622238,-0.470318,-0.636308,-1.501494,0.872800,0.141581,1.073480,1.482652,1.358629,1,0,0,0,0,0,0,0
34063,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.043926,-0.666556,-0.454484,-1.745643,-0.359025,-0.313425,-0.267623,-0.221752,-0.175869,0.193823,0.106591,0.020487,-0.064438,-0.148280,-0.470263,-0.324940,-0.186923,-0.056700,0.065399,-0.636308,-1.511920,0.858805,0.155424,1.244553,0.628035,0.978436,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34057,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.210199,-0.666556,-0.454484,0.360284,-0.786266,-0.786706,-0.802057,-0.817125,-0.816675,1.382046,1.408272,1.428863,1.443634,1.451678,0.706763,0.690744,0.677778,0.668324,0.662649,-0.636308,-1.105985,0.910704,-0.551059,0.391992,0.991247,0.408146,0,0,0,0,0,0,0,0
92824,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.040782,-0.666556,-0.454484,0.367935,-0.801525,-0.817237,-0.817322,-0.817125,-0.831934,1.382006,1.408241,1.428839,1.443622,1.451672,0.705564,0.689459,0.676414,0.666930,0.661200,-0.636308,-1.209618,0.924699,-0.591922,0.420037,0.863055,0.788339,0,0,0,0,0,0,0,0
34058,1,-0.186662,-0.170501,1.326859,1.370894,1.325897,-0.170456,-0.210199,-0.666556,-0.454484,0.332737,-0.816780,-0.817237,-0.817322,-0.801862,-0.877709,1.451833,1.454373,1.450863,1.441331,1.429761,0.662671,0.661043,0.663424,0.669823,0.687825,-0.636308,-0.980966,0.813321,-0.487150,0.443876,1.098074,0.027952,0,0,0,0,0,0,0,0
66755,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.210199,-0.666556,-0.454484,0.337324,-0.816780,-0.817237,-0.802057,-0.801862,-0.877709,1.451834,1.454373,1.450863,1.441331,1.429756,0.662671,0.661043,0.663461,0.669823,0.687860,-0.636308,-0.962276,0.813321,-0.486801,0.443876,1.108757,0.027952,0,0,0,0,0,0,0,0


Unnamed: 0,pv_measurement,is_summer_month
34059,0.0,0
34060,0.0,0
34061,0.0,0
34062,0.0,0
34063,0.0,0
...,...,...
34057,0.0,0
92824,-0.0,0
34058,0.0,0
66755,-0.0,0


Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60,is_summer_month
92826,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,0.300594,-0.877818,-0.878309,-0.878404,-0.878192,-0.877709,1.429914,1.409981,1.384423,1.353455,1.316595,0.687847,0.700234,0.715641,0.733491,0.753158,-0.636308,-0.179563,0.171291,-0.785071,0.466311,1.429238,-0.542338,0,0,0,0,0,0,0,1
92827,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,0.262332,-0.877818,-0.878309,-0.878404,-0.878192,-0.877709,1.316742,1.275519,1.229720,1.179671,1.125001,0.753177,0.774063,0.795304,0.816129,0.835540,-0.636308,-0.197491,0.404544,-0.632046,0.427048,1.397190,-0.542338,0,0,0,0,0,0,0,1
92828,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,0.225602,-0.877818,-0.878309,-0.863134,-0.862925,-0.862450,1.125138,1.067291,1.006141,0.941941,0.874427,0.835558,0.852900,0.867190,0.877573,0.883081,-0.636308,-0.082794,0.539248,-0.349222,0.279814,1.354459,-0.542338,0,0,0,0,0,0,0,1
92829,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.407802,-0.406154,0.185808,-0.862558,-0.863041,-0.863134,-0.862925,-0.862450,0.874550,0.804936,0.733072,0.659156,0.582931,0.883097,0.883175,0.876981,0.863832,0.843041,-0.564666,0.100893,-0.011230,-0.207941,0.132579,1.322411,-0.542338,0,0,0,1,1,1,1,1
92830,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,0.203921,-0.149778,0.153670,-0.862558,-0.863041,-0.863134,-0.847658,-0.847192,0.583039,0.505479,0.426422,0.345980,0.263961,0.843058,0.814304,0.777124,0.731122,0.676127,-0.251572,0.129119,0.054081,-0.261106,0.024606,1.268998,-0.542338,0,0,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94981,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.164007,-0.357061,-1.335478,1.136307,1.121680,1.106633,1.091125,1.075236,0.768346,0.836249,0.901806,0.964862,1.024589,0.721338,0.763001,0.797295,0.824802,0.846010,-0.313204,0.437782,0.579484,0.183687,-0.112813,-0.910275,0.218049,0,0,1,1,1,1,1,1
94982,3,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.463031,-0.428395,-1.304871,1.075274,1.060612,1.045556,1.030057,0.998949,1.024721,1.081943,1.136037,1.186757,1.233176,0.846027,0.861825,0.872809,0.879689,0.883056,-0.550014,0.486713,0.269257,0.009430,0.164830,-0.856861,-0.542338,0,0,1,1,1,1,1,1
94983,3,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.625536,-0.454484,-1.275792,0.998981,0.984277,0.969208,0.953727,0.922663,1.233319,1.276347,1.315297,1.349938,1.379297,0.883072,0.883857,0.882632,0.880033,0.876538,-0.634656,0.454759,0.485600,-0.231033,0.522401,-0.846179,-0.542338,0,0,1,0,0,0,0,1
94984,3,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.666556,-0.454484,-1.251304,0.922689,0.907942,0.892861,0.877397,0.861634,1.379448,1.404589,1.424835,1.440048,1.449330,0.876554,0.872908,0.869474,0.866617,0.864513,-0.636308,0.335565,0.962603,-0.313243,0.718714,-0.760717,0.218049,0,0,0,0,0,0,0,1


# Making model

In [87]:
#!ReWrite
def evaluate_models(models, X, Y):
    preds = X.iloc[:,1:2]

    for i in range(len(models)):
        preds[str(i)] = models[i].predict(X)


    preds = preds.iloc[:,1:]


    preds["final"] = preds.mean(axis=1)
    preds["losses"] = (preds["final"] - Y).apply(lambda a : np.abs(a))
    return preds["losses"].mean()

In [88]:
def get_predictions(models, X):
    preds = X.iloc[:,1:2]

    for i in range(len(models)):
        preds[str(i)] = models[i].predict(X, verbose=0)


    preds = preds.iloc[:,1:]
    return preds.mean(axis=1)

In [89]:
#partition into training and evalset
trainsetX = datasetX
trainsetY = datasetY

trainsetX_summer = trainsetX[trainsetX["is_summer_month"] == 1].drop(columns=["is_summer_month"])
trainsetY_summer = trainsetY[trainsetY["is_summer_month"] == 1].drop(columns=["is_summer_month"])
trainsetX_rest = trainsetX[trainsetX["is_summer_month"] != 1].drop(columns=["is_summer_month"])
trainsetY_rest = trainsetY[trainsetY["is_summer_month"] != 1].drop(columns=["is_summer_month"])

evalsetX_summer = trainsetX_summer.sample(frac=0.05, random_state=42)
evalsetY_summer = trainsetY_summer.sample(frac=0.05, random_state=42)
evalsetX = datasetX.sample(frac=0.05, random_state=42).drop(columns=["is_summer_month"])
evalsetY = datasetY.sample(frac=0.05, random_state=42).drop(columns=["is_summer_month"])

display(evalsetX)
display(evalsetY)

Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60
7364,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.625479,-0.454484,0.820958,-1.198243,-1.198914,-1.214331,-1.214041,-1.213368,0.296830,0.379858,0.461956,0.542946,0.622152,0.883229,0.882318,0.873223,0.856405,0.832338,-0.634778,0.197234,-0.194917,0.309246,-0.239014,-0.450918,-0.542338,0,0,1,0,0,0,0
88562,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.117702,-0.227465,0.415379,-0.526869,-0.511896,-0.496666,-0.481275,-0.465758,-0.567296,-0.645702,-0.722427,-0.797207,-0.869725,0.872797,0.856381,0.833449,0.804740,0.771022,-0.405960,0.615031,-0.363443,-0.043110,0.250366,1.279680,-0.542338,0,0,1,1,1,1,1
4478,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.043926,-0.567237,-0.453404,-2.071639,-0.282732,-0.282889,-0.282889,-0.282815,-0.267414,-1.421018,-1.400472,-1.374954,-1.344642,-1.309505,0.864198,0.868353,0.872859,0.877246,0.880840,-0.551707,-1.321145,0.653542,0.209672,0.327490,1.781768,1.168532,0,0,1,1,1,1,1
39387,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,-0.551881,1.212599,1.213283,1.198251,1.197984,1.182035,1.146120,1.089237,1.028876,0.965356,0.898377,0.806437,0.828324,0.847789,0.863900,0.875681,-0.636308,0.688054,-0.205997,0.640018,-0.344182,-0.087706,-0.542338,1,0,0,0,0,0,0
70420,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,-1.012545,-1.244021,-1.259987,-1.260142,-1.275104,-1.274396,0.684264,0.597841,0.510752,0.423386,0.335605,-0.608601,-0.470772,-0.335290,-0.203475,-0.076251,-0.636308,0.565730,-0.242151,1.002229,0.377970,0.061852,-0.542338,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2535,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,1.029695,1.358486,-0.481476,0.571740,0.587332,0.602741,0.617878,0.617515,-1.255470,-1.200781,-1.141018,-1.076767,-1.008459,-0.184244,-0.113995,-0.038731,0.040203,0.121730,1.072021,1.168236,-0.407178,1.396182,-1.726788,-0.461601,-0.542338,0,0,1,1,1,1,1
64041,2,-0.186662,3.410182,-0.168937,-0.173587,-0.174353,1.919158,0.382760,-0.666556,-0.454484,0.836259,-0.694714,-0.710370,-0.725709,-0.740795,-0.770905,-1.130250,-1.075804,-1.018179,-0.957569,-0.894133,0.877178,0.868244,0.854602,0.835533,0.810289,-0.636308,-1.390178,0.962020,-0.778279,1.100123,0.937834,1.358629,1,0,0,0,0,0,0
74725,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,1.333857,3.783847,0.214888,2.280694,2.312515,2.328194,2.358201,2.387370,-1.418972,-1.387305,-1.346013,-1.295981,-1.237968,-2.027855,-1.974672,-1.905100,-1.820261,-1.720768,2.515715,1.049983,-0.897593,0.627014,-0.628836,-1.380314,-0.542338,0,0,1,1,1,1,1
59010,2,-0.186662,-0.170501,-0.168937,-0.173587,12.427740,-0.170456,-0.210199,-0.666556,-0.454484,-0.669721,0.098730,0.098788,0.098849,0.098835,0.098766,0.519075,0.431329,0.343602,0.256144,0.168823,-0.441155,-0.304383,-0.172110,-0.045461,0.074933,-0.636308,1.356883,0.273339,1.066889,-0.983602,1.856547,-0.542338,0,0,0,0,0,0,0


Unnamed: 0,pv_measurement
7364,2.20
88562,0.00
4478,39.16
39387,0.00
70420,0.00
...,...
2535,2833.38
64041,-0.00
74725,735.00
59010,0.00


In [90]:
tf.keras.utils.set_random_seed(42) #seed for reproducability

numModels = 10

models = []

i = 0
while(i < numModels):

  models.append(tf.keras.models.Sequential([
        #tf.keras.layers.GaussianNoise(stddev=0.1, seed=42),
        tf.keras.layers.Dense(130, activation="tanh",
          kernel_initializer=tf.keras.initializers.RandomUniform(-1, 1),
                              bias_initializer=tf.keras.initializers.Zeros()),
        tf.keras.layers.Dense(130, activation="relu",
          kernel_initializer=tf.keras.initializers.GlorotNormal()),
        tf.keras.layers.Dense(1, activation="relu",
          kernel_initializer=tf.keras.initializers.GlorotNormal()),
    ]))

  models[i].compile(
      optimizer=tf.keras.optimizers.legacy.Adadelta(learning_rate=1),
      loss="mean_absolute_error"
  )
  
  models[i].fit(
                      x = trainsetX_rest,
                      y = trainsetY_rest,
                      batch_size = 1000,
                      epochs = 1,
                      verbose = 0,
                      # validation_data = [evalsetX, evalsetY]
                  )
  models[i].fit(
                    x = trainsetX_summer,
                    y = trainsetY_summer,
                    batch_size = 1000,
                    epochs = 1,
                    verbose = 0,
                    # validation_data = [evalsetX, evalsetY]
                )

  #retry if model is nonsensical
  loss = models[i].evaluate(evalsetX, evalsetY, verbose=0)

  # print("Loss:", loss)
  # print("Mean + 4:", evalsetY.mean() + 4)
  # print("Mean - 4:", evalsetY.mean() - 4) 
  
  if((loss < evalsetY.mean() + 4).any() and (loss > evalsetY.mean() - 4).any()):
    print("discarding")
    del models[i]
    continue
  models[i].fit(
                      x = trainsetX_rest,
                      y = trainsetY_rest,
                      batch_size = 1000,
                      epochs = 80,
                      verbose = 0,
                      validation_data = [evalsetX, evalsetY]
                  )
  models[i].fit(
                    x = trainsetX_summer,
                    y = trainsetY_summer,
                    batch_size = 1000,
                    epochs = 80,
                    verbose = 0,
                    validation_data = [evalsetX, evalsetY]
                  )
                  
  loss = models[i].evaluate(evalsetX, evalsetY)
  print(i, " complete")
  i+=1

summer_months = dataset[dataset['is_summer_month'] == 1]
summer_monthsX = summer_months.drop(columns=["pv_measurement", "merge_time", 'is_summer_month'], axis=1)
summer_monthsY = summer_months['pv_measurement']

evaluation_summer = evaluate_models(models, summer_monthsX, summer_monthsY)
print(evaluation_summer)

0  complete
1  complete
2  complete
3  complete
4  complete
5  complete
6  complete
7  complete
8  complete
9  complete
148.16246932613524


# Prediction

In [73]:
preds = get_predictions(models, evalsetX)
preds

50493      0.000000
63811      0.157391
45257      0.000000
76501    104.997360
43051      0.000000
            ...    
40124      0.000000
56580     35.212097
68276      0.000000
3664      28.444584
29353      0.000000
Length: 4641, dtype: float32

# Evaluation

In [100]:
display(datasetX)
display(datasetY)

Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60,is_summer_month
34059,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,1.322125,-0.125490,-0.666556,-0.454484,-1.745648,-0.175926,-0.191290,-0.206542,-0.221752,-0.236898,1.417181,1.381654,1.335617,1.280196,1.215952,-2.684689,-2.623520,-2.543428,-2.446129,-2.332063,-0.636308,-0.954170,0.527585,0.328219,1.083295,1.066026,-0.542338,1,0,0,0,0,0,0,0
34060,2,-0.186662,-0.170501,1.626018,1.679790,1.625946,-0.170456,0.043926,-0.666556,-0.454484,-1.803807,-0.236959,-0.221822,-0.221812,-0.221752,-0.206381,1.216094,1.145500,1.069340,0.988760,0.904230,-2.331952,-2.205003,-2.066631,-1.918751,-1.762539,-0.636308,-1.135927,0.882130,0.251730,1.027206,1.365142,-0.542338,0,0,0,0,0,0,0,0
34061,2,-0.186662,1.619840,-0.168937,-0.173587,-0.174353,-0.170456,0.043926,-0.666556,-0.454484,-1.812989,-0.206440,-0.221822,-0.237083,-0.252285,-0.267414,0.904355,0.817801,0.729607,0.640405,0.550273,-1.762446,-1.601547,-1.437231,-1.271386,-1.104887,-0.636308,-1.258482,0.862887,0.173940,0.952887,1.493334,0.598242,0,0,0,0,0,0,0,0
34062,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,1.620642,0.043926,-0.666556,-0.454484,-1.788501,-0.267478,-0.282889,-0.313430,-0.328615,-0.358955,0.550379,0.460569,0.371069,0.282173,0.193736,-1.104812,-0.940452,-0.779228,-0.622238,-0.470318,-0.636308,-1.501494,0.872800,0.141581,1.073480,1.482652,1.358629,1,0,0,0,0,0,0,0
34063,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.043926,-0.666556,-0.454484,-1.745643,-0.359025,-0.313425,-0.267623,-0.221752,-0.175869,0.193823,0.106591,0.020487,-0.064438,-0.148280,-0.470263,-0.324940,-0.186923,-0.056700,0.065399,-0.636308,-1.511920,0.858805,0.155424,1.244553,0.628035,0.978436,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34057,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.210199,-0.666556,-0.454484,0.360284,-0.786266,-0.786706,-0.802057,-0.817125,-0.816675,1.382046,1.408272,1.428863,1.443634,1.451678,0.706763,0.690744,0.677778,0.668324,0.662649,-0.636308,-1.105985,0.910704,-0.551059,0.391992,0.991247,0.408146,0,0,0,0,0,0,0,0
92824,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.040782,-0.666556,-0.454484,0.367935,-0.801525,-0.817237,-0.817322,-0.817125,-0.831934,1.382006,1.408241,1.428839,1.443622,1.451672,0.705564,0.689459,0.676414,0.666930,0.661200,-0.636308,-1.209618,0.924699,-0.591922,0.420037,0.863055,0.788339,0,0,0,0,0,0,0,0
34058,1,-0.186662,-0.170501,1.326859,1.370894,1.325897,-0.170456,-0.210199,-0.666556,-0.454484,0.332737,-0.816780,-0.817237,-0.817322,-0.801862,-0.877709,1.451833,1.454373,1.450863,1.441331,1.429761,0.662671,0.661043,0.663424,0.669823,0.687825,-0.636308,-0.980966,0.813321,-0.487150,0.443876,1.098074,0.027952,0,0,0,0,0,0,0,0
66755,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.210199,-0.666556,-0.454484,0.337324,-0.816780,-0.817237,-0.802057,-0.801862,-0.877709,1.451834,1.454373,1.450863,1.441331,1.429756,0.662671,0.661043,0.663461,0.669823,0.687860,-0.636308,-0.962276,0.813321,-0.486801,0.443876,1.108757,0.027952,0,0,0,0,0,0,0,0


Unnamed: 0,pv_measurement,is_summer_month
34059,0.0,0
34060,0.0,0
34061,0.0,0
34062,0.0,0
34063,0.0,0
...,...,...
34057,0.0,0
92824,-0.0,0
34058,0.0,0
66755,-0.0,0


In [101]:
summer_months = dataset[dataset['is_summer_month'] == 1]
summer_monthsX = summer_months.drop(columns=["pv_measurement", "merge_time", 'is_summer_month'], axis=1)
summer_monthsY = summer_months['pv_measurement']

evaluation_summer = evaluate_models(models, summer_monthsX, summer_monthsY)
print(evaluation_summer)

evaluation_total = evaluate_models(models, datasetX.drop(columns=['is_summer_month'], axis=1), datasetY.drop(['is_summer_month'], axis=1))
print(evaluation_total)

148.16246932613524


: 

# Predict test data

In [92]:
display(summer_monthsX)
display(testset)

Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60
36901,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,1.919158,0.975719,-0.666556,-0.454484,0.421507,-0.160666,-0.160754,-0.145465,-0.145422,-0.130094,1.429770,1.409788,1.384182,1.353167,1.316261,0.687637,0.700065,0.715479,0.733369,0.753072,-0.636308,-1.748978,0.962603,-0.161842,1.121156,-0.846179,2.879402,1,0,0,0,0,0,0
36902,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.382760,-0.666556,-0.454484,0.354161,-0.130147,-0.130218,-0.145465,-0.145422,-0.160611,1.316408,1.275141,1.229300,1.179211,1.124502,0.753091,0.773984,0.795280,0.816108,0.835540,-0.636308,-1.696730,0.962603,0.007625,1.136581,-0.835496,3.639789,0,0,0,0,0,0,0
36903,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.128635,-0.666556,-0.454484,0.294475,-0.160666,-0.160754,-0.160735,-0.175955,-0.175869,1.124639,1.066757,1.005573,0.941360,0.873798,0.835558,0.852914,0.867200,0.877585,0.883085,-0.636308,-1.648416,0.962603,0.027971,1.123960,-1.017102,4.400176,0,0,0,0,0,0,0
36904,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.467468,-0.600250,-0.454201,0.239376,-0.175926,-0.176022,-0.176005,-0.175955,-0.160611,0.873922,0.804281,0.732393,0.658477,0.582210,0.883101,0.883169,0.876962,0.863798,0.842977,-0.564555,-1.661819,0.962603,0.107458,1.090306,-1.102564,2.689306,0,0,0,1,1,1,1
36905,2,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,0.467468,-0.364986,-0.451000,0.176635,-0.160666,-0.160754,-0.160735,-0.145422,-0.145353,0.582318,0.504742,0.425670,0.345216,0.263188,0.842994,0.814221,0.776994,0.730999,0.675947,-0.251263,-1.671098,0.962603,0.364326,1.062262,-1.123929,1.358629,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27717,1,-0.186662,1.023060,0.429381,2.915374,0.425747,2.814707,0.128635,-0.508407,-0.454378,0.274583,0.800618,0.801070,0.785977,0.770538,0.754831,0.710753,0.782187,0.851423,0.918295,0.981941,0.822449,0.847525,0.865531,0.877109,0.882815,-0.518786,-1.344136,0.962603,-0.482272,0.812664,-0.824813,0.408146,1,0,1,1,1,1,1
27718,1,-0.186662,2.813401,0.728540,1.061998,0.725797,0.128060,1.145136,-0.655556,-0.454484,0.266923,0.754844,0.740003,0.724895,0.709475,0.693802,0.982070,1.043252,1.101327,1.156042,1.206406,0.882830,0.883642,0.880234,0.873390,0.863813,-0.634416,-1.545456,0.960853,-0.391234,1.072078,-1.113246,0.598242,1,0,1,0,0,0,0
27719,1,-0.186662,0.127889,2.822655,2.297582,4.326395,4.904321,0.891011,-0.666556,-0.454484,0.242440,0.693811,0.678935,0.663818,0.663674,0.648027,1.206547,1.253440,1.296108,1.334293,1.366971,0.863829,0.852523,0.840183,0.827524,0.815124,-0.636308,-1.563675,0.960853,-0.373671,1.212301,-1.220073,1.548726,1,0,0,0,0,0,0
27720,1,-0.186662,4.902133,2.822655,2.915374,0.425747,0.128060,0.552177,-0.666556,-0.454484,0.239376,0.648033,0.648399,0.648547,0.648408,0.648027,1.367121,1.395306,1.418290,1.435878,1.447158,0.815141,0.803854,0.794098,0.786318,0.780815,-0.636308,-1.567587,0.962603,-0.421958,1.285217,-1.070515,2.119016,1,0,0,0,0,0,0


Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,ceiling_height_agl:ft,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,precip_type_5min:idx,snow_density:kgm3,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60,is_summer_month
92826,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,0.300594,-0.877818,-0.878309,-0.878404,-0.878192,-0.877709,1.429914,1.409981,1.384423,1.353455,1.316595,0.687847,0.700234,0.715641,0.733491,0.753158,-0.636308,-0.179563,0.171291,-0.785071,0.466311,1.429238,-0.542338,0,0,0,0,0,0,0,1
92827,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,0.262332,-0.877818,-0.878309,-0.878404,-0.878192,-0.877709,1.316742,1.275519,1.229720,1.179671,1.125001,0.753177,0.774063,0.795304,0.816129,0.835540,-0.636308,-0.197491,0.404544,-0.632046,0.427048,1.397190,-0.542338,0,0,0,0,0,0,0,1
92828,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.666556,-0.454484,0.225602,-0.877818,-0.878309,-0.863134,-0.862925,-0.862450,1.125138,1.067291,1.006141,0.941941,0.874427,0.835558,0.852900,0.867190,0.877573,0.883081,-0.636308,-0.082794,0.539248,-0.349222,0.279814,1.354459,-0.542338,0,0,0,0,0,0,0,1
92829,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.407802,-0.406154,0.185808,-0.862558,-0.863041,-0.863134,-0.862925,-0.862450,0.874550,0.804936,0.733072,0.659156,0.582931,0.883097,0.883175,0.876981,0.863832,0.843041,-0.564666,0.100893,-0.011230,-0.207941,0.132579,1.322411,-0.542338,0,0,0,1,1,1,1,1
92830,1,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,0.203921,-0.149778,0.153670,-0.862558,-0.863041,-0.863134,-0.847658,-0.847192,0.583039,0.505479,0.426422,0.345980,0.263961,0.843058,0.814304,0.777124,0.731122,0.676127,-0.251572,0.129119,0.054081,-0.261106,0.024606,1.268998,-0.542338,0,0,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94981,3,-0.186662,-0.170501,-0.168937,-0.173587,-0.174353,-0.170456,-0.379616,-0.164007,-0.357061,-1.335478,1.136307,1.121680,1.106633,1.091125,1.075236,0.768346,0.836249,0.901806,0.964862,1.024589,0.721338,0.763001,0.797295,0.824802,0.846010,-0.313204,0.437782,0.579484,0.183687,-0.112813,-0.910275,0.218049,0,0,1,1,1,1,1,1
94982,3,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.463031,-0.428395,-1.304871,1.075274,1.060612,1.045556,1.030057,0.998949,1.024721,1.081943,1.136037,1.186757,1.233176,0.846027,0.861825,0.872809,0.879689,0.883056,-0.550014,0.486713,0.269257,0.009430,0.164830,-0.856861,-0.542338,0,0,1,1,1,1,1,1
94983,3,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.625536,-0.454484,-1.275792,0.998981,0.984277,0.969208,0.953727,0.922663,1.233319,1.276347,1.315297,1.349938,1.379297,0.883072,0.883857,0.882632,0.880033,0.876538,-0.634656,0.454759,0.485600,-0.231033,0.522401,-0.846179,-0.542338,0,0,1,0,0,0,0,1
94984,3,-0.186662,0.000000,0.000000,0.000000,0.000000,-0.170456,-0.379616,-0.666556,-0.454484,-1.251304,0.922689,0.907942,0.892861,0.877397,0.861634,1.379448,1.404589,1.424835,1.440048,1.449330,0.876554,0.872908,0.869474,0.866617,0.864513,-0.636308,0.335565,0.962603,-0.313243,0.718714,-0.760717,0.218049,0,0,0,0,0,0,0,1


In [96]:
#testset = testset.drop(columns=["is_summer_month"])
preds = get_predictions(models, testset)
preds = preds.reset_index()
preds = preds.drop(columns=["index"])
preds = preds.rename(columns={"pv_measurement": "prediction"})
preds = preds.reset_index()
preds = preds.rename(columns={"index": "id"})
preds

Unnamed: 0,id,0
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,42.520947
4,4,350.143982
...,...,...
2155,2155,35.684811
2156,2156,10.930169
2157,2157,0.000000
2158,2158,0.000000


In [98]:
preds = preds.rename(columns={0: "prediction"})
preds

Unnamed: 0,id,prediction
0,0,0.000000
1,1,0.000000
2,2,0.000000
3,3,42.520947
4,4,350.143982
...,...,...
2155,2155,35.684811
2156,2156,10.930169
2157,2157,0.000000
2158,2158,0.000000


In [99]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission = sample_submission[['id']].merge(preds[['id', 'prediction']], on='id', how='left')
sample_submission.to_csv('Ensamble3_double_training.csv', index=False)