# Import Libraries

In [65]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import random
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import autogluon

%matplotlib inline

# Let pandas display up to 200 rows and 200 columns before it truncates output
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

# Read Dataset

In [66]:
# Read the training data for the three locations (folders A, B and C):
# the targets plus the "estimated" and "observed" feature files.
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')
X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')
X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')
# Read the test feature files for the same three locations
X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')

# Find intervals with broken data in B

In [67]:
# Change from the previous row; an exact 0 means the measurement repeated itself
train_b['diff'] = train_b['pv_measurement'].diff()

# Flag rows that repeat the previous value while being neither zero nor missing
_b_repeats_previous = train_b['diff'] == 0
_b_is_nonzero = train_b['pv_measurement'] != 0
_b_is_present = train_b['pv_measurement'].notna()
constant_intervals_B = train_b[_b_repeats_previous & _b_is_nonzero & _b_is_present]

# A new run starts wherever consecutive flagged rows are not exactly one hour apart;
# the cumulative sum of those breaks gives every run its own id.
_b_run_id = (constant_intervals_B['time'].diff() != pd.Timedelta(hours=1)).cumsum()

# Summarise every run: first/last timestamp, number of flagged rows and the repeated value
grouped_intervals_B = (
    constant_intervals_B
    .assign(group=_b_run_id)
    .groupby('group')
    .agg(
        start_time=('time', 'first'),
        end_time=('time', 'last'),
        count=('time', 'count'),
        constant_value=('pv_measurement', 'first'),
    )
    .reset_index(drop=True)
)

# Only runs with at least 3 flagged rows are treated as "broken"; shorter ones are ignored
long_constant_intervals_B = grouped_intervals_B[grouped_intervals_B['count'] >= 3]

#long_constant_intervals_B.head(200)  # Display the first few rows of long constant intervals in B


In [68]:
def replace_with_values_from_C(train_b, train_c, constant_intervals,
                               extra_intervals=(('2021-03-08 15:00:00', '2021-04-19 11:00:00'),)):
    """
    Replace values in dataset B's constant intervals with corresponding values from dataset C.

    For every interval, each timestamp on the hourly grid anchored at the interval
    start (up to and including the interval end) that also exists in C gets B's
    ``pv_measurement`` overwritten with C's value for that timestamp. Timestamps
    that are missing from C are left untouched in B. The input frames are not modified.

    :param train_b: DataFrame for dataset B (needs 'time' and 'pv_measurement').
    :param train_c: DataFrame for dataset C (needs 'time' and 'pv_measurement').
    :param constant_intervals: DataFrame with 'start_time' and 'end_time' columns
        describing the constant intervals in B to be replaced.
    :param extra_intervals: Additional (start, end) pairs replaced in the same way.
        The default is the single range that was previously hard-coded in the body
        (described there as a stretch of constant 0.0 values in B).
    :return: Modified copy of dataset B with values replaced from C.
    """
    train_b_copy = train_b.copy()

    # One value per timestamp in C; on duplicates the first occurrence wins,
    # which matches the previous `.values[0]` lookup.
    c_values = (train_c
                .drop_duplicates(subset='time', keep='first')
                .set_index('time')['pv_measurement'])

    intervals = [(row['start_time'], row['end_time'])
                 for _, row in constant_intervals.iterrows()]
    intervals.extend(extra_intervals)

    b_time = train_b_copy['time']
    has_match_in_c = b_time.isin(c_values.index)
    one_hour = pd.Timedelta(hours=1)

    for start_time, end_time in intervals:
        start_time, end_time = pd.Timestamp(start_time), pd.Timestamp(end_time)

        # Same selection as walking an hourly date range from start to end:
        # inside the interval AND a whole number of hours after the start.
        on_hourly_grid = ((b_time - start_time) % one_hour) == pd.Timedelta(0)
        to_replace = b_time.between(start_time, end_time) & on_hourly_grid & has_match_in_c

        if to_replace.any():
            # Look every selected timestamp up in C in one vectorised step
            train_b_copy.loc[to_replace, 'pv_measurement'] = b_time[to_replace].map(c_values)

    return train_b_copy

# Replace values in dataset B using dataset C for the identified constant intervals
train_b_modified = replace_with_values_from_C(train_b, train_c, long_constant_intervals_B)

# Show B before and after the replacement for a visual comparison
display(train_b)
display(train_b_modified)

# Continue with the repaired series under the original name, without the helper 'diff' column
train_b = train_b_modified.drop(columns=['diff'])

# fig, axs = plt.subplots(3, 1, figsize=(20, 10), sharex=True)
# axs[0].plot(train_b_modified['time'], train_b_modified['pv_measurement'])
# axs[0].set_title('Dataset B (Modified)')
# axs[1].plot(train_c['time'], train_c['pv_measurement'])
# axs[1].set_title('Dataset C')
# axs[2].plot(train_b['time'], train_b['pv_measurement'])
# axs[2].set_title('Dataset B')
# plt.show()

Unnamed: 0,time,pv_measurement,diff
0,2018-12-31 23:00:00,0.000000,
1,2019-01-01 00:00:00,0.000000,0.000000
2,2019-01-01 01:00:00,0.000000,0.000000
3,2019-01-01 02:00:00,0.000000,0.000000
4,2019-01-01 03:00:00,0.000000,0.000000
...,...,...,...
32843,2023-04-30 19:00:00,0.828587,-44.434448
32844,2023-04-30 20:00:00,-0.000000,-0.828587
32845,2023-04-30 21:00:00,-0.000000,0.000000
32846,2023-04-30 22:00:00,-0.000000,0.000000


Unnamed: 0,time,pv_measurement,diff
0,2018-12-31 23:00:00,0.000000,
1,2019-01-01 00:00:00,0.000000,0.000000
2,2019-01-01 01:00:00,0.000000,0.000000
3,2019-01-01 02:00:00,0.000000,0.000000
4,2019-01-01 03:00:00,0.000000,0.000000
...,...,...,...
32843,2023-04-30 19:00:00,0.828587,-44.434448
32844,2023-04-30 20:00:00,-0.000000,-0.828587
32845,2023-04-30 21:00:00,-0.000000,0.000000
32846,2023-04-30 22:00:00,-0.000000,0.000000


In [69]:
# Preview train_b after the repair step
train_b.head()

Unnamed: 0,time,pv_measurement
0,2018-12-31 23:00:00,0.0
1,2019-01-01 00:00:00,0.0
2,2019-01-01 01:00:00,0.0
3,2019-01-01 02:00:00,0.0
4,2019-01-01 03:00:00,0.0


# Fix metric on ceiling_height_agl:m and cloud_base_agl:m

Most likely, **ceiling_height_agl:m** and **cloud_base_agl:m** start out with values in meters and then abruptly switch to feet on 26.03.2020. Therefore, I convert the values before 26.03.2020 into feet.

In [70]:
# Metres-to-feet factor (1 meter = 3.28084 feet)
conversion_factor = 3.28084

# Rows whose forecast date lies before the unit switch on 2020-03-26
mask_a = X_train_observed_a['date_forecast'] < '2020-03-26'
mask_b = X_train_observed_b['date_forecast'] < '2020-03-26'
mask_c = X_train_observed_c['date_forecast'] < '2020-03-26'

# Scale both height columns in place, only for the rows selected above
for observed_frame, before_switch in (
    (X_train_observed_a, mask_a),
    (X_train_observed_b, mask_b),
    (X_train_observed_c, mask_c),
):
    for height_column in ('ceiling_height_agl:m', 'cloud_base_agl:m'):
        observed_frame.loc[before_switch, height_column] *= conversion_factor

As the features have a 0.83 correlation, I choose to go forward with ceiling height

# Concat data

In [71]:

# Tag every target/feature frame with the location it was read for
for site, site_frames in (
    ("A", (train_a, X_train_estimated_a, X_train_observed_a, X_test_estimated_a)),
    ("B", (train_b, X_train_estimated_b, X_train_observed_b, X_test_estimated_b)),
    ("C", (train_c, X_train_estimated_c, X_train_observed_c, X_test_estimated_c)),
):
    for site_frame in site_frames:
        site_frame["location"] = site

def fill_last(frame):
    """Append a 00:00 row copied from the 23:45 row wherever 00:00 is missing.

    Every row is shifted forward by 15 minutes; the shifted rows that land
    exactly on midnight (i.e. the original 23:45 rows) are appended to the
    frame. If a midnight timestamp already exists in ``frame``, the original
    row wins and the copied one is discarded.

    :param frame: DataFrame with a 'date_forecast' column.
    :return: New DataFrame with the extra midnight rows appended at the end
        (original index labels are kept, so labels may repeat).
    """
    shifted = frame.copy()
    shifted["date_forecast"] = pd.to_datetime(shifted["date_forecast"]) + pd.Timedelta(minutes=15)

    # After the shift, only former 23:45 rows sit exactly on 00:00
    stamps = shifted["date_forecast"]
    lands_on_midnight = (stamps.dt.hour == 0) & (stamps.dt.minute == 0)

    # Same column order and dtypes as the input so the concat stays homogeneous
    midnight_rows = shifted.loc[lands_on_midnight, frame.columns].astype(frame.dtypes)

    combined = pd.concat([frame, midnight_rows])

    # keep="first" prefers rows that were already present over the copies
    return combined.drop_duplicates(subset="date_forecast", keep="first")

# Add the missing 00:00 row to each test frame (copied from 23:45, see fill_last)
X_test_estimated_a = fill_last(X_test_estimated_a.copy())
X_test_estimated_b = fill_last(X_test_estimated_b.copy())
X_test_estimated_c = fill_last(X_test_estimated_c.copy())

# Drop the last row of each observed frame (the author's "extra minute 00 sample")
X_train_observed_a = X_train_observed_a.iloc[:-1,:]
X_train_observed_b = X_train_observed_b.iloc[:-1,:]
X_train_observed_c = X_train_observed_c.iloc[:-1,:]

# Add a date_calc column (copy of date_forecast) as the first column of the observed data.
# NOTE(review): presumably so the observed frames line up with the estimated/test frames,
# which appear to carry date_calc already - confirm. insert() raises if the column
# already exists, so this cell cannot be re-run without reloading the data.
X_train_observed_a.insert(0, "date_calc", X_train_observed_a["date_forecast"])
X_train_observed_b.insert(0, "date_calc", X_train_observed_b["date_forecast"])
X_train_observed_c.insert(0, "date_calc", X_train_observed_c["date_forecast"])

# Stack all feature frames (train observed, train estimated, test) into one frame.
# date_calc is kept here, and the original index labels are kept too, so they repeat.
X_train_raw = pd.concat([X_train_observed_a,
                     X_train_observed_b,
                     X_train_observed_c,
                     X_train_estimated_a,
                     X_train_estimated_b,
                     X_train_estimated_c,
                     X_test_estimated_a,
                     X_test_estimated_b,
                     X_test_estimated_c])


## Clean and preprocess data

In [72]:

# Reduce snow density to a presence flag: 1 where a value exists, 0 where it is missing
X_train_raw["snow_density:kgm3"] = X_train_raw["snow_density:kgm3"].isna().map({True: 0, False: 1})

# Missing ceiling heights become -666 because the docs hint at it. The height is stored
# under a ":ft" name and both ":m" columns are removed; cloud base is dropped entirely
# because of its high correlation with ceiling height.
X_train_raw["ceiling_height_agl:ft"] = X_train_raw["ceiling_height_agl:m"].fillna(-666)
X_train_raw = X_train_raw.drop(columns=["ceiling_height_agl:m", "cloud_base_agl:m"])

# Bucket the ceiling height into ordered categories (category limits found on google);
# everything at or below 0, including the -666 placeholder, gets the label -666.
ceiling_bin_edges = [float('-inf'), 0, 500, 1000, 3000, 5000, 12000, float('inf')]
ceiling_bin_labels = [-666, 1, 2, 3, 4, 5, 6]
X_train_raw['ceiling_height_agl:ft'] = pd.cut(
    X_train_raw['ceiling_height_agl:ft'],
    bins=ceiling_bin_edges,
    labels=ceiling_bin_labels,
)

# Replace the numeric dew/rime index by a readable label
dew_or_rime_categories = {
    -1.0: "Rime",
    0.0: "None",
    1.0: "Dew",
}
X_train_raw['dew_or_rime:idx'] = X_train_raw['dew_or_rime:idx'].map(dew_or_rime_categories)

# The day flag is categorical, so store it as an integer instead of a float
X_train_raw['is_day:idx'] = X_train_raw['is_day:idx'].astype(int)

# Remove some weird artifacts from the train_b target values.
# NOTE(review): 18690 and 20142 are hard-coded row positions; they are only valid for
# the exact train_b file this notebook was written against - confirm before reuse.
train_b = pd.concat([train_b[:18690], train_b[20142:]])
train_b["rolling"] = train_b["pv_measurement"].rolling(4).mean()

# Keep a row when it differs from its 4-row rolling mean (i.e. it is not part of a flat
# stretch) OR when it is exactly zero. The previous one-liner relied on
# `a - b != 0 + c`, which Python parses as `(a - b) != (0 + c)` because `+` binds
# tighter than `!=`; the intended OR is spelled out explicitly here.
_b_is_zero = train_b["pv_measurement"] == 0
_b_differs_from_rolling = (train_b["pv_measurement"] - train_b["rolling"]) != 0
train_b["keep"] = _b_differs_from_rolling | _b_is_zero
train_b = train_b[train_b["keep"]]

# Drop the two helper columns by name instead of relying on column positions
train_b = train_b.drop(columns=["rolling", "keep"])

# Rows to predict; 'time' is parsed as datetime so it can be merged with the features
parse_dates = ['time']
X_test_targets = pd.read_csv("test.csv", parse_dates=parse_dates)

# Training rows get the placeholder id -10
train_a["id"] = -10
train_b["id"] = -10
train_c["id"] = -10

# Give the test frame the same target column name as the training frames
X_test_targets = X_test_targets.rename(columns = {"prediction" : "pv_measurement"})

# Stack train and test rows with train_a's dtypes and drop every row containing a NaN
targets = pd.concat([train_a,
                     train_b.astype(train_a.dtypes),
                     train_c.astype(train_a.dtypes),
                     X_test_targets.astype(train_a.dtypes)]).dropna()


## Split into timeframes

In [73]:

# Minute-of-hour of every feature row, taken before date_forecast is shifted below
forecast_minute = X_train_raw["date_forecast"].dt.minute

# One frame per quarter-hour offset; merge_time is the full hour the sample belongs to
features00 = X_train_raw[forecast_minute == 0].copy()
features00["merge_time"] = features00["date_forecast"]

features15 = X_train_raw[forecast_minute == 15].copy()
features15["merge_time"] = features15["date_forecast"] - pd.Timedelta(minutes=15)

features30 = X_train_raw[forecast_minute == 30].copy()
features30["merge_time"] = features30["date_forecast"] - pd.Timedelta(minutes=30)

features45 = X_train_raw[forecast_minute == 45].copy()
features45["merge_time"] = features45["date_forecast"] - pd.Timedelta(minutes=45)

# The +60 sample is the on-the-hour row of the NEXT hour, so shift every timestamp back
# by one hour first. This modifies X_train_raw itself; re-running the cell shifts again.
X_train_raw["date_forecast"] = X_train_raw["date_forecast"] - pd.Timedelta(minutes=60)
features60 = X_train_raw[forecast_minute == 0].copy()
features60["merge_time"] = features60["date_forecast"]

# The targets are the base table that the features get merged onto
dataset = targets.rename(columns={"time": "merge_time"})

display(X_train_raw)

Unnamed: 0,date_calc,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location,ceiling_height_agl:ft
0,2019-06-02 22:00:00,2019-06-02 21:00:00,7.7,1.230,0.0,0.0,,280.299988,0.0,0.0,0.0,0.0,98.699997,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1006.799988,0.0,0.0,994.200012,1000.299988,0.0,0.0,73.099998,1006.299988,0,0.0,0.0,-0.0,0.1,342.834015,-3.202,0.0,285.899994,100.000000,39640.101562,3.7,-3.6,-0.8,-0.0,A,5
1,2019-06-02 22:15:00,2019-06-02 21:15:00,7.7,1.229,0.0,0.0,,280.299988,0.0,0.0,0.0,0.0,99.000000,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1006.500000,0.0,0.0,993.900024,999.900024,0.0,0.0,72.199997,1006.000000,0,0.0,0.0,-0.0,0.2,346.294006,-3.650,0.0,286.100006,100.000000,40123.898438,3.6,-3.6,-0.6,-0.0,A,5
2,2019-06-02 22:30:00,2019-06-02 21:30:00,7.7,1.228,0.0,0.0,,280.299988,0.0,0.0,0.0,0.0,99.199997,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1006.099976,0.0,0.0,993.599976,999.599976,0.0,0.0,71.199997,1005.599976,0,0.0,0.0,-0.0,0.2,349.768005,-3.998,0.0,286.299988,100.000000,40628.300781,3.6,-3.6,-0.4,-0.0,A,5
3,2019-06-02 22:45:00,2019-06-02 21:45:00,7.7,1.226,0.0,0.0,,280.299988,0.0,0.0,0.0,0.0,99.400002,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1005.799988,0.0,0.0,993.299988,999.299988,0.0,0.0,70.199997,1005.299988,0,0.0,0.0,-0.0,0.2,353.251007,-4.247,0.0,286.600006,100.000000,41153.601562,3.5,-3.5,-0.2,-0.0,A,5
4,2019-06-02 23:00:00,2019-06-02 22:00:00,7.7,1.225,0.0,0.0,,280.299988,0.0,0.0,0.0,0.0,99.599998,6.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1005.500000,0.0,0.0,993.000000,999.000000,0.0,0.0,69.199997,1005.000000,0,0.0,0.0,-0.0,0.2,356.742004,-4.393,0.0,286.799988,100.000000,41699.898438,3.5,-3.5,0.0,-0.0,A,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2207,2023-06-18 07:00:05,2023-06-19 23:00:00,10.7,1.193,0.0,0.0,,285.600006,0.0,0.0,0.0,0.0,27.799999,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1011.000000,0.0,0.0,995.299988,1001.200012,0.0,0.0,59.200001,1007.000000,0,0.0,0.0,-0.0,0.0,6.356000,-3.011,0.0,293.700012,43.500000,46996.800781,4.0,-2.0,3.5,-0.0,C,5
2303,2023-06-21 07:00:30,2023-06-22 23:00:00,9.4,1.228,0.0,0.0,,283.299988,0.0,0.0,0.0,0.0,95.599998,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1013.900024,0.0,0.0,997.799988,1003.799988,0.0,0.0,83.199997,1009.900024,0,0.0,0.0,-0.0,0.0,6.206000,-2.996,0.0,285.399994,96.800003,33542.898438,1.3,-0.6,1.1,-0.0,C,3
2399,2023-06-25 07:01:23,2023-06-26 23:00:00,9.8,1.187,0.0,0.0,,284.200012,0.0,0.0,0.0,0.0,2.000000,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1004.799988,0.0,0.0,989.200012,995.000000,0.0,0.0,53.099998,1000.799988,0,0.0,0.0,-0.0,0.0,6.015000,-3.071,0.0,295.500000,2.000000,48980.699219,3.1,-2.0,2.5,-0.0,C,-666
2687,2023-06-29 07:00:05,2023-06-30 23:00:00,9.3,1.220,0.0,0.0,,283.100006,0.0,0.0,0.0,0.0,100.000000,24.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1004.000000,0.0,0.0,989.200012,995.200012,0.0,0.1,88.800003,1001.200012,0,0.0,0.0,-0.0,0.2,5.837000,-3.254,0.1,284.399994,100.000000,9935.700195,1.8,1.6,-0.9,0.0,C,3


# Define helper functions

In [74]:
def add_feature_average_00_60(dataset, f00, f15, f30, f45, f60, column_name, categorical=False):
    """Add ``column_name`` as the mean of its +00, +15, +30, +45 and +60 samples.

    Each sample frame is inner-joined onto ``dataset`` on (location, merge_time),
    so rows without a match in any of the five frames are dropped. The feature
    name is recorded in the categorical or numerical feature list.

    :return: New DataFrame containing the averaged ``column_name`` column.
    """
    join_keys = ["location", "merge_time"]
    wanted = join_keys + [column_name]

    # The +00 sample keeps the plain column name ...
    dataset = pd.merge(left=dataset, right=f00[wanted], on=join_keys, how="inner")

    # ... and every later sample is joined under a suffixed name
    later_samples = (("_15", f15), ("_30", f30), ("_45", f45), ("_60", f60))
    for tag, sample in later_samples:
        dataset = pd.merge(
            left=dataset,
            right=sample[wanted],
            on=join_keys,
            how="inner",
            suffixes=["", tag])

    tagged_columns = [column_name + tag for tag, _ in later_samples]

    # Plain arithmetic mean of the five samples (a NaN in any sample yields NaN)
    running_total = dataset[column_name]
    for tagged in tagged_columns:
        running_total = running_total + dataset[tagged]
    dataset[column_name] = running_total / 5

    dataset = dataset.drop(tagged_columns, axis=1)

    register = add_to_categorical_features if categorical else add_to_numerical_features
    register(column_name)

    return dataset

def add_feature(dataset, f, column_name, count=True, categorical=False):
    """Inner-join one feature column from a single sample frame onto ``dataset``.

    :param dataset: Frame with 'location' and 'merge_time' columns.
    :param f: Sample frame providing 'location', 'merge_time' and ``column_name``.
    :param count: When true, record ``column_name`` in the feature-name lists.
    :param categorical: Chooses the categorical list over the numerical one.
    :return: The merged frame; rows without a match in ``f`` are dropped.
    """
    if count and categorical:
        add_to_categorical_features(column_name)
    elif count:
        add_to_numerical_features(column_name)

    join_keys = ["location", "merge_time"]
    return pd.merge(
        left=dataset,
        right=f[join_keys + [column_name]],
        on=join_keys,
        how="inner",
    )

def OHE(dataset, f, column_name, suffix=""):
    """Add a one-hot encoding of ``column_name`` (taken from ``f``) to ``dataset``.

    ``f`` is inner-joined on (location, merge_time). For every distinct value
    of the joined column, a 0/1 column named
    ``<column_name>_<suffix><value>`` is appended; the joined column itself is
    removed again before returning.
    """
    join_keys = ["location", "merge_time"]
    dataset = pd.merge(left=dataset, right=f[join_keys + [column_name]], on=join_keys, how="inner")

    for category in dataset[column_name].unique():
        indicator_name = column_name + "_" + suffix + str(category)
        is_category = dataset[column_name].apply(lambda cell, wanted=category: cell == wanted)
        dataset[indicator_name] = is_category.map({True: 1, False: 0})

    return dataset.drop([column_name], axis=1)

def OHE_all(dataset, f00, f15, f30, f45, f60, column_name):
    """One-hot encode ``column_name`` separately for each of the five sample frames.

    The indicator columns of each sample carry the prefix "00_", "15_", "30_",
    "45_" or "60_" in front of the encoded value.
    """
    tagged_samples = (("00_", f00), ("15_", f15), ("30_", f30), ("45_", f45), ("60_", f60))
    for tag, sample in tagged_samples:
        dataset = OHE(dataset, sample, column_name, suffix=tag)
    return dataset

def add_all(dataset, f00, f15, f30, f45, f60, column_name, categorical=False):
    """Add the +00, +15, +30, +45 and +60 samples of ``column_name`` as separate columns.

    The new columns are named ``<column_name>_00`` ... ``<column_name>_60`` and
    are recorded in the categorical or numerical feature list.

    Each sample is looked up with an order-preserving LEFT join on
    (location, merge_time) and written back row by row. Previously the values
    came from an inner-join result with a fresh 0..m-1 index; if any row of
    ``dataset`` had no match (or ``dataset`` did not have a default index), every
    following value was silently attached to the wrong row. Rows without a
    match now get NaN in the new column instead.

    :return: ``dataset`` with the five new columns (it is modified in place).
    """
    join_keys = ["location", "merge_time"]

    def _lookup(sample):
        # One value per key, so the left join returns exactly one row per dataset row
        unique_sample = sample[join_keys + [column_name]].drop_duplicates(subset=join_keys)
        matched = dataset[join_keys].merge(unique_sample, on=join_keys, how="left")
        # Same length and order as dataset, so its index can be reused for alignment
        matched.index = dataset.index
        return matched[column_name]

    feature_names = []
    for tag, sample in (("_00", f00), ("_15", f15), ("_30", f30), ("_45", f45), ("_60", f60)):
        dataset[column_name + tag] = _lookup(sample)
        feature_names.append(column_name + tag)

    for feat in feature_names:
        if categorical:
            add_to_categorical_features(feat)
        else:
            add_to_numerical_features(feat)

    return dataset

def find_mode_with_priority(row, priority_list, max=False):
    """Return the most frequent value of ``row``, unless a prioritized value occurs.

    :param row: Series holding the values of one row.
    :param priority_list: Values that win outright when present in the row;
        earlier entries take precedence over later ones.
    :param max: On a tie between several modes, return the largest one when
        true and the smallest one otherwise.
    """
    observed = row.values
    prioritized_hits = [candidate for candidate in priority_list if candidate in observed]
    if prioritized_hits:
        return prioritized_hits[0]

    # mode() can return several values on a tie; pick one end of them
    modes = row.mode()
    return modes.max() if max else modes.min()

def add_most_frequent_feature(dataset, f15, f30, f45, f60, column_name, priority_list, max=False, categorical=True):
    """Add ``column_name`` as the per-row mode of four sample frames.

    The four samples are looked up on (location, merge_time) and reduced to one
    value per row with ``find_mode_with_priority`` (a value from
    ``priority_list`` wins when present; ``max`` breaks ties between modes).
    The feature name is recorded in the categorical or numerical feature list.

    Each sample is fetched with an order-preserving LEFT join and written back
    row by row. Previously the values came from an inner-join result with a
    fresh index, so a single unmatched row silently shifted all following
    values onto the wrong rows. Unmatched rows now contribute NaN instead.

    :return: ``dataset`` with the new column (the temporary per-sample columns
        are removed again).
    """
    join_keys = ["location", "merge_time"]

    def _lookup(sample):
        # One value per key, so the left join returns exactly one row per dataset row
        unique_sample = sample[join_keys + [column_name]].drop_duplicates(subset=join_keys)
        matched = dataset[join_keys].merge(unique_sample, on=join_keys, how="left")
        # Same length and order as dataset, so its index can be reused for alignment
        matched.index = dataset.index
        return matched[column_name]

    feature_names = []
    for tag, sample in (("_15", f15), ("_30", f30), ("_45", f45), ("_60", f60)):
        dataset[column_name + tag] = _lookup(sample)
        feature_names.append(column_name + tag)

    dataset[column_name] = dataset[feature_names].apply(find_mode_with_priority, args=(priority_list, max), axis=1)

    dataset = dataset.drop(feature_names, axis=1)

    if categorical:
        add_to_categorical_features(column_name)
    else:
        add_to_numerical_features(column_name)

    return dataset

def add_accumulated(dataset, f15, f30, f45, f60, column_name, time_interval, categorical=False):
    """Add ``column_name`` as the scaled sum of its +15, +30, +45 and +60 samples.

    The row-wise sum of the four samples is multiplied by ``15 / time_interval``
    (``time_interval`` is the length, in minutes, that one sample covers). The
    feature name is recorded in the categorical or numerical feature list.

    Each sample is fetched with an order-preserving LEFT join on
    (location, merge_time) and written back row by row. Previously the values
    came from an inner-join result with a fresh index, so a single unmatched
    row silently shifted all following values onto the wrong rows. Unmatched
    samples are now NaN and are skipped by the sum.

    :return: ``dataset`` with the new column (the temporary per-sample columns
        are removed again).
    """
    join_keys = ["location", "merge_time"]

    def _lookup(sample):
        # One value per key, so the left join returns exactly one row per dataset row
        unique_sample = sample[join_keys + [column_name]].drop_duplicates(subset=join_keys)
        matched = dataset[join_keys].merge(unique_sample, on=join_keys, how="left")
        # Same length and order as dataset, so its index can be reused for alignment
        matched.index = dataset.index
        return matched[column_name]

    feature_names = []
    for tag, sample in (("_15", f15), ("_30", f30), ("_45", f45), ("_60", f60)):
        dataset[column_name + tag] = _lookup(sample)
        feature_names.append(column_name + tag)

    time_multiplier = 15/time_interval

    dataset[column_name] = dataset[feature_names].sum(axis=1)*time_multiplier

    dataset = dataset.drop(feature_names, axis=1)

    if categorical:
        add_to_categorical_features(column_name)
    else:
        add_to_numerical_features(column_name)

    return dataset

def add_accumulated_all(dataset, f15, f30, f45, f60, column_name, time_interval, categorical=False):
    """Add the +15, +30, +45 and +60 samples of ``column_name`` as scaled columns.

    The new columns ``<column_name>_15`` ... ``<column_name>_60`` hold the
    sample value multiplied by ``15 / time_interval``.

    Two defects of the earlier version are fixed:

    * The values came from an inner-join result with a fresh index, so a single
      unmatched row silently shifted all following values onto the wrong rows.
      Each sample is now fetched with an order-preserving LEFT join; unmatched
      rows get NaN.
    * ``column_name`` itself (a column this function never creates) was
      recorded five times in the feature list, while the four columns that are
      created were not recorded at all. Now exactly the four created columns
      are recorded, once each.

    :return: ``dataset`` with the four new columns (it is modified in place).
    """
    join_keys = ["location", "merge_time"]

    def _lookup(sample):
        # One value per key, so the left join returns exactly one row per dataset row
        unique_sample = sample[join_keys + [column_name]].drop_duplicates(subset=join_keys)
        matched = dataset[join_keys].merge(unique_sample, on=join_keys, how="left")
        # Same length and order as dataset, so its index can be reused for alignment
        matched.index = dataset.index
        return matched[column_name]

    time_multiplier = 15/time_interval

    feature_names = []
    for tag, sample in (("_15", f15), ("_30", f30), ("_45", f45), ("_60", f60)):
        dataset[column_name + tag] = _lookup(sample)*time_multiplier
        feature_names.append(column_name + tag)

    for feat in feature_names:
        if categorical:
            add_to_categorical_features(feat)
        else:
            add_to_numerical_features(feat)

    return dataset

# Names of the features added so far, in the order they were added.
# Re-running this cell resets both lists.
numerical_feature_names = []
categorical_feature_names = []

def _register(registry, feature_name):
    """Append ``feature_name`` to ``registry`` (repeats are kept)."""
    registry.append(feature_name)

def add_to_numerical_features(feature_name):
    """Record ``feature_name`` as a numerical feature."""
    _register(numerical_feature_names, feature_name)

def add_to_categorical_features(feature_name):
    """Record ``feature_name`` as a categorical feature."""
    _register(categorical_feature_names, feature_name)


# Feature info

### Features removed because of correlations:

- **fresh_snow_12h:cm**: fresh_snow_24h:cm = **0.82**

- **fresh_snow_3h:cm**: fresh_snow_1h:cm = **0.81**

- **fresh_snow_6h:cm**: fresh_snow_24h:cm = **0.83**

- **diffuse_rad:W**: diffuse_rad_1h:J = **0.99**

- **direct_rad:W**: direct_rad_1h:J = **0.99**

- **pressure_100m:hPa**: msl_pressure:hPa = **1.00**

- **pressure_50m:hPa**: msl_pressure:hPa = **1.00**

- **sfc_pressure:hPa**: msl_pressure:hPa = **1.00**

- **absolute_humidity_2m:gm3**: t_1000hPa:K = **0.90**

- **air_density_2m:kgm3**: t_1000hPa:K = **0.90**

- **dew_point_2m:K**: t_1000hPa:K = **0.91**

- **clear_sky_rad:W**: sun_elevation = **0.83**

- **clear_sky_energy_1h:J**: sun_elevation = **0.82**

- **total_cloud_cover:p**: effective_cloud_cover:p = **0.94**

- **cloud_base_agl:m**: ceiling_height_agl:m = **0.83** (after recalculating to feet)

- **elevation:m**: highly correlated to location

- **is_in_shadow:idx**: highly correlated to is_day:idx

### High correlations still in the set:

- **is_day:idx**: sun_elevation = **0.81**

- **diffuse_rad_1h:J**: sun_elevation = **0.80**

### Other features removed:

- **snow_drift:idx**: Almost exclusively 0, so doesn't add much data

- **wind_speed_w_1000hPa:ms**: Compared to the dummy data found in the docs, this data looks really odd (binary as opposed to continuous values)


# Add numerical features

In [75]:
# SNOW AND PRECIPITATION

# take the value from +60 since it covers the measurement between 00 and 60
# TODO: could it be useful to include 3h, 6h, 12h as well?
# dataset = add_feature(dataset, features60, "fresh_snow_1h:cm")
# dataset = add_feature(dataset, features60, "fresh_snow_24h:cm")

# only use 24h because it has the highest correlation with pv_measurement
dataset = add_feature(dataset, features60, "fresh_snow_24h:cm")

# original note: take all accumulated values and multiply by 3 to get a better value
# (not quite by the book, but oh well...)
# NOTE(review): the call below adds the five samples as they are; no multiplication happens here
dataset = add_all(dataset, features00, features15, features30, features45, features60, "precip_5min:mm")
# for these I just take the average
#dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "snow_depth:cm")
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "snow_water:kgm2")
# take the accumulated value and multiply by 3/2 to get a better value (not quite by the book, but oh well...)
#dataset = add_accumulated(dataset, features15, features30, features45, features60, "snow_melt_10min:mm", 10)


# ACCUMULATIVE FEATURES

# take the value from +60 since it covers the measurement between 00 and 60
dataset = add_feature(dataset, features60, "diffuse_rad_1h:J")
# take the value from +60 since it covers the measurement between 00 and 60
dataset = add_feature(dataset, features60, "direct_rad_1h:J")#!Try without

# PRESSURE

# take the average since these are point measurements; maybe this should be added per quarter-hour instead
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "msl_pressure:hPa")


# TEMPERATURE

# average, since the variation between quarter-hours is probably low? maybe this should be added per quarter-hour
# update: now added per quarter-hour
dataset = add_all(dataset, features00, features15, features30, features45, features60, "t_1000hPa:K")


# SUN

# add all samples since testing has shown that these are very important
dataset = add_all(dataset, features00, features15, features30, features45, features60, "sun_azimuth:d")
dataset = add_all(dataset, features00, features15, features30, features45, features60, "sun_elevation:d")

# take the value from +60 since it covers the measurement between 00 and 60
# NOTE(review): the "Feature info" notes list clear_sky_energy_1h:J as removed, yet it is added here - confirm
dataset = add_feature(dataset, features60, "clear_sky_energy_1h:J")


# DAY AND SHADOW

# average, for lack of a better idea
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "visibility:m")


# CLOUDS

# average because the value is probably instantaneous
# PROBABLY VERY IMPORTANT
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "effective_cloud_cover:p")


# HUMIDITY AND RIME

# take the average, for lack of a better idea - TODO: discuss
#dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "prob_rime:p")
# take the average, for lack of a better idea - TODO: discuss
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "relative_humidity_1000hPa:p")


# WIND

# Average because it varies little - TODO: try without
# dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "wind_speed_u_10m:ms")
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "wind_speed_v_10m:ms")
# dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "wind_speed_10m:ms")


# OTHERS (Up for discussion)
# Average because...? (reason still open)
dataset = add_feature_average_00_60(dataset, features00, features15, features30, features45, features60, "super_cooled_liquid_water:kgm2")

# Standardize data

In [76]:
def apply_trig(dataset, feature_name, suffixes):
    """Replace angle columns (in degrees) by their cosine, in place.

    For every suffix, the column ``<feature_name>_<suffix>`` is transformed if
    it exists in ``dataset``; missing columns are skipped. Nothing is returned.
    """
    def _cos_of_degrees(d):
        return np.cos((d * np.pi) / 180)

    candidate_columns = (f"{feature_name}_{suffix}" for suffix in suffixes)
    present_columns = [name for name in candidate_columns if name in dataset]

    for name in present_columns:
        dataset[name] = dataset[name].apply(_cos_of_degrees)

# Apply the cosine transform to every quarter-hour sample of both solar angles.
# The columns are overwritten in place, so re-running this cell transforms them again.
suffixes = ['00', '15', '30', '45', '60']
apply_trig(dataset, 'sun_azimuth:d', suffixes)
apply_trig(dataset, 'sun_elevation:d', suffixes)

# # all features except pv_measurement, merge_time, location and id
# columns_to_exclude = ['merge_time', 'pv_measurement', 'location', 'id']
# # standardize the features
# scaler = StandardScaler()
# scaled_columns = scaler.fit_transform(dataset.drop(columns=columns_to_exclude))
# scaled_dataset = pd.DataFrame(scaled_columns, columns=dataset.drop(columns=columns_to_exclude).columns)

# # add the excluded columns back to the dataset
# dataset = pd.concat([dataset[columns_to_exclude], scaled_dataset], axis=1)


## Trying first without standardization


# def standardize(dataset, feature_name, minus_min=False):
#     if minus_min:
#         dataset[feature_name] = (dataset[feature_name] - dataset[feature_name].min())/dataset[feature_name].std()
#     else:
#         dataset[feature_name] = dataset[feature_name]/dataset[feature_name].std()
    
#     return dataset

# def standardize_all(dataset, feature_name, accumulated=False, minus_min=False):
#     if not accumulated:
#         dataset[feature_name + "_00"] = standardize(dataset, feature_name + "_00")[feature_name + "_00"]

#     dataset[feature_name + "_15"] = standardize(dataset, feature_name + "_15", minus_min=minus_min)[feature_name + "_15"]
#     dataset[feature_name + "_30"] = standardize(dataset, feature_name + "_30", minus_min=minus_min)[feature_name + "_30"]
#     dataset[feature_name + "_45"] = standardize(dataset, feature_name + "_45", minus_min=minus_min)[feature_name + "_45"]
#     dataset[feature_name + "_60"] = standardize(dataset, feature_name + "_60", minus_min=minus_min)[feature_name + "_60"]

#     return dataset

# # standardize the features
# dataset = standardize(dataset, 'fresh_snow_24h:cm')
# dataset = standardize_all(dataset, 'precip_5min:mm')
# #dataset = standardize(dataset, 'snow_depth:cm')
# dataset = standardize(dataset, 'snow_water:kgm2')
# #dataset = standardize(dataset, 'snow_melt_10min:mm')
# dataset = standardize(dataset, 'diffuse_rad_1h:J')
# dataset = standardize(dataset, 'direct_rad_1h:J')
# dataset = standardize(dataset, 'msl_pressure:hPa', minus_min=True)
# dataset = standardize_all(dataset, 't_1000hPa:K', minus_min=True)
# dataset = standardize_all(dataset, 'sun_azimuth:d')
# dataset = standardize_all(dataset, 'sun_elevation:d')
# dataset = standardize(dataset, 'clear_sky_energy_1h:J')
# dataset = standardize(dataset, 'visibility:m')
# dataset = standardize(dataset, 'effective_cloud_cover:p')
# # dataset = standardize(dataset, 'prob_rime:p')
# dataset = standardize(dataset, 'relative_humidity_1000hPa:p')
# # dataset = standardize(dataset, 'wind_speed_u_10m:ms')
# dataset = standardize(dataset, 'wind_speed_v_10m:ms')
# # dataset = standardize(dataset, 'wind_speed_10m:ms')
# dataset = standardize(dataset, 'super_cooled_liquid_water:kgm2')


# Add categorical features

In [77]:
# reason I'm adding it here is to prevent the massive -666 value getting in the way of the scaling
# add most frequent bc don't wanna have 100 000 features, also don't add priority list and pick the min value if conflict
# NOTE(review): the frames passed here are features00..features45, which bind to the
# parameters f15..f60, so the mode is taken over the +00..+45 samples (the
# precip_type call below uses +15..+60) - confirm this difference is intended
dataset = add_most_frequent_feature(dataset, features00, features15, features30, features45, "ceiling_height_agl:ft", priority_list=[], max=False)
#! test this aswell
# dataset = add_all(dataset, features00, features15, features30, features45, features60, "ceiling_height_agl:ft")

# add most frequent bc don't wanna have 100 000 features
dataset = add_most_frequent_feature(dataset, features15, features30, features45, features60, "precip_type_5min:idx", priority_list=[3,4,2], max=True)
#! test this aswell
# dataset = add_all(dataset, features00, features15, features30, features45, features60, "precip_type_5min:idx")

# categorizing ceiling_height_agl:ft
# found categories on google
# NOTE(review): `labels` is not referenced again in this cell
labels=["VFR1", "LIFR", "IFR", "MVFR", "VFR4", "VFR3", "VFR2"]
ceiling_height_agl_ft_categories = {
    -666: "VFR1",
    1: "LIFR",
    2: "IFR",
    3: "MVFR",
    4: "VFR4",
    5: "VFR3",
    6: "VFR2",
}
# map the values to their labels
dataset['ceiling_height_agl:ft'] = dataset['ceiling_height_agl:ft'].map(ceiling_height_agl_ft_categories)

#categorizing precip_type_5min:idx
#found this in the docs
precip_types = {
    0: "None",
    1: "Rain",
    2: "Rain_and_snow_mixed",
    3: "Snow",
    4: "Sleet",
    5: "Freezing_rain",
    6: "Hail",
}
#map the values to their labels
dataset['precip_type_5min:idx'] = dataset['precip_type_5min:idx'].map(precip_types)
    
# Add feature from 60 because pretty consistent
dataset = add_feature(dataset, features60, "snow_density:kgm3", categorical=True)

# Originally meant as a one-hot encoding of the categorical variable
# TODO: up for discussion whether to take an average or several of the samples
# NOTE(review): the call below adds the five samples as plain categorical columns (add_all), not a one-hot encoding
dataset = add_all(dataset, features00, features15, features30, features45, features60, "dew_or_rime:idx", categorical=True)
# Remember that this one was mapped to strings further up

# take all samples since these are probably extremely important for the model and it makes no sense to standardize them
dataset = add_all(dataset, features00, features15, features30, features45, features60, "is_day:idx", categorical=True)

# map location to numbers and add it as a categorical variable (the numeric mapping is currently disabled)
# location_mapping = {
#     "A": 1,
#     "B": 2,
#     "C": 3
# }
# dataset["location"] = dataset["location"].map(location_mapping)
add_to_categorical_features("location")

# 4 precip_5min:mm values are NaN, set them to 0 because precip_type is 0.0 (None)
# 3 precip_type:idx values are NaN, set them to 0.0 (None)
# NOTE(review): fillna is applied to every column of dataset, not only the two named above
dataset = dataset.fillna(0.0)

In [78]:
# Preview the assembled frame, then print (one value per line) the shape of the
# frame without merge_time / pv_measurement, the number of numerical feature
# names, the number of categorical feature names, and their total.
# NOTE(review): the frame still carries `id` here, which presumably explains a
# width one larger than the feature total — confirm.
display(dataset.head())
print(
    dataset.drop(columns=['merge_time', 'pv_measurement']).shape,
    len(numerical_feature_names),
    len(categorical_feature_names),
    len(numerical_feature_names) + len(categorical_feature_names),
    sep="\n",
)

Unnamed: 0,merge_time,pv_measurement,location,id,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,ceiling_height_agl:ft,precip_type_5min:idx,snow_density:kgm3,dew_or_rime:idx_00,dew_or_rime:idx_15,dew_or_rime:idx_30,dew_or_rime:idx_45,dew_or_rime:idx_60,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60
0,2019-06-02 22:00:00,0.0,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,1006.140015,285.899994,286.100006,286.299988,286.600006,286.799988,0.955454,0.971524,0.984097,0.993071,0.998384,0.998439,0.997972,0.997566,0.997254,0.997062,0.0,40649.164062,99.18,71.179993,-0.4,0.0,VFR3,,0,,,,,,0,0,0,0,0
1,2019-06-02 23:00:00,0.0,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,1005.079956,286.799988,286.899994,286.899994,287.0,287.0,0.998384,0.999992,0.997884,0.992075,0.985583,0.997062,0.997002,0.997081,0.99729,0.997672,0.0,31111.119141,99.799995,67.599998,0.36,0.0,VFR3,,0,,,,,,0,0,0,0,0
2,2019-06-03 00:00:00,0.0,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.0,1004.5,287.0,287.0,286.899994,286.899994,286.899994,0.985583,0.973531,0.957968,0.939004,0.916774,0.997672,0.998054,0.998497,0.998962,0.999399,0.0,11297.320312,100.0,68.580002,0.76,0.0,VFR3,,0,,,,,,0,0,0,0,0
3,2019-06-03 01:00:00,0.0,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.5,7743.299805,0.0,1003.900024,286.899994,286.799988,286.700012,286.600006,286.5,0.916774,0.891418,0.863096,0.831973,0.798215,0.999399,0.999755,0.999967,0.999972,0.999701,6546.899902,2393.800049,100.0,74.800003,0.9,0.0,VFR4,,0,,,,,,0,0,0,1,1
4,2019-06-03 02:00:00,19.36,A,-10,0.0,0.0,0.0,0.0,0.0,0.0,0.22,60137.601562,3158.300049,1003.0,286.5,286.5,286.399994,286.399994,286.399994,0.798215,0.762002,0.723485,0.68284,0.640204,0.999701,0.999084,0.998051,0.996532,0.994462,102225.898438,14631.379883,79.659996,80.419998,0.92,0.0,VFR4,,0,,,,,,1,1,1,1,1


(93533, 46)
31
14
45


In [79]:
# Sanity check after the fill step: number of missing values left per column.
dataset.isnull().sum()

merge_time                        0
pv_measurement                    0
location                          0
id                                0
fresh_snow_24h:cm                 0
precip_5min:mm_00                 0
precip_5min:mm_15                 0
precip_5min:mm_30                 0
precip_5min:mm_45                 0
precip_5min:mm_60                 0
snow_water:kgm2                   0
diffuse_rad_1h:J                  0
direct_rad_1h:J                   0
msl_pressure:hPa                  0
t_1000hPa:K_00                    0
t_1000hPa:K_15                    0
t_1000hPa:K_30                    0
t_1000hPa:K_45                    0
t_1000hPa:K_60                    0
sun_azimuth:d_00                  0
sun_azimuth:d_15                  0
sun_azimuth:d_30                  0
sun_azimuth:d_45                  0
sun_azimuth:d_60                  0
sun_elevation:d_00                0
sun_elevation:d_15                0
sun_elevation:d_30                0
sun_elevation:d_45          

# Split into training set and test set

In [80]:
# Rows whose id is not the -10 sentinel form the test set. The id and the
# target column are dropped; merge_time is kept.
# Vectorised comparison instead of a per-row lambda (which also shadowed the
# builtin `id`).
testset = dataset.loc[dataset["id"] != -10].drop(columns=["id", "pv_measurement"])

In [81]:
from sklearn.model_selection import train_test_split

# Keep only the rows with the -10 sentinel id (the labelled data) and drop the
# id column, which is constant after the filter.
# This rebinds `dataset`: the cell that builds `testset` must run before this
# one, and re-running this cell on its own output fails because `id` is gone.
dataset = dataset.loc[dataset["id"] == -10].drop(columns="id")

# dataset = dataset.sort_values(by="merge_time")

# Random 80/20 split into training and evaluation rows; the fixed seed makes the
# split repeatable.
# NOTE(review): with hourly rows, a random split puts neighbouring hours on both
# sides, so the evaluation score may be optimistic — consider a time-based split.
trainset, evalset = train_test_split(dataset, test_size=0.2, random_state=42)

# Separate the feature columns from the target for both parts.
trainsetX = trainset.drop(columns=["merge_time", "pv_measurement"])
trainsetY = trainset["pv_measurement"]
evalsetX = evalset.drop(columns=["merge_time", "pv_measurement"])
evalsetY = evalset["pv_measurement"]

display(trainsetX)
display(evalsetX)
display(trainsetY)
display(evalsetY)

Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,ceiling_height_agl:ft,precip_type_5min:idx,snow_density:kgm3,dew_or_rime:idx_00,dew_or_rime:idx_15,dew_or_rime:idx_30,dew_or_rime:idx_45,dew_or_rime:idx_60,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60
16516,A,0.0,0.0,0.0,0.0,0.0,0.0,0.36,0.000000,0.000000,1007.640015,276.200012,276.200012,276.200012,276.200012,276.200012,0.772468,0.731723,0.688608,0.643349,0.596169,0.988415,0.991162,0.993690,0.995907,0.997723,0.000000e+00,3364.399902,99.860001,93.320007,-0.40,0.10,IFR,,0,,,,,,0,0,0,0,0
58757,B,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,995.380005,276.899994,276.799988,276.799988,276.799988,276.799988,0.384456,0.322596,0.260926,0.199642,0.138879,0.804977,0.820950,0.836650,0.851946,0.866731,0.000000e+00,45792.800781,96.900002,83.900002,3.28,0.00,VFR3,,0,,,,,,0,0,0,0,0
45947,B,0.0,0.0,0.0,0.0,0.0,0.0,0.10,691600.000000,553707.187500,1004.420044,276.799988,276.899994,277.000000,277.100006,277.200012,-0.611182,-0.553144,-0.493912,-0.433863,-0.373344,0.795600,0.809816,0.824225,0.838671,0.852987,2.109365e+06,10124.140625,91.159996,61.220001,1.60,0.06,MVFR,,0,,,,,,1,1,1,1,1
29556,A,0.0,0.0,0.0,0.0,0.0,0.0,0.00,289806.687500,587535.500000,996.360046,281.200012,281.399994,281.500000,281.700012,281.899994,-0.957325,-0.974597,-0.987513,-0.995927,-0.999753,0.957838,0.955644,0.953984,0.952884,0.952385,9.384382e+05,52153.957031,34.299999,71.480003,-0.70,0.00,VFR3,,0,,,,,,1,1,1,1,1
25171,A,0.0,0.0,0.0,0.0,0.0,0.0,0.00,145045.906250,176310.796875,1032.560059,281.399994,281.299988,281.100006,281.000000,280.899994,0.008482,0.066936,0.124952,0.182424,0.239245,0.981621,0.986794,0.991104,0.994546,0.997127,2.901458e+05,39302.335938,2.320000,57.599998,-1.64,0.00,VFR1,,0,,,,,,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,A,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,990.520020,276.700012,276.700012,276.700012,276.700012,276.700012,0.987436,0.997041,0.999987,0.996221,0.989185,0.785846,0.783715,0.783053,0.783899,0.789266,0.000000e+00,16535.500000,97.719994,90.059998,2.30,0.12,IFR,,0,,,,,,0,0,0,0,0
54886,B,0.0,0.0,0.0,0.0,0.0,0.0,0.00,327360.906250,68121.296875,1020.799988,284.399994,284.399994,284.299988,284.200012,284.200012,0.066134,0.123567,0.180125,0.235736,0.290351,0.934267,0.944279,0.953428,0.961689,0.969063,9.174516e+05,11750.180664,98.240005,77.860001,-1.52,0.00,IFR,,0,,,,,,1,1,1,1,1
76820,C,2.2,0.0,0.0,0.0,0.0,0.0,0.18,31576.099609,22.900000,972.799988,280.299988,280.399994,280.500000,280.600006,280.700012,-0.997291,-0.999912,-0.998891,-0.994235,-0.985981,0.998434,0.998362,0.998390,0.998516,0.998727,8.846350e+04,60408.726562,99.400002,57.559998,5.30,0.04,VFR4,,0,,,,,,1,1,1,1,1
860,A,0.0,0.0,0.0,0.0,0.0,0.0,0.00,211914.796875,120727.296875,1016.820007,283.799988,283.799988,283.899994,283.899994,283.899994,0.320530,0.373004,0.424247,0.474165,0.522677,0.961626,0.968813,0.975130,0.980604,0.985271,5.935121e+05,30633.759766,80.599998,88.240005,-1.24,0.00,MVFR,,0,,,,,,1,1,1,1,1


Unnamed: 0,location,fresh_snow_24h:cm,precip_5min:mm_00,precip_5min:mm_15,precip_5min:mm_30,precip_5min:mm_45,precip_5min:mm_60,snow_water:kgm2,diffuse_rad_1h:J,direct_rad_1h:J,msl_pressure:hPa,t_1000hPa:K_00,t_1000hPa:K_15,t_1000hPa:K_30,t_1000hPa:K_45,t_1000hPa:K_60,sun_azimuth:d_00,sun_azimuth:d_15,sun_azimuth:d_30,sun_azimuth:d_45,sun_azimuth:d_60,sun_elevation:d_00,sun_elevation:d_15,sun_elevation:d_30,sun_elevation:d_45,sun_elevation:d_60,clear_sky_energy_1h:J,visibility:m,effective_cloud_cover:p,relative_humidity_1000hPa:p,wind_speed_v_10m:ms,super_cooled_liquid_water:kgm2,ceiling_height_agl:ft,precip_type_5min:idx,snow_density:kgm3,dew_or_rime:idx_00,dew_or_rime:idx_15,dew_or_rime:idx_30,dew_or_rime:idx_45,dew_or_rime:idx_60,is_day:idx_00,is_day:idx_15,is_day:idx_30,is_day:idx_45,is_day:idx_60
37181,B,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.000000,0.00000,1032.920044,275.500000,275.500000,275.500000,275.399994,275.399994,0.998615,0.999945,0.997295,0.990689,0.983363,0.988985,0.988894,0.989078,0.989526,0.990624,0.000,36233.101562,78.360001,69.780006,-0.14,0.00,VFR4,,0,,,,,,0,0,0,0,0
78736,C,0.0,0.0,0.0,0.00,0.00,0.0,0.00,243323.296875,315764.59375,1026.340088,270.100006,270.299988,270.600006,270.799988,271.000000,-0.359525,-0.415360,-0.470242,-0.523971,-0.576304,0.988422,0.983951,0.978907,0.973383,0.967467,547365.875,47161.039062,67.320000,63.539997,2.54,0.00,VFR3,,0,,,,,,1,1,1,1,1
64242,B,2.3,0.0,0.0,0.00,0.00,0.0,0.00,0.000000,0.00000,1012.259949,271.500000,271.399994,271.299988,271.200012,271.200012,0.992045,0.998677,0.999726,0.995176,0.988137,0.877079,0.875819,0.875625,0.876483,0.881105,0.000,30715.019531,62.480000,84.659996,2.48,0.00,VFR4,,1,,,,,,0,0,0,0,0
3431,A,0.0,0.0,0.0,0.00,0.00,0.0,0.14,0.000000,0.00000,1002.119934,283.399994,283.399994,283.399994,283.299988,283.299988,0.805173,0.847400,0.885491,0.918915,0.947143,0.833105,0.823859,0.815492,0.808124,0.801890,0.000,18953.720703,98.500008,89.440002,0.06,0.02,MVFR,,0,Dew,Dew,Dew,Dew,Dew,0,0,0,0,0
30125,A,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.000000,0.00000,1014.959961,276.200012,276.200012,276.200012,276.100006,276.100006,0.797900,0.748250,0.695461,0.640163,0.582917,0.749672,0.761844,0.774834,0.788451,0.802505,0.000,57705.335938,1.380000,53.000000,0.90,0.00,VFR1,,0,,,,,,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89403,C,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.000000,0.00000,995.640015,275.899994,275.799988,275.799988,275.700012,275.600006,0.194953,0.256003,0.317404,0.378939,0.440394,0.863844,0.849073,0.833867,0.818350,0.802661,0.000,51923.242188,52.000000,71.699997,1.56,0.00,VFR3,,0,Rime,Rime,Rime,Rime,Rime,0,0,0,0,0
63802,B,0.0,0.0,0.0,0.00,0.00,0.0,0.54,0.000000,0.00000,1005.859985,275.700012,275.799988,275.799988,275.899994,276.000000,0.717433,0.765618,0.810696,0.852147,0.889448,0.860289,0.850095,0.840453,0.831528,0.823443,0.000,5793.540039,100.000000,90.279999,2.62,0.16,LIFR,,0,,,,,,0,0,0,0,0
44924,B,0.0,0.0,0.0,0.00,0.00,0.0,0.10,0.000000,0.00000,984.099976,274.799988,274.700012,274.500000,274.399994,274.200012,0.996160,0.999848,0.998604,0.992442,0.984644,0.930775,0.930206,0.930398,0.931342,0.934931,0.000,31640.800781,95.599998,75.419998,-4.10,0.10,MVFR,,0,,,,,,0,0,0,0,0
43417,B,0.0,0.0,0.0,0.00,0.00,0.0,0.00,0.000000,0.00000,983.599976,274.899994,274.899994,274.899994,274.899994,274.899994,0.459688,0.399741,0.339411,0.278958,0.218552,0.846500,0.860306,0.873883,0.887108,0.899863,0.000,55146.679688,1.400000,65.120003,2.20,0.00,VFR3,,0,,,,,,0,0,0,0,0


16516       0.0000
58757       0.0000
45947     347.5875
29556    3483.9200
25171     347.3800
           ...    
6265        0.0000
54886      67.2750
76820       0.0000
860       244.8600
15795       0.0000
Name: pv_measurement, Length: 73098, dtype: float64

37181      0.0
78736    196.0
64242     -0.0
3431       0.0
30125      0.0
         ...  
89403      0.0
63802     -0.0
44924      0.0
43417      0.0
59210      0.0
Name: pv_measurement, Length: 18275, dtype: float64

# Making model

In [38]:
from autogluon.tabular import TabularDataset, TabularPredictor

label = 'pv_measurement'

# NOTE(review): `trainset` still contains merge_time (only trainsetX drops it),
# so the timestamp is handed to AutoGluon as a feature — confirm this is intended.
train_data = TabularDataset(trainset)

# Only the last expression of a cell is rendered, so the previews are shown
# explicitly; as bare mid-cell expressions their results were discarded.
display(train_data.head())
display(train_data[label].describe())

# Fit AutoGluon with its default settings, scored on mean absolute error.
# NOTE(review): no time_limit is set, so the run time is unbounded; AutoGluon's
# own log suggests setting one.
predictor = TabularPredictor(label=label, eval_metric='mean_absolute_error').fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20231111_184634"
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231111_184634"
AutoGluon Version:  0.8.2
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:41:43 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6000
Disk Space Avail:   28.69 GB / 494.38 GB (5.8%)
Train Data Rows:    73098
Train Data Columns: 46
Label Column: pv_measurement
Preprocessing data ...
  with pd.option_context("mode.use_inf_as_na", True):  # treat None, NaN, INF, NINF as NA
AutoGluon infers your prediction problem is: 'regressio

[1000]	valid_set's l1: 80.9087
[2000]	valid_set's l1: 76.8971
[3000]	valid_set's l1: 74.4482
[4000]	valid_set's l1: 73.0675
[5000]	valid_set's l1: 72.1626
[6000]	valid_set's l1: 71.3987
[7000]	valid_set's l1: 70.7816
[8000]	valid_set's l1: 70.2978
[9000]	valid_set's l1: 69.8783
[10000]	valid_set's l1: 69.4939


	-69.4921	 = Validation score   (-mean_absolute_error)
	50.33s	 = Training   runtime
	0.28s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's l1: 78.5206
[2000]	valid_set's l1: 75.9806
[3000]	valid_set's l1: 75.0597
[4000]	valid_set's l1: 74.5193
[5000]	valid_set's l1: 74.0353
[6000]	valid_set's l1: 73.8477
[7000]	valid_set's l1: 73.6752
[8000]	valid_set's l1: 73.53
[9000]	valid_set's l1: 73.4968
[10000]	valid_set's l1: 73.4866


	-73.4582	 = Validation score   (-mean_absolute_error)
	50.43s	 = Training   runtime
	0.33s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-83.4423	 = Validation score   (-mean_absolute_error)
	64.7s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-79.8751	 = Validation score   (-mean_absolute_error)
	301.2s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-84.9897	 = Validation score   (-mean_absolute_error)
	11.09s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-93.3857	 = Validation score   (-mean_absolute_error)
	91.5s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
  if is_sparse(data):
	-82.9209	 = Validation score   (-mean_absolute_error)
	4.79s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-81.0343	 = Validation score   (-mean_absolute_error)
	490.12s	 = Training   runtime
	0.03s	 = Validatio

[1000]	valid_set's l1: 71.5241
[2000]	valid_set's l1: 69.357
[3000]	valid_set's l1: 68.6613
[4000]	valid_set's l1: 68.3465
[5000]	valid_set's l1: 68.233
[6000]	valid_set's l1: 68.1473
[7000]	valid_set's l1: 68.0973
[8000]	valid_set's l1: 68.066
[9000]	valid_set's l1: 68.053
[10000]	valid_set's l1: 68.0394


	-68.039	 = Validation score   (-mean_absolute_error)
	149.01s	 = Training   runtime
	0.58s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-66.3033	 = Validation score   (-mean_absolute_error)
	0.14s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1218.82s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231111_184634")


# Prediction

In [82]:
# Despite its name, `test_data` wraps the evaluation hold-out (evalset), not the
# competition test set.
test_data = TabularDataset(evalset)

# Predict on the hold-out with the target column removed, then peek at the result.
y_pred = predictor.predict(test_data.drop(label, axis=1))
y_pred.head()

  X.fillna(self._fillna_feature_map, inplace=True, downcast=False)


37181     0.067324
78736    42.426727
64242    -0.125686
3431     12.142344
30125     2.197257
Name: pv_measurement, dtype: float32

# Evaluation

In [83]:
# Score the fitted predictor on the hold-out. AutoGluon reports error metrics
# negated (see "-mean_absolute_error" in the training log), hence the negative
# values in the result.
predictor.evaluate(data=test_data, silent=True)

  X.fillna(self._fillna_feature_map, inplace=True, downcast=False)


{'mean_absolute_error': -64.08615896977179,
 'root_mean_squared_error': -204.81595513833722,
 'mean_squared_error': -41949.57547922936,
 'r2': 0.9289890461524898,
 'pearsonr': 0.9638776400816613,
 'median_absolute_error': -2.7986816406249915}

In [84]:
# Per-model comparison on the hold-out: score_test is computed on `test_data`,
# score_val is the validation score recorded during fitting.
predictor.leaderboard(data=test_data, silent=True)

  X.fillna(self._fillna_feature_map, inplace=True, downcast=False)


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-64.086159,-66.303275,6.752618,0.889692,689.59929,0.013293,0.000249,0.144688,2,True,12
1,LightGBMLarge,-65.74302,-68.039042,4.316224,0.577172,149.011756,4.316224,0.577172,149.011756,1,True,11
2,LightGBMXT,-67.745561,-69.492089,2.216821,0.281286,50.325203,2.216821,0.281286,50.325203,1,True,3
3,LightGBM,-69.571758,-73.458236,2.202698,0.332089,50.432581,2.202698,0.332089,50.432581,1,True,4
4,CatBoost,-75.873997,-79.875124,0.095884,0.016254,301.200311,0.095884,0.016254,301.200311,1,True,6
5,NeuralNetTorch,-77.65833,-81.034341,0.20628,0.030985,490.117643,0.20628,0.030985,490.117643,1,True,10
6,XGBoost,-78.180328,-82.920944,0.10921,0.017546,4.788552,0.10921,0.017546,4.788552,1,True,9
7,RandomForestMSE,-80.444029,-83.442288,0.454525,0.05336,64.70369,0.454525,0.05336,64.70369,1,True,5
8,ExtraTreesMSE,-80.462064,-84.989697,0.430685,0.053317,11.092818,0.430685,0.053317,11.092818,1,True,7
9,NeuralNetFastAI,-86.336371,-93.385655,0.293528,0.040876,91.502577,0.293528,0.040876,91.502577,1,True,8


# Predict test data

In [85]:
# Predict on the competition test rows and shape the result into a two-column
# frame: `id` numbered 0..n-1 in testset row order, and `prediction`.
# NOTE(review): the raw model output is kept as-is; the hold-out predictions
# above include small negative values — consider clipping at 0 if the target
# cannot be negative.
preds = (
    predictor.predict(testset)
    .rename("prediction")
    .reset_index(drop=True)
    .rename_axis("id")
    .reset_index()
)

display(preds)

  X.fillna(self._fillna_feature_map, inplace=True, downcast=False)


91373      3.075854
91374      3.586959
91375      3.929909
91376     71.986893
91377    459.344910
            ...    
93528     81.747849
93529     55.291061
93530     34.913445
93531      9.329406
93532      8.256231
Name: pv_measurement, Length: 2160, dtype: float32

In [91]:
# Align the predictions with the ids listed in the sample submission.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission = sample_submission[['id']].merge(preds[['id', 'prediction']], on='id', how='left')

# A left merge silently leaves NaN wherever an id has no matching prediction;
# fail loudly instead of writing an incomplete submission file.
missing_predictions = int(sample_submission['prediction'].isna().sum())
assert missing_predictions == 0, f"{missing_predictions} submission ids have no prediction"

sample_submission.to_csv('autogluon_oscar.csv', index=False)