### Import libraries

In [7]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
import random
import sklearn
%matplotlib inline

# Let pandas render up to 200 rows and 200 columns before truncating the display.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

### Clean data

In [8]:
# Load the target measurements of each site and tag every row with its location.
train_a = pd.read_parquet('A/train_targets.parquet').assign(location="A")
train_b = pd.read_parquet('B/train_targets.parquet').assign(location="B")
train_c = pd.read_parquet('C/train_targets.parquet').assign(location="C")

# Load the "estimated" feature files, tagged the same way.
X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet').assign(location="A")
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet').assign(location="B")
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet').assign(location="C")


def _prepare_observed(observed):
    """Drop the final row and prepend a ``date_calc`` column copied from ``date_forecast``.

    The final row is removed because, per the original note, it is a surplus
    minute-00 sample. Observed frames get a ``date_calc`` column so that they
    carry the same column the estimated frames are combined on below.
    """
    trimmed = observed.iloc[:-1, :]
    trimmed.insert(0, "date_calc", trimmed["date_forecast"])
    return trimmed


# Load the "observed" feature files, tag them, then trim and add date_calc.
X_train_observed_a = _prepare_observed(
    pd.read_parquet('A/X_train_observed.parquet').assign(location="A")
)
X_train_observed_b = _prepare_observed(
    pd.read_parquet('B/X_train_observed.parquet').assign(location="B")
)
X_train_observed_c = _prepare_observed(
    pd.read_parquet('C/X_train_observed.parquet').assign(location="C")
)

# Stack everything into one frame: observed rows first, then estimated rows.
# The original row labels are kept (no ignore_index), so the index is not unique.
observed_frames = [X_train_observed_a, X_train_observed_b, X_train_observed_c]
estimated_frames = [X_train_estimated_a, X_train_estimated_b, X_train_estimated_c]
X_train_raw = pd.concat(observed_frames + estimated_frames)

# Feature: time in seconds between date_calc and date_forecast.
# .dt.total_seconds() returns float seconds on every pandas version, whereas
# .astype('timedelta64[s]') yields floats on pandas < 2.0 but a timedelta64[s]
# column on pandas >= 2.0, which would break the numeric normalization later on.
X_train_raw["calc_time"] = (
    X_train_raw["date_forecast"] - X_train_raw["date_calc"]
).dt.total_seconds()

# Fill NaNs.
# snow_density is replaced by a presence flag: 0 where the value is missing,
# 1 where a value is present.
X_train_raw["snow_density:kgm3"] = (
    X_train_raw["snow_density:kgm3"].notna().astype("int64")
)

# Missing heights are encoded with the sentinel -1000. The cast to float64
# keeps the dtype that the previous element-wise apply() produced.
X_train_raw["ceiling_height_agl:m"] = (
    X_train_raw["ceiling_height_agl:m"].astype("float64").fillna(-1000)
)
# Fixed: cloud_base_agl:m used to be overwritten with the values of
# ceiling_height_agl:m (copy-paste slip); it is now filled from its own column.
X_train_raw["cloud_base_agl:m"] = (
    X_train_raw["cloud_base_agl:m"].astype("float64").fillna(-1000)
)

# Create separate dataframes for the measurements at minute 00, 15, 30 and 45.
forecast_minute = X_train_raw["date_forecast"].dt.minute


def _rows_at_minute(minute):
    """Return the rows of X_train_raw forecast at the given minute, renumbered from 0."""
    return X_train_raw[forecast_minute == minute].reset_index(drop=True)


X_train00 = _rows_at_minute(0)

# For the quarter-hour frames, strip the first two and last two columns: they
# are already carried by the minute-00 frame and would be redundant after the join.
X_train15 = _rows_at_minute(15).iloc[:, 2:-2]
X_train30 = _rows_at_minute(30).iloc[:, 2:-2]
X_train45 = _rows_at_minute(45).iloc[:, 2:-2]

# Join the four quarter-hour observations into a single sample per hour.
# NOTE(review): the joins align rows purely by their renumbered position, which
# presumes all four frames have the same length and row order — TODO confirm.
first_half_hour = X_train00.join(X_train15, lsuffix="_00", rsuffix="_15")
second_half_hour = X_train30.join(X_train45, lsuffix="_30", rsuffix="_45")
X_train = first_half_hour.join(second_half_hour)

# Rename the forecast timestamp so it matches the key used when merging with the targets.
X_train = X_train.rename(columns={"date_forecast": "time"})

# Stack the target values of all locations and discard rows containing NaN.
targets = pd.concat([train_a, train_b, train_c]).dropna()

# Attach the weather features to every target row (right join keeps all targets).
dataset = X_train.merge(targets, how="right", on=["time", "location"])

# Shuffle the rows reproducibly and renumber them from 0.
dataset = dataset.sample(frac=1, random_state=43).reset_index(drop=True)

# Split into features (all but the last column) and target (the last column).
datasetX = dataset.iloc[:, :-1]
datasetY = dataset.iloc[:, -1:]

# Append day-of-year and hour-of-day features derived from the timestamp.
datasetX = datasetX.assign(
    day=datasetX["time"].dt.day_of_year,
    hour=datasetX["time"].dt.hour,
)

# Boolean mask of the samples that fall in the months of the test dataset.
indexMayJuneJuly = datasetX["time"].dt.month.isin([5, 6, 7])

# One-hot encoding for the categorical feature "location".

# Do not include the data because it could overfit the model
"""
datasetX["location_A"] = datasetX["location"].apply(lambda a : a == "A").map({True: 1, False: 0})
datasetX["location_B"] = datasetX["location"].apply(lambda a : a == "B").map({True: 1, False: 0})
datasetX["location_C"] = datasetX["location"].apply(lambda a : a == "C").map({True: 1, False: 0})
"""

# Since the encoding is disabled, drop the location column as well, then drop
# the first two columns (per the original note: time and date_calc).
datasetX = datasetX.drop(columns="location").iloc[:, 2:]

# Mean and std used for normalization; the same values should also be used for
# normalizing the test data.
# NOTE(review): these statistics are computed over all rows, i.e. before the
# train/eval partition below, so they include the evaluation rows.
dataMean = datasetX.mean()
dataStd = datasetX.std()

# Normalize every column except the last four; NaNs produced by the scaling
# (e.g. a zero standard deviation) become 0.
# NOTE(review): only `day` and `hour` were appended after the weather features,
# so excluding four columns appears to leave the last two feature columns
# unscaled as well — confirm that this is intended.
scaled_features = datasetX.iloc[:, :-4].sub(dataMean[:-4]).div(dataStd[:-4])
datasetX.iloc[:, :-4] = scaled_features.fillna(value=0)

# Partition into training set (first TRAIN_ROWS shuffled rows) and evaluation set (the rest).
TRAIN_ROWS = 85000
trainsetX = datasetX.iloc[:TRAIN_ROWS, :]
trainsetY = datasetY.iloc[:TRAIN_ROWS, :]
trainsetIndexMayJuneJuly = indexMayJuneJuly[:TRAIN_ROWS]
evalsetX = datasetX.iloc[TRAIN_ROWS:, :]
evalsetY = datasetY.iloc[TRAIN_ROWS:, :]
evalsetIndexMayJuneJuly = indexMayJuneJuly[TRAIN_ROWS:]

  datasetX.iloc[:,:-4] = ((datasetX.iloc[:,:-4]-dataMean[:-4])/dataStd[:-4]).fillna(value=0)


### Remove highly correlated features

In [11]:
# Absolute pairwise correlations between the training features.
corr_matrix = trainsetX.corr().abs()

# Keep only the strict upper triangle so that every feature pair is looked at
# once and a feature is never compared with itself.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)

# A column is dropped when its correlation with any earlier column exceeds 0.75.
exceeds_threshold = (upper > 0.75).any(axis=0)
to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

# Remove the same columns from both sets so their layouts stay identical.
trainsetX = trainsetX.drop(columns=to_drop)
evalsetX = evalsetX.drop(columns=to_drop)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85000 entries, 0 to 84999
Data columns (total 35 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   absolute_humidity_2m:gm3_00        85000 non-null  float32
 1   ceiling_height_agl:m_00            85000 non-null  float64
 2   clear_sky_energy_1h:J_00           85000 non-null  float32
 3   dew_or_rime:idx_00                 85000 non-null  float32
 4   direct_rad:W_00                    85000 non-null  float32
 5   effective_cloud_cover:p_00         85000 non-null  float32
 6   elevation:m_00                     85000 non-null  float32
 7   fresh_snow_12h:cm_00               85000 non-null  float32
 8   fresh_snow_1h:cm_00                85000 non-null  float32
 9   is_day:idx_00                      85000 non-null  float32
 10  msl_pressure:hPa_00                85000 non-null  float32
 11  precip_5min:mm_00                  85000 non-null  flo

### Define the split-comparison function

In [19]:
def CompareSplits(df, num_splits=5):
    """Display the column means of consecutive row segments of ``df``.

    The rows are cut into ``num_splits`` consecutive segments of
    ``len(df) // num_splits`` rows each; the final segment additionally takes
    whatever rows remain. One summary row per segment is collected, holding a
    ``'Split'`` label of the form ``'i/num_splits'`` followed by the mean of
    every column in that segment, and the resulting table is shown with
    ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    num_splits : int, default 5
        Number of consecutive segments.

    Returns
    -------
    None
        The summary table is displayed, not returned.
    """
    total_rows = len(df)
    rows_per_split = total_rows // num_splits

    def _summarise(position):
        # position is 0-based; the label shown to the reader is 1-based.
        lower = position * rows_per_split
        is_final = position == num_splits - 1
        # The final segment runs to the end so that no trailing rows are lost.
        upper = total_rows if is_final else lower + rows_per_split
        summary = {'Split': f'{position + 1}/{num_splits}'}
        summary.update(df[lower:upper].mean().to_dict())
        return summary

    comparison_df = pd.DataFrame([_summarise(k) for k in range(num_splits)])
    display(comparison_df)

### Compare splits of the X data

In [20]:
# Show the mean of every training feature column for each of 10 consecutive row segments.
CompareSplits(trainsetX, num_splits=10)

Unnamed: 0,Split,absolute_humidity_2m:gm3_00,ceiling_height_agl:m_00,clear_sky_energy_1h:J_00,dew_or_rime:idx_00,direct_rad:W_00,effective_cloud_cover:p_00,elevation:m_00,fresh_snow_12h:cm_00,fresh_snow_1h:cm_00,is_day:idx_00,msl_pressure:hPa_00,precip_5min:mm_00,precip_type_5min:idx_00,prob_rime:p_00,rain_water:kgm2_00,relative_humidity_1000hPa:p_00,snow_density:kgm3_00,snow_depth:cm_00,snow_drift:idx_00,snow_melt_10min:mm_00,snow_water:kgm2_00,sun_azimuth:d_00,super_cooled_liquid_water:kgm2_00,visibility:m_00,wind_speed_10m:ms_00,wind_speed_u_10m:ms_00,wind_speed_v_10m:ms_00,wind_speed_w_1000hPa:ms_00,calc_time,precip_5min:mm_15,precip_type_5min:idx_15,snow_drift:idx_15,precip_type_5min:idx_30,precip_type_5min:idx_45,day
0,1/10,0.002165,0.012126,-0.013779,0.006378,-0.004402,-0.003266,-0.002747,-0.012274,-0.026083,-0.013655,0.011907,0.004158,-0.016851,0.001448,0.003319,-0.002434,-0.004296,0.000265,-0.004635,0.001538,-0.01304,0.005141,-0.012058,0.006832,-0.005251,-0.008342,0.013946,-0.006567,0.010479,-0.012207,-0.025063,-0.004635,-0.011744,-0.01578,177.323647
1,2/10,-0.019121,0.010753,-0.0133,-0.008935,-0.00354,0.007455,0.00199,0.021416,0.008137,-0.004232,-0.005579,0.023542,0.025594,-0.012249,0.0071,-0.000167,0.016099,0.02555,0.020727,0.011885,0.023857,0.000894,0.017689,-0.004963,-0.001677,-0.013524,0.001321,-0.001138,0.001306,0.021582,0.025658,-0.004636,0.019867,0.029368,177.385059
2,3/10,0.015483,-0.002861,0.008671,-0.007971,0.022147,-0.017505,0.007214,-0.010509,-0.004057,0.02071,-0.002214,-0.002375,-0.004928,-0.001472,0.001402,-0.011535,-0.003728,-0.004293,-0.004635,0.023353,0.005708,-0.007268,-0.000343,0.011128,-0.006422,-0.00159,-0.004622,0.006098,-0.011677,0.008702,0.005035,-0.004635,0.014385,0.010861,174.104941
3,4/10,-0.00066,0.001541,-0.006264,-0.004158,-0.012565,0.01547,-0.019767,-0.002721,0.001123,-0.010815,-0.001641,-0.024197,-0.018176,-0.002397,-0.005003,0.014379,0.009233,0.001224,-0.004637,-0.006673,-0.013444,-0.003912,0.000999,-0.01055,0.005257,0.003166,-0.008872,0.000672,-0.001605,-0.016053,-0.019997,-0.004637,-0.011231,-0.020674,176.446
4,5/10,0.001831,-0.010096,-0.002514,-0.008928,-0.000564,0.001206,-0.00899,-0.008307,0.015882,-0.017656,0.024695,0.018067,0.022282,0.00967,0.014822,0.004171,0.005359,0.007158,-0.004635,0.001265,0.02516,0.01938,0.016859,-0.018328,0.013999,0.009346,-0.00598,0.006098,0.009426,0.01701,0.02054,-0.004635,0.018073,0.008766,176.844118
5,6/10,0.011244,0.004757,-0.003694,0.003508,-0.008593,0.001189,-0.001433,-0.016617,-0.009381,0.005646,-0.009628,-0.004412,-0.008291,-0.001173,0.011535,-0.00384,0.001383,0.002366,-0.004635,-0.007461,-0.004228,0.004159,-0.004881,0.006411,-0.007944,-0.01754,-0.001751,0.00248,-0.007105,-0.012876,0.000779,-0.004635,-0.011744,-0.019073,175.446588
6,7/10,-0.005165,-0.018962,0.006347,0.010202,0.002032,0.006185,0.010568,0.00709,0.011592,-0.012356,-0.011153,-0.011316,-6.2e-05,0.001345,-0.021355,-0.007995,-0.012273,-0.020319,-0.004635,-0.00965,0.000416,-0.001193,0.001602,0.005935,0.004041,-0.001258,0.001686,-0.011994,0.013865,-0.016803,-0.007455,-0.004635,0.002371,0.010835,175.818588
7,8/10,0.008484,-0.011853,0.003869,0.010687,0.010002,-0.016615,0.002097,0.008181,-0.011354,0.000346,0.013816,0.011062,-0.010099,-0.00039,-0.007061,0.000827,-0.032668,-0.014065,-0.004634,-0.006363,-0.002264,-0.005052,-0.005771,0.003271,-0.000949,0.003677,0.003935,-0.006567,-0.012325,0.021465,-0.008924,-0.004634,-0.018481,-0.011264,176.277882
8,9/10,-0.00676,0.003797,0.001172,-0.013236,-0.005821,0.005096,0.004116,0.003802,0.001571,0.010004,-0.010413,0.017942,0.025313,0.01543,0.009044,0.006682,0.011581,0.017175,0.020728,-0.009923,-0.003568,-0.013021,0.002024,-0.001624,0.010563,0.010545,0.00787,0.024192,-0.00252,0.015512,0.00653,0.04609,0.007289,0.001855,175.277294
9,10/10,-0.001296,-0.011264,0.011988,0.026,-0.002637,-0.008729,-0.004462,-0.001618,0.010902,0.010224,0.003211,-0.006144,-0.006991,-0.020464,-0.016046,0.00044,-0.00138,-0.010493,-0.004633,-0.007166,-0.011882,0.005347,-0.008817,0.004801,-0.016133,0.002769,-0.017426,-0.006568,-0.001603,-0.010001,0.005416,-0.004633,-0.007056,0.002557,175.845765


### Compare splits of the Y data

In [21]:
# Show the mean of the training target column for each of 10 consecutive row segments.
CompareSplits(trainsetY, num_splits=10)

Unnamed: 0,Split,pv_measurement
0,1/10,276.833559
1,2/10,278.531293
2,3/10,302.446326
3,4/10,278.821102
4,5/10,281.146568
5,6/10,281.156599
6,7/10,294.722238
7,8/10,287.685722
8,9/10,291.939122
9,10/10,294.345936


### Conclusion

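The per-split means of the last split do not deviate significantly from those of the other splits. Note that the dataset was shuffled before it was partitioned, so each split is a random subset of the data rather than a consecutive time period.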