### Import libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
%matplotlib inline

#show up to 200 rows and 200 columns when a dataframe is displayed
for displayOption in ('display.max_rows', 'display.max_columns'):
    pd.set_option(displayOption, 200)

### Data cleaning and preprocessing

In [None]:
#Read dataset: for each location folder (A, B, C) there is one targets file,
#one estimated-weather file and one observed-weather file
def _read_with_location(site, filename):
    """Read `<site>/<filename>` and append a constant "location" column."""
    frame = pd.read_parquet(f"{site}/{filename}")
    frame["location"] = site
    return frame


def _trim_and_stamp(observed):
    """Drop the last row, then prepend date_calc as a copy of date_forecast."""
    #the last row is dropped; per the original note it is an extra minute 00 sample
    trimmed = observed.iloc[:-1, :]
    #observed data gets a date_calc column equal to date_forecast, placed first
    trimmed.insert(0, "date_calc", trimmed["date_forecast"])
    return trimmed


train_a, train_b, train_c = (
    _read_with_location(site, "train_targets.parquet") for site in "ABC"
)
X_train_estimated_a, X_train_estimated_b, X_train_estimated_c = (
    _read_with_location(site, "X_train_estimated.parquet") for site in "ABC"
)
X_train_observed_a, X_train_observed_b, X_train_observed_c = (
    _trim_and_stamp(_read_with_location(site, "X_train_observed.parquet"))
    for site in "ABC"
)

#stack all weather samples: observed A, B, C first, then estimated A, B, C
X_train_raw = pd.concat(
    [
        X_train_observed_a,
        X_train_observed_b,
        X_train_observed_c,
        X_train_estimated_a,
        X_train_estimated_b,
        X_train_estimated_c,
    ]
)

#feature indicating time between date_calc and date_forecast, in seconds.
#dt.total_seconds() returns plain floats on every pandas version, whereas
#astype('timedelta64[s]') keeps a timedelta dtype on pandas >= 2.0
X_train_raw["calc_time"] = (
    X_train_raw["date_forecast"] - X_train_raw["date_calc"]
).dt.total_seconds()

#snow density is replaced by an indicator: 0 where the value is missing,
#1 where a value is present
X_train_raw["snow_density:kgm3"] = (
    X_train_raw["snow_density:kgm3"].notna().astype("int64")
)

#missing heights are replaced by the sentinel value -1000
X_train_raw["ceiling_height_agl:m"] = X_train_raw["ceiling_height_agl:m"].fillna(-1000)
#fill cloud_base from its own column; it was previously overwritten with a
#copy of ceiling_height_agl:m
X_train_raw["cloud_base_agl:m"] = X_train_raw["cloud_base_agl:m"].fillna(-1000)

#create separate dataframes for measurements at minute 00, 15, 30 and 45
def _rows_at_minute(minute):
    """Rows of X_train_raw whose date_forecast is at `minute`, renumbered from 0."""
    atMinute = X_train_raw["date_forecast"].dt.minute == minute
    return X_train_raw[atMinute].reset_index(drop=True)


X_train00 = _rows_at_minute(0)

#for the other quarters drop the first two and last two columns, which are
#already carried by X_train00 (the last two are location and calc_time; the
#first two are date_calc and, presumably, date_forecast)
X_train15 = _rows_at_minute(15).iloc[:, 2:-2]
X_train30 = _rows_at_minute(30).iloc[:, 2:-2]
X_train45 = _rows_at_minute(45).iloc[:, 2:-2]

#join the four quarter-hour frames into single samples. The join is on the
#row index, so the k-th row of every frame ends up in the same sample
firstHalf = X_train00.join(X_train15, lsuffix="_00", rsuffix="_15")
secondHalf = X_train30.join(X_train45, lsuffix="_30", rsuffix="_45")
X_train = firstHalf.join(secondHalf)

#rename column for merging with targets
X_train = X_train.rename(columns={"date_forecast": "time"})

#stack the targets of all three locations and discard rows containing NaN
targets = pd.concat([train_a, train_b, train_c])
targets = targets.dropna()

#attach weather features to each target pv measurement; the right merge keeps
#every target row, also those without a matching weather sample
dataset = X_train.merge(targets, how="right", on=["time", "location"])

#shuffle rows with a fixed seed and renumber them from 0
dataset = dataset.sample(frac=1, random_state=43)
dataset = dataset.reset_index(drop=True)

#split into features and targets; the target is the last column.
#copy() makes datasetX an independent frame, so the columns added below are
#not written onto a slice of `dataset` (avoids SettingWithCopyWarning)
datasetX = dataset.iloc[:, :-1].copy()
datasetY = dataset.iloc[:, -1:]

#add day_of_year and hour feature columns
datasetX["day"] = datasetX["time"].dt.day_of_year
datasetX["hour"] = datasetX["time"].dt.hour

#boolean mask of samples in the months of the test dataset (May, June, July)
indexMayJuneJuly = datasetX["time"].dt.month.isin([5, 6, 7])

#one-hot encoding for categorical feature "location"
for site in ("A", "B", "C"):
    datasetX[f"location_{site}"] = (datasetX["location"] == site).astype("int64")

datasetX = datasetX.drop("location", axis=1)

#NOTE: an earlier variant kept date_calc as an "is_not_calculated" feature and
#label-encoded location as 0/1/2; both approaches were disabled.

#drop the first two columns (date_calc and time) by position
datasetX = datasetX.iloc[:, 2:]

#calculate mean and std for normalizing data; the same values should also be
#used for normalizing test data
dataMean = datasetX.mean()
dataStd = datasetX.std()

#normalize every column except the last four. NaN results (missing values or
#zero std) are set to 0.
#Whole columns are assigned by label so they simply take the float result;
#writing floats through .iloc into integer columns relies on implicit
#upcasting, which newer pandas versions deprecate.
#NOTE(review): with the columns added earlier, the last four appear to be hour
#and the three location columns, so "day" is scaled but "hour" is not - confirm
#that this is intended.
scaledColumns = datasetX.columns[:-4]
datasetX[scaledColumns] = (
    (datasetX[scaledColumns] - dataMean[scaledColumns]) / dataStd[scaledColumns]
).fillna(value=0)


#partition into training and evalset: the first trainSize rows are used for
#training, the remainder for evaluation
trainSize = 85000

trainsetX = datasetX.iloc[:trainSize, :]
trainsetY = datasetY.iloc[:trainSize, :]
trainsetIndexMayJuneJuly = indexMayJuneJuly.iloc[:trainSize]

evalsetX = datasetX.iloc[trainSize:, :]
evalsetY = datasetY.iloc[trainSize:, :]
evalsetIndexMayJuneJuly = indexMayJuneJuly.iloc[trainSize:]

display(datasetX)