<div align="center">
    <h1><b>1. Priprema podataka</b></h1>
</div>


In [15]:
library(rhdf5)
library(tidyr)
library(dplyr)
library(arrow)

In [16]:
# Path of the raw NSRDB HDF5 file that the cells below read from.
RAW_DATASET_PATH <- "../data/nsrdb_puerto_rico_2017.h5"
# Path of the Parquet file that the transformed data set is written to.
TRANSFORMED_DATASET_PATH <- "../data/nsrdb_puerto_rico_2017_transformed.parquet"

In [17]:
# List the contents of the HDF5 file (group, name, object type, data class and
# dimensions of every entry) and display the listing.
datasets <- h5ls(RAW_DATASET_PATH)
datasets

Unnamed: 0_level_0,group,name,otype,dclass,dim
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
0,/,air_temperature,H5I_DATASET,INTEGER,2480 x 105120
1,/,clearsky_dhi,H5I_DATASET,INTEGER,2480 x 105120
2,/,clearsky_dni,H5I_DATASET,INTEGER,2480 x 105120
3,/,clearsky_ghi,H5I_DATASET,INTEGER,2480 x 105120
4,/,coordinates,H5I_DATASET,FLOAT,2 x 2480
5,/,dhi,H5I_DATASET,INTEGER,2480 x 105120
6,/,dni,H5I_DATASET,INTEGER,2480 x 105120
7,/,ghi,H5I_DATASET,INTEGER,2480 x 105120
8,/,meta,H5I_DATASET,COMPOUND,2480
9,/,solar_zenith_angle,H5I_DATASET,INTEGER,2480 x 105120


<br>
<br>
<div align="center">
    <h3> <b> Ispitivanje atributa obilježja skupa podataka </b> </h3>
</div>

---
<br>

In [5]:
# Convert a list of HDF5 attributes into a long data.frame.
#
# Arguments:
#   attr_list    - list of attributes (one element per attribute); may be
#                  empty or NULL.
#   dataset_name - label stored in the `Dataset` column of every row.
#
# Returns a data.frame with character columns `Dataset`, `Attribute` and
# `Value` and exactly one row per attribute. An empty attribute list yields a
# zero-row data.frame with the same three columns, so the result can always be
# passed to rbind().
attrs_to_df <- function(attr_list, dataset_name) {
  # Collapse every attribute to a single string so that a multi-valued
  # attribute cannot make `Value` longer than `Attribute`.
  values <- vapply(
    attr_list,
    function(value) paste(as.character(unlist(value)), collapse = ", "),
    character(1)
  )

  attr_names <- names(attr_list)
  if (is.null(attr_names)) {
    # Unnamed (or empty) input: keep the column, one NA per attribute.
    attr_names <- rep(NA_character_, length(attr_list))
  }

  data.frame(
    # Replicate explicitly; a length-1 label next to zero attributes would
    # otherwise make data.frame() fail.
    Dataset = rep(dataset_name, length(attr_list)),
    Attribute = attr_names,
    Value = unname(values),
    stringsAsFactors = FALSE
  )
}

In [6]:
# Names of every dataset whose HDF5 attributes are inspected.
attribute_dataset_names <- c(
  "air_temperature",
  "clearsky_dhi",
  "clearsky_dni",
  "clearsky_ghi",
  "coordinates",
  "dhi",
  "dni",
  "ghi",
  "solar_zenith_angle",
  "surface_albedo",
  "surface_pressure",
  "time_index",
  "total_precipitable_water",
  "wind_speed"
)

# Read the attributes of each dataset and convert them to the long
# Dataset/Attribute/Value layout, one data.frame per dataset.
attribute_frames <- lapply(attribute_dataset_names, function(dataset_name) {
  attrs_to_df(
    h5readAttributes(RAW_DATASET_PATH, paste0("/", dataset_name)),
    dataset_name
  )
})

# Stack the per-dataset frames in the order listed above.
all_attrs_df <- do.call(rbind, attribute_frames)

all_attrs_df

Dataset,Attribute,Value
<chr>,<chr>,<chr>
air_temperature,psm_scale_factor,1
air_temperature,units,Celsius
clearsky_dhi,psm_scale_factor,1
clearsky_dhi,units,W/m2
clearsky_dni,psm_scale_factor,1
clearsky_dni,units,W/m2
clearsky_ghi,psm_scale_factor,1
clearsky_ghi,units,W/m2
coordinates,description,"(latitude, longitude)"
dhi,psm_scale_factor,1


<br>
<br>


<div align="center">
    <h3> <b> O skupu podataka </b> </h3>
</div>


---

<br>

Za svako obilježje skupa podataka, <i> Tabela 1 </i> (izvor: <a href="https://nsrdb.nrel.gov/data-sets/us-data"> nsrdb.nrel.gov/data-sets</a>) prikazuje: naziv obilježja, tip, mjernu jedinicu i značenje:

<div align="center">

<br>

| Naziv obilježja              | Tip       | Mjerna jedinica            | Značenje                                      |
|:----------------------------:|:--------:|:-------------------------------:|:--------------------------------------------:|
| air_temperature               | INTEGER  | Celsius                        | Temperatura vazduha na lokaciji u datom trenutku |
| clearsky_dhi                  | INTEGER  | W/m²                           | Modelovana maksimalna difuzna horizontalna iradijansa na površini pod pretpostavkom vedrog neba|
| clearsky_dni                  | INTEGER  | W/m²                           | Modelovana maksimalna direktna normalna iradijansa na površini pod pretpostavkom vedrog neba|
| clearsky_ghi                  | INTEGER  | W/m²                           | Modelovana maksimalna globalna horizontalna iradijansa na površini pod pretpostavkom vedrog neba |
| coordinates                   | FLOAT    | (decimal, decimal)          | Geografske koordinate(lat, long)       |
| dhi                           | INTEGER  | W/m²                           | Difuzna horizontalna iradijansa              |
| dni                           | INTEGER  | W/m²                           | Direktna normalna iradijansa                 |
| ghi                           | INTEGER  | W/m²                           | Globalna horizontalna iradijansa            |
| solar_zenith_angle            | INTEGER  | degrees                    | Ugao sunca u odnosu na vertikalu |
| surface_albedo                | INTEGER  | -                 | Reflektivnost površine                         |
| surface_pressure              | INTEGER  | mbar              | Atmosferski pritisak                           |
| time_index                    | STRING   | UTC / GMT                       | Vremenski indeks mjerenja                       |
| total_precipitable_water      | INTEGER  | mm                   | Ukupna količina vodene pare u atmosferi       |
| wind_speed                    | INTEGER  | m/s                     | Brzina vjetra na lokaciji                        |

<i> Tabela 1 </i>

</div>

<br>
<br>


<div align="center">
    <h3> <b> Strukture podataka obilježja </b> </h3>
</div>

---

<br>


- Obilježje **`coordinates`** ima dimenzije **2 × 2480**, što odgovara geografskim lokacijama svake mjerne stanice (senzora).
- Obilježje **`time_index`** ima dimenzije **1 × 105120** i predstavlja vremenske trenutke mjerenja.  
- **Senzorska obilježja** imaju dimenzije **2480 × 105120**.  

Struktura senzorskih obilježja(<i>Matrica 1</i>) može se posmatrati kao matrica gdje:  
- vrste (redovi) označavaju senzore / njihove koordinate,  
- kolone označavaju vremenske trenutke mjerenja.  

<div align="center">

<br>

$$
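\begin{matrix}
 & time\ index\ 1 & time\ index\ 2 & \dots & time\ index\ 105120 \\
senzor\ 1 & x_{1,1} & x_{1,2} & \dots & x_{1,105120} \\
senzor\ 2 & x_{2,1} & x_{2,2} & \dots & x_{2,105120} \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
senzor\ 2480 & x_{2480,1} & x_{2480,2} & \dots & x_{2480,105120}
\end{matrix}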
$$

<i> Matrica 1 </i>
</div>

<br>
<br>

Cilj je transformisati skup podataka u <i> **long/tidy format** </i>, gdje svaka vrsta predstavlja jednu kombinaciju **uređaj × vrijeme × senzorska obilježja**

<br>
<br>


<div align="center">
    <h3> <b> Cilj transformacije - Željena struktura podataka obilježja </b> </h3>
</div>

---

Skup podataka je potrebno transformisati u **long/tidy format**, gdje svaka vrsta predstavlja jednu kombinaciju:

**`uređaj × vrijeme × senzorska obilježja`**

Dimenzije transformisanog skupa podataka biće:  
- **Broj redova:** `2480 × 105120 = 260,697,600`  
- **Broj kolona:** `16` (`device_id, lat, lon, time_index, air_temperature, clearsky_dhi, clearsky_dni, clearsky_ghi, dhi, dni, ghi, solar_zenith_angle, surface_albedo, surface_pressure, total_precipitable_water, wind_speed`)  

<br>
<br>

> **Napomena**  
> - *Long/tidy format* je struktura u kojoj svaka kolona predstavlja jednu varijablu, a svaki red jednu opservaciju.  
> - *Senzorska obilježja* uključuju:  
>   `air_temperature, clearsky_dhi, clearsky_dni, clearsky_ghi, dhi, dni, ghi, solar_zenith_angle, surface_albedo, surface_pressure, total_precipitable_water, wind_speed`.  


<br>
<br>


<div align="center">
    <h3> <b> Učitavanje podataka i transformacija u <i> long/tidy format </i> </b> </h3>
</div>

---


In [7]:
# Coordinates matrix: row 1 is used as latitude and row 2 as longitude, with
# one column per device.
coords <- h5read(RAW_DATASET_PATH, "coordinates")

sensor_coords_df <- data.frame(
  # seq_len() gives an empty sequence (not c(1, 0)) if there are no columns.
  device_id = seq_len(ncol(coords)),
  lat = coords[1, ],
  lon = coords[2, ]
)

# Read one dataset from the raw file and flatten it to a plain vector.
# as.vector() flattens a matrix in column-major order (rows vary fastest).
read_dataset_as_vector <- function(dataset_name) {
  as.vector(h5read(RAW_DATASET_PATH, dataset_name))
}

# Transform time_index to POSIXct timestamp
# NOTE(review): timestamp_vec is not referenced by the other visible cells;
# the long-format table is built from the raw time_index_vec. Confirm which
# representation is intended.
time_index_vec <- read_dataset_as_vector("time_index")
timestamp_vec <- as.POSIXct(time_index_vec, origin = "1970-01-01", tz = "UTC")

# Flattened measurement datasets, one vector per variable.
air_temperature_vec <- read_dataset_as_vector("air_temperature")
clearsky_dhi_vec <- read_dataset_as_vector("clearsky_dhi")
clearsky_dni_vec <- read_dataset_as_vector("clearsky_dni")
clearsky_ghi_vec <- read_dataset_as_vector("clearsky_ghi")
dhi_vec <- read_dataset_as_vector("dhi")
dni_vec <- read_dataset_as_vector("dni")
ghi_vec <- read_dataset_as_vector("ghi")
solar_zenith_angle_vec <- read_dataset_as_vector("solar_zenith_angle")
surface_albedo_vec <- read_dataset_as_vector("surface_albedo")
surface_pressure_vec <- read_dataset_as_vector("surface_pressure")
total_precipitable_water_vec <- read_dataset_as_vector("total_precipitable_water")
wind_speed_vec <- read_dataset_as_vector("wind_speed")

In [8]:
# Long/tidy table: one row per (device, time step) pair.
n_devices <- nrow(sensor_coords_df)
n_time_steps <- length(time_index_vec)

nsrdb_lt_df <- data.frame(
  # Device columns cycle through every device once per time step ...
  device_id = rep(sensor_coords_df$device_id, times = n_time_steps),
  lat = rep(sensor_coords_df$lat, times = n_time_steps),
  lon = rep(sensor_coords_df$lon, times = n_time_steps),
  # ... while each time step is repeated once for every device.
  time_index = rep(time_index_vec, each = n_devices),
  # Flattened measurement vectors, used as they are.
  air_temperature = air_temperature_vec,
  clearsky_dhi = clearsky_dhi_vec,
  clearsky_dni = clearsky_dni_vec,
  clearsky_ghi = clearsky_ghi_vec,
  dhi = dhi_vec,
  dni = dni_vec,
  ghi = ghi_vec,
  solar_zenith_angle = solar_zenith_angle_vec,
  surface_albedo = surface_albedo_vec,
  surface_pressure = surface_pressure_vec,
  total_precipitable_water = total_precipitable_water_vec,
  wind_speed = wind_speed_vec
)

nsrdb_lt_df

device_id,lat,lon,time_index,air_temperature,clearsky_dhi,clearsky_dni,clearsky_ghi,dhi,dni,ghi,solar_zenith_angle,surface_albedo,surface_pressure,total_precipitable_water,wind_speed
<int>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,18.12,-67.93,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,123,10100,4082,64
2,18.10,-67.93,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,124,10100,4070,64
3,18.08,-67.93,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,124,10100,4112,64
4,18.06,-67.93,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,123,10200,4162,64
5,18.12,-67.91,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,123,10100,4057,64
6,18.10,-67.91,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,126,10100,4045,64
7,18.08,-67.91,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,126,10100,4070,64
8,18.06,-67.91,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,123,10100,4134,64
9,18.12,-67.89,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,121,10100,4077,64
10,18.10,-67.89,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,124,10100,4063,64


<br>
<br>

<div align="center">
    <h3><b>Provjera</b></h3>
</div>

---

Prilikom provjere rasporeda senzora po indeksima, očekujemo sljedeće vrijednosti:

- Na poziciji **1** → `device_id = 1`
- Na poziciji **122** → `device_id = 122`
- Na poziciji **2480** → `device_id = 2480`
- Na poziciji **2481** → `device_id = 1`
- Na poziciji **12405** → `device_id = 5`

In [9]:
# Spot-check the row layout, assuming 2480 devices per time step: rows 1, 122
# and 2480 lie in the first time step, row 2481 starts the second one, and
# row 12405 = 5 * 2480 + 5 is the fifth device of the sixth time step.
nsrdb_lt_df[c(1, 122, 2480, 2481, 12405), ]

Unnamed: 0_level_0,device_id,lat,lon,time_index,air_temperature,clearsky_dhi,clearsky_dni,clearsky_ghi,dhi,dni,ghi,solar_zenith_angle,surface_albedo,surface_pressure,total_precipitable_water,wind_speed
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,1,18.12,-67.93,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,123,10100,4082,64
122,122,18.3,-67.15,2017-01-01 00:00:00+00:00,25,0,0,0,0,0,0,8900,154,10200,4359,48
2480,2480,18.7,-64.27,2017-01-01 00:00:00+00:00,26,0,0,0,0,0,0,8900,92,10200,3868,77
2481,1,18.12,-67.93,2017-01-01 00:05:00+00:00,26,0,0,0,0,0,0,8900,123,10100,4093,64
12405,5,18.12,-67.91,2017-01-01 00:25:00+00:00,26,0,0,0,0,0,0,8900,123,10100,4110,65


<br>
<br>

<div align="center">
    <h3><b>Ispitivanje prisustva nedostajućih vrijednosti</b></h3>
</div>

---

In [10]:
# Count NA values per column, one column at a time, so that no logical matrix
# spanning the whole data frame has to be built. Counts are stored as doubles
# so that their sum cannot overflow the integer range.
na_values_per_column <- vapply(
  nsrdb_lt_df,
  function(column) as.numeric(sum(is.na(column))),
  numeric(1)
)

# The total over the entire data frame is the sum of the per-column counts.
total_na_values <- sum(na_values_per_column)

# One row per column; row names come from the names of the count vector.
na_summary <- data.frame(
  column = names(nsrdb_lt_df),
  na_count = na_values_per_column
)

# scientific = FALSE keeps large totals printed as plain digits.
cat(
  "Count total NA values:",
  format(total_na_values, scientific = FALSE),
  "\n\n"
)
na_summary

Count total NA values: 0 



Unnamed: 0_level_0,column,na_count
Unnamed: 0_level_1,<chr>,<dbl>
device_id,device_id,0
lat,lat,0
lon,lon,0
time_index,time_index,0
air_temperature,air_temperature,0
clearsky_dhi,clearsky_dhi,0
clearsky_dni,clearsky_dni,0
clearsky_ghi,clearsky_ghi,0
dhi,dhi,0
dni,dni,0


<br>
<br>

---

<div align="center">
    <h3><b>Transformacija podataka</b></h3>
</div>

Za klasifikaciju potrebno je izračunati [Indeks čistoće solarne radijacije (eng. *Solar clearness index*)](https://en.wikipedia.org/wiki/Clearness_index), koji se računa kao:

$$
K_t = \frac{\text{GHI}}{\text{GHI}_\text{clear sky}}
$$

Ova formula daje odnos između izmjerene iradijanse i modelovane maksimalne iradijanse na površini pod pretpostavkom vedrog neba.

Noćni sati se filtriraju, tj. pojave gdje je $\text{GHI} = 0$.


In [None]:
# Keep only rows with ghi > 0 and add the clearness index
# sci = ghi / clearsky_ghi.
# NOTE(review): a row with ghi > 0 but clearsky_ghi == 0 would produce
# sci = Inf - confirm that no such rows exist in the data.
nsrdb_lt_df_day <- nsrdb_lt_df %>%
  filter(ghi > 0) %>%
  mutate(sci = ghi / clearsky_ghi)

# Persist the transformed table. Writing is a separate step so that the
# assignment above does not depend on the return value of write_parquet().
write_parquet(nsrdb_lt_df_day, TRANSFORMED_DATASET_PATH)

# Preview the first rows. `nsrdb_lt_df_day.head` is not valid R: it names a
# non-existent object instead of calling head().
head(nsrdb_lt_df_day, n = 10)

device_id,lat,lon,time_index,air_temperature,clearsky_dhi,clearsky_dni,clearsky_ghi,dhi,dni,ghi,solar_zenith_angle,surface_albedo,surface_pressure,total_precipitable_water,wind_speed,sci
<int>,<dbl>,<dbl>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>
2268,17.70,-64.89,2017-01-01 10:55:00+00:00,26,11,25,11,11,25,11,8900,151,10200,3604,74,1
2269,17.68,-64.89,2017-01-01 10:55:00+00:00,26,11,25,11,11,25,11,8899,147,10200,3614,74,1
2270,17.66,-64.89,2017-01-01 10:55:00+00:00,26,11,25,11,11,25,11,8898,149,10200,3621,74,1
2276,17.74,-64.87,2017-01-01 10:55:00+00:00,26,11,29,11,11,29,11,8900,170,10100,3432,74,1
2277,17.72,-64.87,2017-01-01 10:55:00+00:00,26,11,27,11,11,27,11,8899,166,10200,3549,74,1
2278,17.70,-64.87,2017-01-01 10:55:00+00:00,26,11,26,11,11,26,11,8898,143,10200,3577,74,1
2279,17.68,-64.87,2017-01-01 10:55:00+00:00,26,11,26,11,11,26,11,8898,139,10200,3606,74,1
2280,17.66,-64.87,2017-01-01 10:55:00+00:00,26,11,26,11,11,26,11,8897,146,10200,3621,74,1
2285,17.76,-64.85,2017-01-01 10:55:00+00:00,26,11,29,11,11,29,11,8899,161,10100,3434,74,1
2286,17.74,-64.85,2017-01-01 10:55:00+00:00,26,11,31,12,11,31,12,8898,170,10000,3402,74,1


In [12]:
# Number of rows in nsrdb_lt_df_day for each distinct time_index value.
nsrdb_lt_df_day %>%
  group_by(time_index) %>%
  summarise(count = n())

time_index,count
<chr>,<int>
2017-01-01 10:55:00+00:00,132
2017-01-01 11:00:00+00:00,584
2017-01-01 11:05:00+00:00,2337
2017-01-01 11:10:00+00:00,2480
2017-01-01 11:15:00+00:00,2480
2017-01-01 11:20:00+00:00,2480
2017-01-01 11:25:00+00:00,2480
2017-01-01 11:30:00+00:00,2480
2017-01-01 11:35:00+00:00,2480
2017-01-01 11:40:00+00:00,2480


<br>
<br>

---

- **Broj vremenskih trenutaka(mjerenja) sa Suncem (barem na jednoj lokaciji):** 53.224  
- **Broj sunčanih minuta u Portoriku za 2017. godinu:**  
  53.224 × 5 min = 266.120 minuta = 4.435 h ≈ 185 dana  
- **Broj minuta bez sunca u Portoriku za 2017. godinu:**  
  (105.120 − 53.224) × 5 = 259.480 min ≈ 4.325 h ≈ 180 dana  
