# Preprocessing Real data

The reason for fetching the real data is that the CERRA data is quite similar across units and differs from the Statkraft data. The real data is used to double-check that there are no bugs in how the data is fetched.

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import xarray as xr

import datetime
import pandas as pd
from shapely.geometry import Point

import xarray as xr
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pyproj
from pyproj import CRS, Transformer
from scipy.spatial import cKDTree


import dask

import os


In [None]:
# Read the target catchment geometry from the Statkraft shapefile.
shp_file = "data/raw/catchment_statkraft/catchment.shp"
catchment = gpd.read_file(filename=shp_file)

# Print the CRS the file was read with so it can be compared with the
# other layers used further down.
print(catchment.crs)

# Quick visual sanity check (trailing ';' suppresses the notebook echo).
catchment.plot();

In [None]:
# Read the Norway-wide polygon layer (presumably municipalities, judging by
# the "Kommune" file name -- TODO confirm).
shp_file_norway = "data/raw/other_data/norwayshapefiles/Kommune_FLATE.shp"
catchment_norway = gpd.read_file(shp_file_norway)

# Normalise the attribute names in one pass: lower-case, then trim
# surrounding whitespace, so later lookups such as "navn" are predictable.
catchment_norway.columns = catchment_norway.columns.str.lower().str.strip()

# Print the CRS of this layer for comparison with the target catchment.
print(catchment_norway.crs)

# Quick visual sanity check (trailing ';' suppresses the notebook echo).
catchment_norway.plot();

In [None]:
# Country-wide overview: every polygon of the Norway layer in its own colour,
# with the target catchment outlined on top.
fig, ax = plt.subplots(figsize=(12, 14))

# Styling of the background layer, collected in one place.
norway_style = dict(
    categorical=True,
    cmap="tab20",  # qualitative palette, easy to tell neighbours apart
    legend=False,
    edgecolor="black",
    linewidth=0.5,
    alpha=0.7,
)
catchment_norway.plot(ax=ax, **norway_style)

# Target catchment: outline only, thick red line, labelled for the legend.
catchment.boundary.plot(ax=ax, edgecolor="red", linewidth=2, label="Target Catchment")

# Cosmetics: title, no axis frame, legend, tight layout.
ax.set_title("Norway Catchments with Highlighted Target Catchment", fontsize=16)
ax.set_axis_off()
plt.legend()
plt.tight_layout()
plt.show();


In [None]:
# Zoomed-in view: the Norway layer coloured by region name, the target
# catchment outlined in red, and a text label on every region that
# intersects the catchment.
# NOTE(review): both layers are drawn as-is; this assumes `catchment` and
# `catchment_norway` share a CRS (each is printed in the cell that loads it)
# -- confirm.
fig, ax = plt.subplots(figsize=(10, 12))
name = "navn"

# Colour by the name column when it exists; otherwise fall back to the
# default per-geometry colouring.
catchment_norway.plot(
    ax=ax,
    column=name if name in catchment_norway.columns else None,
    categorical=True,
    cmap="tab20",
    edgecolor="black",
    linewidth=0.5,
    alpha=0.7
)

# Overlay the target catchment with a red border.
catchment.boundary.plot(ax=ax, edgecolor="red", linewidth=2, label="Target Catchment")

# Zoom to the catchment's bounding box plus a 5 % margin on each side.
minx, miny, maxx, maxy = catchment.total_bounds
pad_x = (maxx - minx) * 0.05
pad_y = (maxy - miny) * 0.05
ax.set_xlim(minx - pad_x, maxx + pad_x)
ax.set_ylim(miny - pad_y, maxy + pad_y)

# Merge the catchment geometry once; it is reused for the intersection
# filter and for every label position below.
catchment_union = catchment.union_all()

# Keep only the regions that intersect the catchment.
nearby_regions = catchment_norway[catchment_norway.intersects(catchment_union)]

if name in nearby_regions.columns:
    for _, row in nearby_regions.iterrows():
        # Place the label at the centroid of the part of the region that
        # lies inside the catchment, so it stays within the zoomed view.
        label_point = row.geometry.intersection(catchment_union).centroid
        ax.text(label_point.x, label_point.y, str(row[name]), fontsize=9, ha='center', color="black")

# Title and formatting.
ax.set_title("Zoomed-In Catchment Area with Nearby Region Labels", fontsize=14)
ax.set_axis_off()
plt.legend()
plt.tight_layout();
plt.show();


### Real data is provided per year: preprocessing and concatenating
#### Skip if real data is already fetched

In [None]:
# Folder holding the NetCDF files that are concatenated further down.
folder_path = "data/raw/other_data/sde"

# Full paths of every '*.nc' entry in that folder (in os.listdir order).
nc_files = [
    os.path.join(folder_path, entry)
    for entry in os.listdir(folder_path)
    if entry.endswith('.nc')
]

# Open each file as an xarray Dataset and keep them all for concatenation.
ds_list = [xr.open_dataset(path) for path in nc_files]


In [None]:
def check_all_nan(nc_file):
    """Report whether the ``snow_depth`` variable of a NetCDF file is entirely NaN.

    Args:
        nc_file: Path of the NetCDF file to inspect.

    Returns:
        ``True`` if every ``snow_depth`` value is null, ``False`` if at least
        one value is present, or ``None`` (after printing a notice) when the
        file has no ``snow_depth`` variable.
    """
    # Open inside a context manager so the file handle is released even when
    # this is called for many files in a row.
    with xr.open_dataset(nc_file) as ds:
        if 'snow_depth' not in ds:
            print(f"No snow_depth variable found in {nc_file}")
            return None
        # .item() turns the 0-d result into a plain Python boolean.
        return ds['snow_depth'].isnull().all().item()

# Run the NaN check on every NetCDF file and print one line per file.
# Files without a snow_depth variable (result None) were already reported by
# check_all_nan itself, so they are skipped here.
for nc_file in nc_files:
    all_nan = check_all_nan(nc_file)
    if all_nan is None:
        continue
    if not all_nan:
        print(f"Not all values in snow_depth are NaN in {nc_file}")
    else:
        print(f"All values in snow_depth are NaN in {nc_file}")

In [None]:
# Concatenate the per-file datasets along the time dimension.
ds_daily_concat = xr.concat(ds_list, dim='time')

# Sort by time, since the files are not opened in any guaranteed order.
ds_all = ds_daily_concat.sortby('time')

# Write to the same path the notebook later reads the concatenated file from
# ("data/raw/snow_depth_concatenated.nc"); writing to the working directory
# left the load step pointing at a file this cell never produced.
ds_all.to_netcdf("data/raw/snow_depth_concatenated.nc")

### Load the concatenated real data (.nc)

In [None]:
# Open the concatenated snow-depth dataset from disk; this is the entry point
# when the per-year preprocessing above is skipped.
ds_all = xr.open_dataset("data/raw/snow_depth_concatenated.nc")

In [None]:
# The dataset carries its projection as a proj4 string on the 'UTM_Zone_33'
# variable; build a pyproj CRS from it.
grid_mapping = ds_all['UTM_Zone_33']
proj4_str = grid_mapping.attrs['proj4']
netcdf_crs = CRS.from_proj4(proj4_str)
print("Extracted CRS from NetCDF:", netcdf_crs)

# Catchment geometry expressed in the dataset's CRS.
catchments_proj = catchment.to_crs(netcdf_crs)

In [None]:
# Select time slice (e.g., first time step)
ds_all.sel(time='2015-01-01')['snow_depth'].plot(figsize = (18,15))
plt.show()

In [None]:
# Snow-depth field to draw: the LAST time step in the dataset.
sde = ds_all['snow_depth'].isel(time=-1)
x = ds_all['x'].values  # projected x coordinates of the grid
y = ds_all['y'].values  # projected y coordinates of the grid

fig, ax = plt.subplots(figsize=(18, 15))

# Pass the coordinates straight to pcolormesh and let it centre each cell on
# its coordinate (shading="nearest"). Rebuilding edges with
# linspace(min, max, n + 1) squeezed the grid by half a cell on every side
# and silently assumed ascending coordinates, which flips the map if an axis
# is stored in descending order.
# Assumes x/y are cell-centre coordinates and snow_depth is laid out as
# (y, x) -- TODO confirm.
c = ax.pcolormesh(x, y, sde.values, cmap="viridis", shading="nearest")

# Colour bar.
cb = plt.colorbar(c, ax=ax)
cb.set_label("Snow Depth (cm)")

# Overlay the catchment in the dataset's CRS (catchments_proj), so the
# outline lines up with the projected grid.
catchments_proj.boundary.plot(ax=ax, edgecolor="red", linewidth=2, label="Catchment Boundary")

# Formatting.
ax.set_xlabel("X Coordinate (meters)")
ax.set_ylabel("Y Coordinate (meters)")
ax.set_title("Corrected Snow Depth Map with Catchment Overlay")
ax.set_aspect('equal')
plt.legend()

plt.show()


#### Zooming in 

In [None]:
# Same map as the full-extent plot, zoomed to the catchment.

padding = 0.3  # margin around the catchment, as a fraction of its extent
# Snow-depth field to draw: the LAST time step in the dataset.
sde = ds_all['snow_depth'].isel(time=-1)
x = ds_all['x'].values  # projected x coordinates of the grid
y = ds_all['y'].values  # projected y coordinates of the grid

fig, ax = plt.subplots(figsize=(18, 15))

# Pass the coordinates straight to pcolormesh and let it centre each cell on
# its coordinate (shading="nearest"). Rebuilding edges with
# linspace(min, max, n + 1) squeezed the grid by half a cell on every side
# and silently assumed ascending coordinates.
# Assumes x/y are cell-centre coordinates and snow_depth is laid out as
# (y, x) -- TODO confirm.
c = ax.pcolormesh(x, y, sde.values, cmap="viridis", shading="nearest")

# Colour bar.
cb = plt.colorbar(c, ax=ax)
cb.set_label("Snow Depth (cm)")

# Overlay the catchment in the dataset's CRS (catchments_proj), so the
# outline lines up with the projected grid.
catchments_proj.boundary.plot(ax=ax, edgecolor="red", linewidth=2, label="Catchment Boundary")

# Bounding box of the catchment, taken in the same CRS as the plotted grid.
minx, miny, maxx, maxy = catchments_proj.total_bounds

# Margin so the outline is not clipped at the figure edge.
pad_x = (maxx - minx) * padding
pad_y = (maxy - miny) * padding

# Zoom to the catchment extent.
ax.set_xlim(minx - pad_x, maxx + pad_x)
ax.set_ylim(miny - pad_y, maxy + pad_y)

# Formatting.
ax.set_xlabel("X Coordinate (meters)")
ax.set_ylabel("Y Coordinate (meters)")
ax.set_title("Corrected Snow Depth Map with Catchment Overlay")
ax.set_aspect('equal')
plt.legend()

plt.show()


In [None]:
# Load the CERRA table and order its rows by coordinate.
cerra_df = pd.read_csv("cerra_snow_depth_ordered.csv").sort_values(
    by=['latitude', 'longitude']
)

# One row per distinct (latitude, longitude) pair, with short column names.
coords_df = (
    cerra_df[['latitude', 'longitude']]
    .drop_duplicates()
    .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
)

# Show the distinct coordinate pairs.
print("Unique latitude and longitude pairs:")
print(coords_df)

In [None]:
# Load the processed CERRA table (this replaces any cerra_df loaded earlier).
cerra_df = pd.read_csv("data/processed/cerra_processed.csv")

# Distinct (latitude, longitude) pairs in file order, renamed to lat/lon.
unique_pairs = cerra_df[['latitude', 'longitude']].drop_duplicates()
coords_df = unique_pairs.rename(columns={'latitude': 'lat', 'longitude': 'lon'})

# Show the distinct coordinate pairs.
print("Unique latitude and longitude pairs:")
print(coords_df)

In [None]:
# Alias: the cells below refer to the loaded snow-depth dataset as `real_ds`
# (same object as ds_all, not a copy).
real_ds = ds_all

### Create a geodataframe from real_ds with latitudes and longitudes

In [None]:
# Build a point GeoDataFrame with one row per grid cell of real_ds:
# geographic position plus the snow depth at the first time step.
# Assumes 'lat' and 'lon' have the same shape as one time slice of
# 'snow_depth', so the flattened arrays line up -- TODO confirm.
lat_vals = real_ds['lat'].values
lon_vals = real_ds['lon'].values
snow_vals = real_ds['snow_depth'].isel(time=0).values  # only time index 0 for now

# Flatten everything to 1-D in the same element order.
lat_flat = lat_vals.ravel()
lon_flat = lon_vals.ravel()
snow_flat = snow_vals.ravel()

# Build the point geometries in one vectorised call rather than constructing
# a shapely Point per grid cell in Python. Points are (x=lon, y=lat) in
# geographic coordinates, hence EPSG:4326.
gdf_real = gpd.GeoDataFrame({
    'lat': lat_flat,
    'lon': lon_flat,
    'snow_depth': snow_flat
}, geometry=gpd.points_from_xy(lon_flat, lat_flat), crs='EPSG:4326')

In [None]:
# Your lat/lon pairs from earlier
gdf_targets = gpd.GeoDataFrame(
    coords_df,
    geometry=gpd.points_from_xy(coords_df['lon'], coords_df['lat']),
    crs='EPSG:4326'
)

In [None]:
# Reproject both to UTM 33N
real_ds_proj = gdf_real.to_crs("EPSG:32633")
gdf_targets_proj = gdf_targets.to_crs("EPSG:32633")

# Now perform nearest join
matched = gpd.sjoin_nearest(gdf_targets_proj, real_ds_proj, how="left", distance_col="distance_m")

In [None]:
# Preview the first rows of the nearest-neighbour join result.
matched.head()

In [None]:

# Match every distinct CERRA coordinate to the closest grid cell of real_ds
# using a KD-tree over the grid's (lat, lon) pairs.
# NOTE(review): distances are Euclidean in raw degrees, so a degree of
# longitude is weighted like a degree of latitude -- confirm this is
# acceptable at these latitudes.

# Grid coordinates as an (n_cells, 2) array of [lat, lon], in row-major order.
lat2d = real_ds['lat'].values
lon2d = real_ds['lon'].values
flat_coords = np.stack([lat2d, lon2d], axis=-1).reshape(-1, 2)

# Spatial index over all grid cells.
tree = cKDTree(flat_coords)

# Distinct CERRA (latitude, longitude) pairs, in order of first appearance.
cerra_coords = cerra_df[['latitude', 'longitude']].drop_duplicates().to_numpy()

# Nearest grid cell per CERRA point; convert the flat index back to the
# two grid indices of the lat/lon arrays.
distances, flat_indices = tree.query(cerra_coords)
y_idx, x_idx = np.unravel_index(flat_indices, lat2d.shape)

In [None]:
# Lookup table: one row per distinct CERRA coordinate with the grid cell it
# was matched to. 'unit' is simply the running index of the coordinate pair.
# NOTE(review): presumably this index is meant to line up with the 'unit'
# column used in the CERRA/Statkraft tables -- confirm.
matched_lat = lat2d[y_idx, x_idx]
matched_lon = lon2d[y_idx, x_idx]
n_units = len(cerra_coords)

grid_map_columns = {
    'unit': np.arange(n_units),
    'cerra_lat': cerra_coords[:, 0],
    'cerra_lon': cerra_coords[:, 1],
    'grid_y': y_idx,
    'grid_x': x_idx,
    'real_lat': matched_lat,
    'real_lon': matched_lon,
    'distance_deg': distances,  # KD-tree distance, in degree space
}
cerra_to_grid_map = pd.DataFrame(grid_map_columns)

In [None]:
# Extract the full snow-depth time series at each matched grid cell and
# store them in long format: one row per (time, unit).
time = real_ds['time'].values
snow = real_ds['snow_depth']

# One small frame per unit instead of one dict per record.
# Positional indexing assumes snow_depth is laid out as (time, y, x)
# -- TODO confirm.
unit_frames = []
for unit, row_idx, col_idx in zip(cerra_to_grid_map['unit'], y_idx, x_idx):
    series = snow[:, row_idx, col_idx].values  # shape: (time,)
    unit_frames.append(pd.DataFrame({
        'time': time,
        'unit': unit,
        # Stored as 'real_sde' from the start: the scaling below and the
        # later pivot both read this column (the old 'sde' key raised
        # KeyError on the next statement).
        'real_sde': series,
    }))

if unit_frames:
    real_df = pd.concat(unit_frames, ignore_index=True)
else:
    # No matched units: keep the expected schema so the steps below still run.
    real_df = pd.DataFrame(columns=['time', 'unit', 'real_sde'])

# Scale by 1/100 (presumably cm -> m, matching the "cm" colour-bar label used
# for this variable -- confirm).
real_df['real_sde'] = real_df['real_sde'] / 100
real_df.to_csv("real_snow_depth_ordered.csv", index=False)

### Open Real, Statkraft and CERRA dataframes and plot

In [None]:
# Pivot the DataFrame to have units as columns
real_df = pd.read_csv("real_snow_depth_ordered.csv")
real_df['time'] = pd.to_datetime(real_df['time'])
pivot_real = real_df.pivot_table(index='time', columns='unit', values='real_sde')
pivot_real = pivot_real[pivot_real.index <= "2020-03-01"]

statkraft_df = pd.read_csv("statkraft_snow_depth_ordered.csv")
statkraft_df['time'] = pd.to_datetime(statkraft_df['time'])
cerra_df['time'] = pd.to_datetime(cerra_df['time'])
# Pivot the DataFrame to have units as columns
pivot_cerra = cerra_df.pivot_table(index='time', columns='unit', values='cerra_sde')
# Pivot the DataFrame to have units as columns
pivot_statkraft = statkraft_df.pivot_table(index='time', columns='unit', values='statkraft_sde')

In [None]:
# Plot the time series for each unit with subplots
fig, axes = plt.subplots(nrows=len(pivot_cerra.columns), 
                         ncols=1, 
                         sharex=True, 
                         figsize=(15, 2 * len(pivot_cerra.columns)))

for i, unit in enumerate(pivot_cerra.columns):
    ax = axes[i]
    pivot_cerra[unit].plot(ax=ax, label='CERRA', color='green')
    pivot_statkraft[unit].plot(ax=ax, label='Statkraft', color='orange')
    pivot_real[unit].plot(ax=ax, label='real', color='blue')
    
    ax.set_title(f'Unit {unit}')
    ax.legend()

# Set the x-axis label for the last subplot
axes[-1].set_xlabel('Time')

# Adjust layout
plt.tight_layout()

# Display the plot
plt.show()