# Land Usage Variables Generation

In this notebook we use land usage data from 'Corine Land Cover 2018' to generate a set of exogenous variables for our dataset, which could contribute to better forecasts. The data is available at the following link: https://land.copernicus.eu/api/en/products/corine-land-cover/clc2018

#### Loading libraries

In [None]:
import rasterio
import pyproj
import pandas as pd
import numpy as np
from collections import Counter

#### Loading auxiliary data

In [2]:
# Table of locations to enrich; its 'latitude' and 'longitude' columns are read when sampling the raster
df_coord = pd.read_csv(filepath_or_buffer='auxiliary_data/gw_coordinates_df.csv')

#### Defining all possible land usage values

In [None]:
# Dictionary of allowed soil types: maps the integer code stored in the raster
# to its land cover class name. Only codes listed here are counted by
# get_value_proportions; any other pixel value is ignored.
# NOTE(review): code 45 is labelled 'NODATA' here — confirm against the legend
# shipped with the raster which code(s) it actually uses for missing data.
soil_types = {
    1: 'Continuous urban fabric',
    2: 'Discontinuous urban fabric',
    3: 'Industrial or commercial units',
    4: 'Road and rail networks and associated land',
    5: 'Port areas',
    6: 'Airports',
    7: 'Mineral extraction sites',
    8: 'Dump sites',
    9: 'Construction sites',
    10: 'Green urban areas',
    11: 'Sport and leisure facilities',
    12: 'Non-irrigated arable land',
    13: 'Permanently irrigated land',
    14: 'Rice fields',
    15: 'Vineyards',
    16: 'Fruit trees and berry plantations',
    17: 'Olive groves',
    18: 'Pastures',
    19: 'Annual crops associated with permanent crops',
    20: 'Complex cultivation patterns',
    21: 'Land principally occupied by agriculture with significant areas of natural vegetation',
    22: 'Agro-forestry areas',
    23: 'Broad-leaved forest',
    24: 'Coniferous forest',
    25: 'Mixed forest',
    26: 'Natural grasslands',
    27: 'Moors and heathland',
    28: 'Sclerophyllous vegetation',
    29: 'Transitional woodland-shrub',
    30: 'Beaches dunes sands',
    31: 'Bare rocks',
    32: 'Sparsely vegetated areas',
    33: 'Burnt areas',
    34: 'Glaciers and perpetual snow',
    35: 'Inland marshes',
    36: 'Peat bogs',
    37: 'Salt marshes',
    38: 'Salines',
    39: 'Intertidal flats',
    40: 'Water courses',
    41: 'Water bodies',
    42: 'Coastal lagoons',
    43: 'Estuaries',
    44: 'Sea and ocean',
    45: 'NODATA'
}

### Defining necessary functions

In [1]:
# Lazily-built pyproj transformer shared by every call (building one is comparatively expensive)
_WGS84_TO_LAEA = None


def _wgs84_to_laea_transformer():
    """Return the shared EPSG:4326 -> EPSG:3035 transformer, creating it on first use."""
    global _WGS84_TO_LAEA
    if _WGS84_TO_LAEA is None:
        _WGS84_TO_LAEA = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:3035", always_xy=True)
    return _WGS84_TO_LAEA


def latlon_to_raster_coords(lat, lon, transform):
    """
    Convert latitude/longitude coordinates to pixel coordinates in a raster file.

    The point is first projected from EPSG:4326 (WGS84) to EPSG:3035 (LAEA Europe) and
    then mapped to a pixel through the raster's geotransformation. Only the scale and
    offset terms of the geotransformation are used, so the raster is assumed to have no
    rotation/shear.

    Args:
        lat (float): Latitude of the point.
        lon (float): Longitude of the point.
        transform (Affine): Affine transformation object from the raster file
            (anything indexable like one: [0]=pixel width, [2]=x origin,
            [4]=pixel height, [5]=y origin).

    Returns:
        (int, int): Row and column in the raster grid corresponding to the given latitude/longitude.
        Indices are rounded down, so a point above or to the left of the raster origin
        yields negative indices instead of being folded onto row/column 0.
    """
    # Transform latitude/longitude into projected coordinates (in meters).
    # always_xy=True means the transformer expects (lon, lat) order.
    x, y = _wgs84_to_laea_transformer().transform(lon, lat)

    # Use the raster file's geotransformation to convert projected coordinates to fractional pixel indices
    col = (x - transform[2]) / transform[0]  # x (East) -> column
    row = (y - transform[5]) / transform[4]  # y (North) -> row

    # Floor rather than truncate: int() rounds toward zero, which would map
    # fractional indices in (-1, 0) (i.e. points just outside the raster) to pixel 0.
    return int(np.floor(row)), int(np.floor(col))


def get_value_proportions(dataset, row, col, window_size=3):
    """
    Calculate the proportion of valid soil type values in a window around the given pixel coordinates.

    The window starts at (row - window_size // 2, col - window_size // 2) and spans
    window_size pixels in each direction, so it is exactly centered only for odd sizes.
    Pixels equal to the dataset's nodata value, or whose value is not a key of
    `soil_types`, are excluded from both the counts and the denominator.

    Args:
        dataset (rasterio.DatasetReader): The raster dataset to extract values from.
        row (int): The row index of the central pixel.
        col (int): The column index of the central pixel.
        window_size (int): Size of the window around the central pixel (default is 3).

    Returns:
        dict: A dictionary mapping soil type names to their respective proportions within the window.
              If no valid data is found, the function returns {'no_data': 1.0}.
    """
    # Define the window around (row, col) with the requested size
    half_window = window_size // 2
    window = rasterio.windows.Window(col - half_window, row - half_window, window_size, window_size)

    # Read pixel values of the first band within the window
    # NOTE(review): windows reaching past the raster edge are not handled specially here — confirm
    # how rasterio treats them if points near the border are expected.
    data = dataset.read(1, window=window)

    # Determine the nodata value from the dataset; default to -128 if undefined
    nodata_value = dataset.nodata if dataset.nodata is not None else -128

    # Keep only pixels holding a known soil type code and not flagged as nodata.
    # np.isin must not be told the inputs are unique: window pixels repeat, and with
    # assume_unique=True repeated unknown codes can be reported as members.
    valid_mask = np.isin(data, list(soil_types)) & (data != nodata_value)
    valid_values = data[valid_mask]

    # If no valid data is found, return a proportion of 1.0 for 'no_data'
    if valid_values.size == 0:
        return {'no_data': 1.0}

    # Count the occurrences of each valid code (as plain Python numbers, in first-seen order)
    counts = Counter(valid_values.tolist())

    # Convert counts into shares of the valid pixels, keyed by soil type name
    total = valid_values.size
    return {soil_types[code]: count / total for code, count in counts.items()}

### Get the land usage for each set of coordinates

In [3]:
# Location of the Corine Land Cover GeoTIFF raster
file_path = 'auxiliary_data/U2018_CLC2018_V2020_20u1.tif'

with rasterio.open(file_path) as dataset:
    # Coordinate reference system and geotransformation reported by the raster
    # (the CRS is expected to be EPSG:3035, which latlon_to_raster_coords projects into)
    dst_crs = dataset.crs
    transform = dataset.transform

    # One dictionary of land type proportions per location, in DataFrame row order
    proportion_list = []

    # Walk through the latitude/longitude pairs of the coordinates table
    for lat, lon in zip(df_coord['latitude'], df_coord['longitude']):
        # Locate the pixel that contains the point
        row_raster, col_raster = latlon_to_raster_coords(lat, lon, transform)

        # Land use shares inside a 20x20 pixel window around that pixel
        proportion_list.append(
            get_value_proportions(dataset, row_raster, col_raster, window_size=20)
        )

    # Attach the proportions to the DataFrame as a new column
    df_coord['value_proportions'] = proportion_list

In [4]:
df_coord.head()

Unnamed: 0,id_loc,latitude,longitude,value_proportions
0,324095,48.310278,14.3075,"{'Discontinuous urban fabric': 0.33, 'Industri..."
1,323295,48.330278,14.302778,"{'Discontinuous urban fabric': 0.49, 'Broad-le..."
2,323154,48.283056,14.349444,"{'Pastures': 0.045, 'Water courses': 0.205, 'D..."
3,304535,48.306111,16.872222,"{'Non-irrigated arable land': 0.5825, 'Land pr..."
4,326934,47.915833,16.289167,"{'Non-irrigated arable land': 0.27, 'Land prin..."


### Transform the results into variables of a dataframe

In [5]:
def expand_value_proportions(row):
    """
    Turn the 'value_proportions' dictionary of a row into a Series with one entry per soil type.

    Args:
        row (pd.Series): A row of the DataFrame containing the 'value_proportions' dictionary.

    Returns:
        pd.Series: Indexed by the soil type names present in the dictionary, holding their
                   proportions. Soil types absent from the dictionary are not added here.
    """
    # Copy the name -> proportion pairs as they are; nothing is added for missing soil types
    shares_by_name = {name: share for name, share in row['value_proportions'].items()}
    return pd.Series(shares_by_name)

# Apply the transformation to each row in the DataFrame to expand the soil type proportions into columns
df_expanded = df_coord.apply(expand_value_proportions, axis=1)

# Concatenate the expanded soil type columns with the original DataFrame.
# Columns with the same names left over from a previous run of this cell are dropped first,
# so re-running the cell replaces them instead of appending duplicate columns.
df_coord = pd.concat(
    [df_coord.drop(columns=df_expanded.columns, errors='ignore'), df_expanded],
    axis=1,
)

In [8]:
df_coord.head()

Unnamed: 0,id_loc,latitude,longitude,value_proportions,Airports,Broad-leaved forest,Complex cultivation patterns,Coniferous forest,Construction sites,Continuous urban fabric,...,Non-irrigated arable land,Pastures,Port areas,Road and rail networks and associated land,Sparsely vegetated areas,Sport and leisure facilities,Transitional woodland-shrub,Vineyards,Water bodies,Water courses
0,324095,48.310278,14.3075,"{'Discontinuous urban fabric': 0.33, 'Industri...",0.0,0.0,0.0,0.0,0.0,0.0675,...,0.0,0.0,0.085,0.0,0.0,0.06,0.0,0.0,0.0,0.12
1,323295,48.330278,14.302778,"{'Discontinuous urban fabric': 0.49, 'Broad-le...",0.0,0.025,0.0,0.0,0.0,0.105,...,0.0,0.105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045
2,323154,48.283056,14.349444,"{'Pastures': 0.045, 'Water courses': 0.205, 'D...",0.0,0.23,0.0,0.0,0.0,0.0,...,0.0,0.045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205
3,304535,48.306111,16.872222,"{'Non-irrigated arable land': 0.5825, 'Land pr...",0.0,0.165,0.0,0.0,0.0,0.0,...,0.5825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,326934,47.915833,16.289167,"{'Non-irrigated arable land': 0.27, 'Land prin...",0.0,0.1725,0.0,0.0,0.0,0.0,...,0.27,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Soil types that do not occur around a location have no value after the expansion; store them as 0.
# The fill is applied to every column of the DataFrame.
df_coord = df_coord.fillna(value=0)

### Saving the resulting file

In [9]:
# Write the enriched table to a CSV file, leaving out the DataFrame index
df_coord.to_csv(path_or_buf="land_cover_usage_full.csv", index=False)