Script Description: This script extracts all variables available from the BIS-4D datasets at the NOBV locations and writes them to a single CSV file.

File Name: 01_08_Extract_BIS_4D_Data.ipynb

Date: 2025

Created by: Rob Alamgir

Version: 1.0

References:

#### Import the relevant packages

In [1]:
import os
import glob
import rasterio
import geopandas as gpd
import pandas as pd
from rasterio.transform import rowcol

#### Import the relevant data files

In [5]:
# Folder holding the selected BIS-4D rasters
data_dir = "C:/Data_MSc_Thesis/BIS_4D_Selected/"

# glob returns matches in arbitrary, OS-dependent order. Sort them
# (case-insensitively) so the extracted columns always come out in the
# same order, whatever machine or file system the notebook runs on.
tif_files = sorted(glob.glob(os.path.join(data_dir, "*.tif")), key=str.lower)

# Fail loudly rather than silently exporting a table with no raster columns.
if not tif_files:
    raise FileNotFoundError(f"No .tif rasters found in '{data_dir}'")

# Table of point locations, read from CSV
point_data_path = "C:/Data_MSc_Thesis/NOBV_Site_Data/NOBV_EC_Tower_Data_Final.csv"
point_data = pd.read_csv(point_data_path)
point_data.head()

Unnamed: 0,Site_no,Location_No,Site_ID,EPSG_4326_WGS_84_Longitude_X,EPSG_4326_WGS_84_Latitude_Y,EPSG_32631_WGS 84_X_m,EPSG_32631_WGS 84_Y_m,Elevation_m
0,1,1,ALB_MS,5.902334,53.05356,694512.5721,5882167.358,1.1
1,2,1,ALB_RF,5.904631,53.053385,694667.2798,5882154.181,1.1
2,3,2,AMM,5.903505,53.031374,694691.0225,5879703.421,1.1
3,4,2,AMR,5.902991,53.032245,694652.6416,5879798.861,1.1
4,5,3,ANK_PT,5.097471,52.253916,643168.4419,5791352.667,-1.4


In [4]:
tif_files  # Display the .tif raster paths that glob matched in data_dir

['C:/Data_MSc_Thesis/BIS_4D_Selected\\BD_gcm3_d_0_5_QRF_pred_mean.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\BD_gcm3_d_5_15_QRF_pred_mean.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\clay_per_d_0_5_QRF_pred_mean_processed.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\clay_per_d_5_15_QRF_pred_mean_processed.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\sand_per_d_0_5_QRF_pred_mean_processed.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\sand_per_d_5_15_QRF_pred_mean_processed.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\silt_per_d_0_5_QRF_pred_mean_processed.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\silt_per_d_5_15_QRF_pred_mean_processed.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\SOM_per_2020_d_0_5_QRF_pred_mean.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\SOM_per_2020_d_5_15_QRF_pred_mean.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\SOM_per_2023_d_0_5_QRF_pred_mean.tif',
 'C:/Data_MSc_Thesis/BIS_4D_Selected\\SOM_per_2023_d_5_15_QRF_pred_mean.tif']

#### Pre-process the datasets

In [6]:
# Shorten the WGS 84 coordinate column names
coordinate_names = {
    "EPSG_4326_WGS_84_Longitude_X": "Longitude",
    "EPSG_4326_WGS_84_Latitude_Y": "Latitude",
}
point_data = point_data.rename(columns=coordinate_names)

# Build point geometries from longitude/latitude (EPSG:4326), then
# reproject to EPSG:28992 so the points match the raster CRS
site_points = gpd.points_from_xy(point_data["Longitude"], point_data["Latitude"])
gdf = gpd.GeoDataFrame(
    point_data,
    geometry=site_points,
    crs="EPSG:4326",
).to_crs("EPSG:28992")

# Keep the reprojected coordinates as plain numeric columns
gdf["Reproj_X"] = gdf.geometry.x
gdf["Reproj_Y"] = gdf.geometry.y

gdf.info()
#gdf.head(12)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   Site_no                21 non-null     int64   
 1   Location_No            21 non-null     int64   
 2   Site_ID                21 non-null     object  
 3   Longitude              21 non-null     float64 
 4   Latitude               21 non-null     float64 
 5   EPSG_32631_WGS 84_X_m  21 non-null     float64 
 6   EPSG_32631_WGS 84_Y_m  21 non-null     float64 
 7   Elevation_m            21 non-null     float64 
 8   geometry               21 non-null     geometry
 9   Reproj_X               21 non-null     float64 
 10  Reproj_Y               21 non-null     float64 
dtypes: float64(7), geometry(1), int64(2), object(1)
memory usage: 1.9+ KB


#### Loop through each raster and extract values

In [7]:
# Function to extract raster values at given coordinates
def extract_raster_values(raster_path, points_gdf):
    """Sample a raster at every point location.

    Parameters
    ----------
    raster_path : str
        Path of the raster file; opened with ``rasterio.open``.
    points_gdf : table-like
        Object exposing ``"Reproj_X"`` and ``"Reproj_Y"`` columns. The
        coordinates are passed to the raster unchanged, so they must already
        be expressed in the raster's CRS.

    Returns
    -------
    list
        One entry per point, in input order: the first element of the sample
        returned for that point (the first band), or ``None`` if the sample
        is empty.
    """
    coords = list(zip(points_gdf["Reproj_X"], points_gdf["Reproj_Y"]))
    with rasterio.open(raster_path) as src:
        # Check the sample's length, not its truthiness: a one-element array
        # holding 0 is falsy (a real value of 0 would be turned into None),
        # and a multi-band sample raises "truth value ... is ambiguous".
        # NOTE(review): cells equal to the raster's nodata value are returned
        # as-is; confirm whether they should be converted to missing values.
        values = [val[0] if len(val) > 0 else None for val in src.sample(coords)]
    return values

# Add one column per raster, named "<file name without extension>_values"
for tif_file in tif_files:
    # splitext strips only the final extension, whatever its letter case;
    # str.replace('.tif', '') would also remove a '.tif' occurring inside the
    # name and would leave an upper-case '.TIF' suffix in place.
    raster_name = os.path.splitext(os.path.basename(tif_file))[0]
    gdf[f"{raster_name}_values"] = extract_raster_values(tif_file, gdf)

In [8]:
# Map the long raster-derived column names to short variable names
rename_dict = {
    "BD_gcm3_d_0_5_QRF_pred_mean_values": "BD_0_5",
    "BD_gcm3_d_5_15_QRF_pred_mean_values": "BD_5_15",
    "clay_per_d_0_5_QRF_pred_mean_processed_values": "Clay_0_5",
    "clay_per_d_5_15_QRF_pred_mean_processed_values": "Clay_5_15",
    "SOM_per_2020_d_0_5_QRF_pred_mean_values": "SOM_2020_0_5",
    "SOM_per_2020_d_5_15_QRF_pred_mean_values": "SOM_2020_5_15",
    "SOM_per_2023_d_0_5_QRF_pred_mean_values": "SOM_2023_0_5",
    "SOM_per_2023_d_5_15_QRF_pred_mean_values": "SOM_2023_5_15",
    "sand_per_d_0_5_QRF_pred_mean_processed_values": "Sand_0_5",
    "sand_per_d_5_15_QRF_pred_mean_processed_values": "Sand_5_15",
    "silt_per_d_0_5_QRF_pred_mean_processed_values": "Silt_0_5",
    "silt_per_d_5_15_QRF_pred_mean_processed_values": "Silt_5_15"
}

gdf.rename(columns=rename_dict, inplace=True)   # Rename columns in the GeoDataFrame

# Remove the geometry column from the table. errors="ignore" keeps this cell
# re-runnable: without it, a second run raises KeyError because the column
# has already been dropped.
gdf.drop(columns=["geometry"], errors="ignore", inplace=True)

In [9]:
# Summarise the table (column names, non-null counts, dtypes) before export
gdf.info()
#gdf.head(12)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Site_no                21 non-null     int64  
 1   Location_No            21 non-null     int64  
 2   Site_ID                21 non-null     object 
 3   Longitude              21 non-null     float64
 4   Latitude               21 non-null     float64
 5   EPSG_32631_WGS 84_X_m  21 non-null     float64
 6   EPSG_32631_WGS 84_Y_m  21 non-null     float64
 7   Elevation_m            21 non-null     float64
 8   Reproj_X               21 non-null     float64
 9   Reproj_Y               21 non-null     float64
 10  BD_0_5                 21 non-null     float32
 11  BD_5_15                21 non-null     float32
 12  Clay_0_5               21 non-null     float32
 13  Clay_5_15              21 non-null     float32
 14  Sand_0_5               21 non-null     float32
 15  

#### Export the final dataframe

In [10]:
# Destination of the combined point/raster table
output_path = "C:/Data_MSc_Thesis/BIS_4D_Selected/NOBV_Point_Data_Extracted_V1.csv"

# index=False leaves the row index out of the CSV
gdf.to_csv(path_or_buf=output_path, index=False)

# Report where the file was written
print(f"Data has been successfully exported to '{output_path}'")

Data has been successfully exported to 'C:/Data_MSc_Thesis/BIS_4D_Selected/NOBV_Point_Data_Extracted_V1.csv'
