Script Description: Merge all the individual datasets into one pre-processed dataset

File Name: 01_09_Final_Merging.ipynb

Date: 2025

Created by: Rob Alamgir

Version: 1.0

References:

#### Import the relevant packages

In [1]:
import os
import glob
import pandas as pd

#### Import all the relevant datasets

In [2]:
# Root folder that holds every input dataset; all paths in this cell are built from it.
base_dir = "C:/Data_MSc_Thesis/"

# Get file paths of Remotely Sensed Data.
# glob.glob returns matches in arbitrary order, so every list is sorted to keep the
# concatenation order of the per-file frames reproducible between runs and machines.
Planet_SWC_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "Planet/NOBV_Planet_Inc_Data", "*Planet.csv")))
S1_SAR_VSM_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "S1_SAR_VSM/Final_S1_SAR_VSM_Data", "*VSM.csv")))
S1_Backscatter_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "S1_SAR_Backscatter", "*Scat.csv")))
S2_Indices_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "S2_Indices", "*.csv")))
L8_9_LST_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "L8_L9_LST", "*.csv")))
MODIS_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "MODIS_LAI", "*.csv")))

# Get file paths of Hybrid Sensed Data
OWASIS_CSV_Files = sorted(glob.glob(os.path.join(base_dir, "OWASIS/EC_Tower_OWASIS_Data", "*OWASIS.csv")))

# Single-file datasets: paths are derived from base_dir (instead of repeating the
# absolute root) so that changing base_dir relocates every input at once.
BIS_4D_Data = pd.read_csv(os.path.join(base_dir, "BIS_4D_Selected/NOBV_Point_Data_Extracted.csv"))    # Load preprocessed Dataset
BOFEK_Data = pd.read_csv(os.path.join(base_dir, "BOFEK/BOFEK_NOBV.csv"))                              # Load preprocessed Dataset

EC_Tower_Data = pd.read_csv(os.path.join(base_dir, "Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V5.csv")) # Load preprocessed Dataset

#### Perform dataset pre-processing for all the individual datasets prior to merging

In [3]:
def read_and_add_name(file):
    """Read one CSV file and tag every row with the name of the file it came from.

    Parameters
    ----------
    file : path to the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV with a 'Source' column holding the file name stripped of
        its directory and extension.
    """
    file_name = os.path.basename(file)
    stem, _extension = os.path.splitext(file_name)
    return pd.read_csv(file).assign(Source=stem)

# Load and process S1_SAR_VSM data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
S1_SAR_VSM_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in S1_SAR_VSM_CSV_Files],
    ignore_index=True,
)
S1_SAR_VSM_merged = (
    S1_SAR_VSM_merged
    .rename(columns={"system:time_start": "Date", "VSM": "S1_VSM", "Source": "Site_ID"})
    .drop(columns=["pixel_count"])
)
# Unparseable dates become NaT (errors='coerce') and are removed by the date filter below.
S1_SAR_VSM_merged['Date'] = pd.to_datetime(S1_SAR_VSM_merged['Date'], errors='coerce')
# NOTE(review): unlike the Backscatter and Planet steps, the time-of-day is not stripped
# here; confirm "system:time_start" holds plain dates, otherwise the later merge on
# 'Date' will not line up with the other datasets.
# .copy() makes the filtered frame independent, so the Site_ID assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
S1_SAR_VSM_merged = S1_SAR_VSM_merged[S1_SAR_VSM_merged["Date"] >= "2020-01-01"].copy()
# Strip the "S1_SAR_" prefix and "_VSM" suffix from the file stem.
S1_SAR_VSM_merged['Site_ID'] = S1_SAR_VSM_merged['Site_ID'].str.replace(r'^S1_SAR_|_VSM$', '', regex=True)

# Load and process S1_Backscatter data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
S1_Backscatter_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in S1_Backscatter_CSV_Files],
    ignore_index=True,
)
S1_Backscatter_merged = (
    S1_Backscatter_merged
    .rename(columns={"mean": "S1_Backscatter", "Source": "Site_ID"})
    .drop(columns=["count"])
)
# Unparseable dates become NaT (errors='coerce') and are removed by the date filter below.
S1_Backscatter_merged['Date'] = pd.to_datetime(S1_Backscatter_merged['Date'], errors='coerce')
S1_Backscatter_merged = S1_Backscatter_merged[S1_Backscatter_merged["Date"] >= "2020-01-01"]
# Exclude the WRW_OW file. .copy() makes the filtered frame independent, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
S1_Backscatter_merged = S1_Backscatter_merged[S1_Backscatter_merged["Site_ID"] != "S1_SAR_WRW_OW_Back_Scat"].copy()
# Drop the time-of-day so 'Date' holds calendar days only (used as a merge key later).
S1_Backscatter_merged['Date'] = pd.to_datetime(S1_Backscatter_merged['Date'].dt.date, errors='coerce')
# Strip the "S1_SAR_" prefix and "_Back_Scat" suffix from the file stem.
S1_Backscatter_merged['Site_ID'] = S1_Backscatter_merged['Site_ID'].str.replace(r'^S1_SAR_|_Back_Scat$', '', regex=True)

# Load and process Planet_SWC data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
Planet_SWC_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in Planet_SWC_CSV_Files],
    ignore_index=True,
)
Planet_SWC_merged = Planet_SWC_merged.rename(columns={"timestamp": "Date", "Source": "Site_ID"})
# Unparseable dates become NaT (errors='coerce') and are removed by the date filter below.
Planet_SWC_merged['Date'] = pd.to_datetime(Planet_SWC_merged['Date'], errors='coerce')
# Exclude the WRW_OW file.
Planet_SWC_merged = Planet_SWC_merged[Planet_SWC_merged["Site_ID"] != "WRW_OW_SWC_Planet"]
# .copy() makes the filtered frame independent, so the column assignments below do not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
Planet_SWC_merged = Planet_SWC_merged[Planet_SWC_merged["Date"] >= "2020-01-01"].copy()
# Drop the time-of-day so 'Date' holds calendar days only (used as a merge key later).
Planet_SWC_merged['Date'] = pd.to_datetime(Planet_SWC_merged['Date'].dt.date, errors='coerce')
# Strip the "_SWC_Planet" suffix (the former empty '^' alternative replaced nothing),
# then map the two "_RF" stems onto the site codes used by the other datasets.
Planet_SWC_merged['Site_ID'] = (
    Planet_SWC_merged['Site_ID']
    .str.replace(r'_SWC_Planet$', '', regex=True)
    .replace({"AMM_RF": "AMM", "AMR_RF": "AMR"})
)


# Load and process OWASIS data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
OWASIS_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in OWASIS_CSV_Files],
    ignore_index=True,
)
OWASIS_merged = (
    OWASIS_merged
    .rename(columns={"date": "Date", "Source": "Site_ID"})
    .drop(columns=["V1", "x", "y"], errors="ignore")  # dropped only if present
)
OWASIS_merged['Date'] = pd.to_datetime(OWASIS_merged['Date'], errors='coerce')
# Remove rows whose date could not be parsed (NaT after errors='coerce').
OWASIS_merged = OWASIS_merged.dropna(subset=["Date"])
# .copy() makes the filtered frame independent, so the Site_ID assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
OWASIS_merged = OWASIS_merged[OWASIS_merged["Date"] >= "2020-01-01"].copy()
# Strip the "_OWASIS" suffix (the former empty '^' alternative replaced nothing).
OWASIS_merged['Site_ID'] = OWASIS_merged['Site_ID'].str.replace(r'_OWASIS$', '', regex=True)

# Read and merge S2_Indices Data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
S2_Indices_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in S2_Indices_CSV_Files],
    ignore_index=True,
)
S2_Indices_merged = (
    S2_Indices_merged
    .rename(columns={'NDVI': 'S2_NDVI', 'EVI': 'S2_EVI', 'NDMI': 'S2_NDMI', 'Source': 'Site_ID'})
    .drop(columns=['MNDWI', 'STR'], errors='ignore')  # dropped only if present
)
S2_Indices_merged['Date'] = pd.to_datetime(S2_Indices_merged['Date'], errors='coerce')
# dropna() without a subset removes a row when ANY remaining column is missing,
# including dates that failed to parse.
S2_Indices_merged = S2_Indices_merged.dropna()
# .copy() makes the filtered frame independent, so the Site_ID assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
S2_Indices_merged = S2_Indices_merged[S2_Indices_merged["Date"] >= "2020-01-01"].copy()
# Strip the "S2_Indices_" prefix (the former empty '$' alternative replaced nothing).
S2_Indices_merged['Site_ID'] = S2_Indices_merged['Site_ID'].str.replace(r'^S2_Indices_', '', regex=True)

# Read and merge L8_9_LST Data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
L8_9_LST_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in L8_9_LST_CSV_Files],
    ignore_index=True,
)
L8_9_LST_merged = L8_9_LST_merged.rename(columns={'Mean_Surface_Temperature': 'L8_9_LST', 'Source': 'Site_ID'})
# Unparseable dates become NaT (errors='coerce') and are removed by the date filter below.
L8_9_LST_merged['Date'] = pd.to_datetime(L8_9_LST_merged['Date'], errors='coerce')
L8_9_LST_merged = L8_9_LST_merged.drop(columns=['system:index', '.geo'], errors='ignore')  # dropped only if present
# Keep only the first occurrence of any repeated column name.
L8_9_LST_merged = L8_9_LST_merged.loc[:, ~L8_9_LST_merged.columns.duplicated()]
# .copy() makes the filtered frame independent, so the Site_ID assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
L8_9_LST_merged = L8_9_LST_merged[L8_9_LST_merged["Date"] >= "2020-01-01"].copy()
# Strip the "L8_9_LST_" prefix (the former empty '$' alternative replaced nothing).
L8_9_LST_merged['Site_ID'] = L8_9_LST_merged['Site_ID'].str.replace(r'^L8_9_LST_', '', regex=True)

# Read and merge MODIS LAI Data.
# Every matched CSV is read and tagged with its file stem, which becomes Site_ID.
MODIS_LAI_merged = pd.concat(
    [read_and_add_name(csv_path) for csv_path in MODIS_CSV_Files],
    ignore_index=True,
)
MODIS_LAI_merged = MODIS_LAI_merged.rename(columns={'Mean_LAI': 'MODIS_LAI', 'Source': 'Site_ID'})
# Unparseable dates become NaT (errors='coerce') and are removed by the date filter below.
MODIS_LAI_merged['Date'] = pd.to_datetime(MODIS_LAI_merged['Date'], errors='coerce')
MODIS_LAI_merged = MODIS_LAI_merged.drop(columns=['system:index', '.geo', 'Unnamed: 0'], errors='ignore')  # dropped only if present
# Keep only the first occurrence of any repeated column name.
MODIS_LAI_merged = MODIS_LAI_merged.loc[:, ~MODIS_LAI_merged.columns.duplicated()]
# .copy() makes the filtered frame independent, so the Site_ID assignment below does
# not write into a slice of the unfiltered frame (SettingWithCopyWarning).
MODIS_LAI_merged = MODIS_LAI_merged[MODIS_LAI_merged["Date"] >= "2020-01-01"].copy()
# Strip the "MODIS_LAI_" prefix (the former empty '$' alternative replaced nothing).
MODIS_LAI_merged['Site_ID'] = MODIS_LAI_merged['Site_ID'].str.replace(r'^MODIS_LAI_', '', regex=True)

# Pre-process BIS_4D_Data: discard the columns listed below when they are present
# (errors='ignore' skips any that are absent).
BIS_4D_Data = BIS_4D_Data.drop(
    columns=[
        'Site_no',
        'Location_No',
        'Longitude',
        'Latitude',
        'EPSG_32631_WGS.84_X_m',
        'EPSG_32631_WGS.84_Y_m',
        'Reproj_X',
        'Reproj_Y',
    ],
    errors='ignore',
)
# Pre-process BOFEK_Data: same treatment for its two descriptive columns.
BOFEK_Data = BOFEK_Data.drop(
    columns=['Site_Name', 'BOFEK_2020_PU_Description'],
    errors='ignore',
)

# Pre-process EC Tower Data: parse 'Date' to datetime; unparseable values become NaT.
EC_Tower_Data = EC_Tower_Data.assign(
    Date=pd.to_datetime(EC_Tower_Data['Date'], errors='coerce')
)

#### Merge all the individual datasets into one dataframe

In [4]:
# Frames that are joined on the ('Date', 'Site_ID') key pair
time_dependent_dfs = [
    S1_SAR_VSM_merged, S1_Backscatter_merged, Planet_SWC_merged, OWASIS_merged,
    S2_Indices_merged, L8_9_LST_merged, MODIS_LAI_merged, EC_Tower_Data,
]

# Start from the first frame and outer-join the rest one by one, so a
# (Date, Site_ID) pair found in any of the frames is kept in the result.
merged_df, *frames_to_join = time_dependent_dfs
for frame in frames_to_join:
    merged_df = pd.merge(merged_df, frame, how='outer', on=['Date', 'Site_ID'])

# The two frames without a 'Date' column are attached per site; the left join
# keeps every row of merged_df.
for static_frame in (BIS_4D_Data, BOFEK_Data):
    merged_df = pd.merge(merged_df, static_frame, how='left', on='Site_ID')

#### Re-order the columns of the merged dataframe

In [5]:
# Put the key columns first, in the order Date, DOY, Site_ID, and keep every other
# column in its existing relative order.
# The order is built explicitly rather than by inserting at fixed positions 1 and 2,
# so the result no longer depends on 'Date' already being the first column.
leading_cols = ['Date', 'DOY', 'Site_ID']
cols = leading_cols + [col for col in merged_df.columns if col not in leading_cols]

# Reorder the dataframe (pandas raises KeyError if a leading column is missing)
merged_df = merged_df[cols]

In [6]:
# Check missing values: info() lists the non-null count and dtype of every column.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
merged_df.info()
#print(merged_df.head(30))
#print(merged_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 91 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         36275 non-null  datetime64[ns]
 1   DOY                          8873 non-null   float64       
 2   Site_ID                      36275 non-null  object        
 3   S1_VSM                       17022 non-null  float64       
 4   S1_Backscatter               17022 non-null  float64       
 5   Planet_SWC                   35314 non-null  float64       
 6   Available_soil_storage_mm    12031 non-null  float64       
 7   S2_NDVI                      4260 non-null   float64       
 8   S2_EVI                       4260 non-null   float64       
 9   S2_NDMI                      4260 non-null   float64       
 10  L8_9_LST                     1565 non-null   float64       
 11  MODIS_LAI                    9032 non-nul

#### Export the final dataframe to a CSV file

In [8]:
# Build the export path from base_dir, the same data root the input cell uses, so
# relocating the data folder only requires changing base_dir.
output_path = os.path.join(base_dir, "Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V6.csv")  # Update the path as needed
# index=False: the row index is not written to the CSV.
merged_df.to_csv(output_path, index=False)

print(f"DataFrame successfully saved to {output_path}")

DataFrame successfully saved to C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V6.csv


In [None]:
# Print summaries
#print(S1_SAR_VSM_merged.info())
#print(S1_Backscatter_merged.info())
#print(Planet_SWC_merged.info())
#print(OWASIS_merged.info())
#print(S2_Indices_merged.info())
#print(L8_9_LST_merged.info())
#print(MODIS_LAI_merged.info())
#print(BIS_4D_Data.info()) 
#print(BOFEK_Data.info())
#print(EC_Tower_Data.info())

# Print previews (first rows)
#print(S1_SAR_VSM_merged.head(10))
#print(S1_Backscatter_merged.head(10))
#print(Planet_SWC_merged.head(10))
#print(OWASIS_merged.head(10))
#print(S2_Indices_merged.head(10))
#print(L8_9_LST_merged.head(10))
#print(MODIS_LAI_merged.head(20))
#print(BIS_4D_Data.head(20)) 
#print(BOFEK_Data.head(20))
#print(EC_Tower_Data.head(10))