**Script Description:** This script loads a pre-processed dataset, and computes the Potential Evapotranspiration and Reference Evapotranspiration. 

**File Name:** 01_05_Compute_Evapotranspiration.ipynb

**Date:** 2025

**Created by:** Rob Alamgir  

#### Import relevant packages

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyet
from pyet import penman
from pyet import abtew

# Use help() to view the available arguments
#help(pyet.penman)
#help(pyet.abtew)

#### Step 1: Load datasets and pre-process data

In [2]:
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V4.csv"
complete_dataset = pd.read_csv(data_path, low_memory=False)

complete_dataset = complete_dataset.rename(columns={'date': 'Date'})
complete_dataset['Date'] = pd.to_datetime(complete_dataset['Date']) # Convert 'date' column to datetime format
#print(complete_dataset.info()) 

# Load NOBV Site data
NOBV_Site_Data = pd.read_csv("C:/Data_MSc_Thesis/NOBV_Site_Data/NOBV_EC_Tower_Data_Final.csv")

# Merge the two dataframes based on 'Source' and 'Site_ID'
merged_df = pd.merge(complete_dataset, 
                     NOBV_Site_Data[['Site_ID', 'EPSG_4326_WGS_84_Longitude_X', 'EPSG_4326_WGS_84_Latitude_Y','Elevation_m']],
                     left_on='Source', right_on='Site_ID', how='left')

# Check if the 'Source' and 'Site_ID' columns match in the merged dataframe
merged_df['Source_matches_Site_ID'] = merged_df['Source'] == merged_df['Site_ID']
#print(merged_df[merged_df['Source_matches_Site_ID'] == False])

# Remove unnecessary columns for analysis
filtered_df_V1  = merged_df.drop(columns=[ 'Source','Source_matches_Site_ID'])

# Preprocess the input data
filtered_df_V2  = pd.DataFrame({"Date":filtered_df_V1.Date,
                                "Site_ID":filtered_df_V1.Site_ID,
                                "T":filtered_df_V1.ATMP_f,                  # temperature in [C°]
                                "RH":filtered_df_V1.RHUM_f,                 # relative_humidity [%°]
                                "R":filtered_df_V1.SWIN_f*0.0864,           # convert solar radiation from [(W/m^2)day^1] to [MJ m-2 d-1]
                                "u2":filtered_df_V1.WINS_f,                 # windspeed [m/s]
                                "PAIR":filtered_df_V1.PAIR_f*0.1,           # convert air pressure in [hPa] to [kPa]
                                "latitude":filtered_df_V1.EPSG_4326_WGS_84_Latitude_Y,
                                "Elevation":filtered_df_V1.Elevation_m})    # Elevation in [m] # Latitude of the meteorological station

# Remove rows with NaN values
filtered_df_V3 = filtered_df_V2.dropna()
#print(filtered_df_V3.info()) 

#### Step 2: Calculate Potential Evapotranspiration (PET) using Makkink method 

In [3]:
# Calculate Potential Evapotranspiration (PET) using Makkink method for all rows at once (vectorized)
filtered_df_V3.loc[:, 'PET'] = pyet.makkink(
    tmean=filtered_df_V3['T'], 
    rs=filtered_df_V3['R'], 
    pressure=filtered_df_V3['PAIR'],
    elevation=filtered_df_V3['Elevation'],
    clip_zero=False)  # Avoid clipping of negative values

#print(filtered_df_V3.head())
#print(filtered_df_V3.info()) 
#print(filtered_df_V3.describe())

#### Step 3: Calculate reference evapotranspiration (ETo) using abtew method

In [4]:
# Calculate ET0 using Abtew method for all rows at once (vectorized) and add it directly to the original DataFrame
filtered_df_V3.loc[:, 'ET0'] = abtew(
    tmean=filtered_df_V3['T'],  # Average daily temperature in °C
    rs=filtered_df_V3['R'],     # Solar radiation in MJ m²/day
    clip_zero=True)             # Option to clip negative values (optional)

#print(filtered_df_V3.info()) 
#print(filtered_df_V3.describe())

In [5]:
print(complete_dataset.info())
print(filtered_df_V3.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8870 entries, 0 to 8869
Data columns (total 74 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         8870 non-null   datetime64[ns]
 1   year_month                   8870 non-null   object        
 2   year_week                    8870 non-null   object        
 3   DOY                          8870 non-null   float64       
 4   Source                       8870 non-null   object        
 5   SWCT_1_005                   8173 non-null   float64       
 6   SWCT_1_015                   8365 non-null   float64       
 7   SWCT_1_025                   8369 non-null   float64       
 8   SWCT_1_035                   8370 non-null   float64       
 9   SWCT_1_045                   8374 non-null   float64       
 10  SWCT_1_055                   8372 non-null   float64       
 11  SWCT_1_065                   8374 non-null 

In [6]:
# Rename 'Source' to 'Site_ID' in complete_dataset
complete_dataset = complete_dataset.rename(columns={'Source': 'Site_ID'})

# Merge with filtered_df_V3 based on 'Date' and 'Site_ID'
complete_dataset = complete_dataset.merge(
    filtered_df_V3[['Date', 'Site_ID', 'PET', 'ET0']], 
    on=['Date', 'Site_ID'], 
    how='left')

# Reorder the columns by popping PET and ET0 and inserting them after ET
cols = list(complete_dataset.columns)
cols.remove('PET')
cols.remove('ET0')

# Find the index of 'ET' and insert 'PET' and 'ET0' after it
et_index = cols.index('ET') + 1
cols[et_index:et_index] = ['PET', 'ET0']

# Reassign the dataframe with the new column order
complete_dataset = complete_dataset[cols]

In [7]:
complete_dataset["P_minus_ET"] = complete_dataset["RAIN_f"] - complete_dataset["ET"]
complete_dataset["P_minus_PET"] = complete_dataset["RAIN_f"] - complete_dataset["PET"]

cols = list(complete_dataset.columns)  # Get the list of columns
idx_et0 = cols.index("ET0") + 1        # Position after ET0
cols = cols[:idx_et0] + ["P_minus_ET", "P_minus_PET"] + cols[idx_et0:]
complete_dataset = complete_dataset[cols]  # Reorder the dataframe

In [8]:
print(complete_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8870 entries, 0 to 8869
Data columns (total 80 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         8870 non-null   datetime64[ns]
 1   year_month                   8870 non-null   object        
 2   year_week                    8870 non-null   object        
 3   DOY                          8870 non-null   float64       
 4   Site_ID                      8870 non-null   object        
 5   SWCT_1_005                   8173 non-null   float64       
 6   SWCT_1_015                   8365 non-null   float64       
 7   SWCT_1_025                   8369 non-null   float64       
 8   SWCT_1_035                   8370 non-null   float64       
 9   SWCT_1_045                   8374 non-null   float64       
 10  SWCT_1_055                   8372 non-null   float64       
 11  SWCT_1_065                   8374 non-null 

### Step 4: Export the final dataframe to a CSV file

In [9]:
output_path = "C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V5.csv"  # Update the path as needed
complete_dataset.to_csv(output_path, index=False)

print(f"DataFrame successfully saved to {output_path}")

DataFrame successfully saved to C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V5.csv
