Script Description:
Perform temporal aggregation (daily averages) of the merged dataframe, and add the monthly and weekly rainfall totals per location.

File Name: 01_04_Temporal_Aggregation_EC_Tower Data.ipynb

Date: 2025

Created by: Rob Alamgir

Version: 1.0

References:

#### Import the relevant packages

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

#### Load and preprocess data

In [2]:
# Input: the pre-processed table covering all locations (version 3).
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V3.csv"

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns.
complete_dataset = pd.read_csv(data_path, low_memory=False)

# errors='coerce' turns unparseable timestamps into NaT and unparseable
# day-of-year values into NaN instead of raising.
complete_dataset['datetime'] = pd.to_datetime(complete_dataset['datetime'], errors='coerce')
complete_dataset['DOY'] = pd.to_numeric(complete_dataset['DOY'], errors='coerce')

# 'daytime' is not used in the rest of the notebook; drop() raises a KeyError
# if the column is missing, which doubles as a check on the input schema.
complete_dataset = complete_dataset.drop(columns=['daytime'])

# Calendar day of each record as a midnight timestamp (NaT stays NaT).
# .dt.normalize() yields the same datetime64 values as going through
# .dt.date and back through pd.to_datetime, without building an intermediate
# object column of Python date instances.
complete_dataset['date'] = complete_dataset['datetime'].dt.normalize()

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
complete_dataset.info()
#complete_dataset.head(15)
#complete_dataset.tail(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425308 entries, 0 to 425307
Data columns (total 71 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   datetime                     425299 non-null  datetime64[ns]
 1   date                         425299 non-null  datetime64[ns]
 2   DOY                          425191 non-null  float64       
 3   Source                       425308 non-null  object        
 4   SWCT_1_005                   390173 non-null  float64       
 5   SWCT_1_015                   405102 non-null  float64       
 6   SWCT_1_025                   405250 non-null  float64       
 7   SWCT_1_035                   405140 non-null  float64       
 8   SWCT_1_045                   405447 non-null  float64       
 9   SWCT_1_055                   405289 non-null  float64       
 10  SWCT_1_065                   405442 non-null  float64       
 11  SWCT_1_075                

#### Compute the Monthly and Weekly Total Rainfall 

In [4]:
# Period totals above this value are treated as implausible and set to NaN.
# NOTE(review): the unit of RAIN_f is not visible in this notebook
# (presumably mm) - confirm that 1000 is the intended limit for BOTH the
# monthly and the weekly totals.
MAX_PLAUSIBLE_RAIN_SUM = 1000

# Ensure 'date' column is in datetime format
complete_dataset['date'] = pd.to_datetime(complete_dataset['date'])

# String labels for the calendar month and the calendar week of every record.
# Rows whose date is missing (NaT) end up with the literal label 'NaT'.
complete_dataset['year_month'] = complete_dataset['date'].dt.to_period('M').astype(str)
complete_dataset['year_week'] = complete_dataset['date'].dt.to_period('W').astype(str)

# Monthly rainfall total per location.
# min_count=1 makes a period without a single valid RAIN_f value come out as
# NaN; the default (min_count=0) would report it as 0, i.e. as "no rain".
monthly_rain_sum = (
    complete_dataset
    .groupby(['Source', 'year_month'])['RAIN_f']
    .sum(min_count=1)
    .rename('RAIN_f_monthly_sum')
    .reset_index()
)

# Weekly rainfall total per location (same rules as the monthly total).
weekly_rain_sum = (
    complete_dataset
    .groupby(['Source', 'year_week'])['RAIN_f']
    .sum(min_count=1)
    .rename('RAIN_f_weekly_sum')
    .reset_index()
)

# Blank out implausibly large totals in both the monthly and the weekly table.
# Series.mask() replaces the flagged entries with NaN and keeps the float dtype.
monthly_rain_sum['RAIN_f_monthly_sum'] = monthly_rain_sum['RAIN_f_monthly_sum'].mask(
    monthly_rain_sum['RAIN_f_monthly_sum'] > MAX_PLAUSIBLE_RAIN_SUM
)
weekly_rain_sum['RAIN_f_weekly_sum'] = weekly_rain_sum['RAIN_f_weekly_sum'].mask(
    weekly_rain_sum['RAIN_f_weekly_sum'] > MAX_PLAUSIBLE_RAIN_SUM
)

# Attach the totals to every record. Any copy left over from a previous run of
# this cell is dropped first, so re-running neither creates suffixed duplicate
# columns (_x / _y) nor keeps stale totals.
complete_dataset = (
    complete_dataset
    .drop(columns=['RAIN_f_monthly_sum'], errors='ignore')
    .merge(monthly_rain_sum, on=['Source', 'year_month'], how='left')
)
complete_dataset = (
    complete_dataset
    .drop(columns=['RAIN_f_weekly_sum'], errors='ignore')
    .merge(weekly_rain_sum, on=['Source', 'year_week'], how='left')
)

#### Perform temporal aggregation 

In [5]:
# Columns that are NOT averaged: the two grouping keys plus the labels and
# rainfall totals that are carried over unchanged.
columns_to_keep = ['Source', 'date', 'DOY', 'year_month', 'year_week', 'RAIN_f_monthly_sum', 'RAIN_f_weekly_sum']

# Every other column (this includes 'datetime') is reduced to its daily mean.
mean_columns = [name for name in complete_dataset.columns if name not in columns_to_keep]

# Carried-over columns take the first (non-null) value found within the day.
# The monthly/weekly totals repeat on every record of their period, so taking
# the first one leaves them unchanged.
carry_over_columns = ['DOY', 'year_month', 'year_week', 'RAIN_f_monthly_sum', 'RAIN_f_weekly_sum']

# Key order matters: it determines the column order of the aggregated frame.
agg_dict = {
    **dict.fromkeys(mean_columns, 'mean'),
    **dict.fromkeys(carry_over_columns, 'first'),
}

# One row per location and calendar day; the grouping keys stay as columns.
grouped_by_day = complete_dataset.groupby(['Source', 'date'], as_index=False)
daily_avg_df = grouped_by_day.agg(agg_dict)
daily_avg_df = daily_avg_df.reset_index(drop=True)

#### Reorder the columns in the dataframe

In [6]:
# Drop the 'datetime' column if it exists
daily_avg_df = daily_avg_df.drop(columns=['datetime'], errors='ignore')

# Identifier columns that should lead the table, in this order; names that are
# not present in the frame are skipped.
first_columns = [
    col for col in ['date', 'year_month', 'year_week', 'DOY', 'Source']
    if col in daily_avg_df.columns
]

# Rainfall totals that should sit directly behind 'RAIN_f'. Only the ones that
# actually exist are moved, so a missing total cannot be inserted into the
# column order and break the final selection with a KeyError.
rain_sum_columns = [
    col for col in ['RAIN_f_monthly_sum', 'RAIN_f_weekly_sum']
    if col in daily_avg_df.columns
]

# Current column order without the rainfall totals.
columns = [col for col in daily_avg_df.columns if col not in rain_sum_columns]

if 'RAIN_f' not in columns:
    raise KeyError("'RAIN_f' column not found in daily_avg_df")

# Look up 'RAIN_f' AFTER the totals were removed: a position taken from the
# unmodified frame would be off if a total preceded 'RAIN_f'.
rain_f_index = columns.index('RAIN_f')

# Insert the totals right after 'RAIN_f' (slice assignment inserts in place).
columns[rain_f_index + 1:rain_f_index + 1] = rain_sum_columns

# Ensure first columns are placed at the beginning
remaining_columns = [col for col in columns if col not in first_columns]
final_order = first_columns + remaining_columns

# Reorder the DataFrame
daily_avg_df = daily_avg_df[final_order]

In [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
daily_avg_df.info()
#daily_avg_df.head(15)
#daily_avg_df.tail(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8870 entries, 0 to 8869
Data columns (total 74 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   date                         8870 non-null   datetime64[ns]
 1   year_month                   8870 non-null   object        
 2   year_week                    8870 non-null   object        
 3   DOY                          8870 non-null   float64       
 4   Source                       8870 non-null   object        
 5   SWCT_1_005                   8240 non-null   float64       
 6   SWCT_1_015                   8529 non-null   float64       
 7   SWCT_1_025                   8530 non-null   float64       
 8   SWCT_1_035                   8527 non-null   float64       
 9   SWCT_1_045                   8531 non-null   float64       
 10  SWCT_1_055                   8529 non-null   float64       
 11  SWCT_1_065                   8531 non-null 

#### Export the final dataframe to a CSV file

In [8]:
# Destination of the daily-aggregated table (version 4 of the dataset).
output_path = "C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V4.csv"  # Update the path as needed

# index=False leaves the positional row index out of the file.
daily_avg_df.to_csv(path_or_buf=output_path, index=False)

# Confirmation message; it is only reached when to_csv did not raise.
print(f"DataFrame successfully saved to {output_path}")

DataFrame successfully saved to C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V4.csv
