**Script Description:** This script merges all the individual CSV files of the NOBV EC Tower Locations into one dataframe, and exports the merged CSV file

**File Name:** 01_01_Filter_Merge_EC_Tower_Data.ipynb

**Date:** 2025

**Created by:** Rob Alamgir

#### Import the relevant packages

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#### Extract and Merge data from all the indivdual CSV files 

In [2]:
directory_path = 'C:/Data_MSc_Thesis/EC_Tower_Data/Friesland_EC_Tower_Data'   # Specify the directory path
files = os.listdir(directory_path)                                            # Get a list of all files and directories in the specified directory

files = [f for f in files if os.path.isfile(os.path.join(directory_path, f))] # Filter out directories and only list files
files_with_data = []                                                          # Initialize lists to store files with data and without data
files_without_data = []
data_list = []                                                                # Initialize an empty list to store the data

# Loop through each file
for file in files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path, low_memory=False)                            # Read the CSV file with all columns
    # Check if there is any data in the DataFrame
    if not df.empty:
        df['location'] = file                                                # Add the file name as a new column
        data_list.append(df)                                                 # Append the full DataFrame to the list
        files_with_data.append(file)
    else:
        files_without_data.append(file)                                      # If the file is empty, add it to files_without_data
# Combine all data into a single DataFrame if there's any data
if data_list:
    Complete_df = pd.concat(data_list, ignore_index=True)                       # Combine all the data into a single DataFrame
    print("Data successfully extracted!")
else:
    print("No data available to merge.")

print("\nFiles with data:")
print(files_with_data)
print("\nFiles without data:")
print(files_without_data)
print(Complete_df.info()) 

Data successfully extracted!

Files with data:
['ALB_MS.csv', 'ALB_RF.csv', 'AMM.csv', 'AMR.csv', 'BUO.csv', 'BUW.csv', 'HOC.csv', 'HOH.csv', 'LDC.csv', 'LDH.csv']

Files without data:
[]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425309 entries, 0 to 425308
Columns: 463 entries, datetime to WTMP_sloot
dtypes: float64(180), object(283)
memory usage: 1.5+ GB
None


#### Quickly check basic information about all the columns in the dataframe

In [3]:
# Check the number of rows and columns
print("\nNumber of Rows and Columns:")
print(f"Rows: {Complete_df.shape[0]}")
print(f"Columns: {Complete_df.shape[1]}")

# Iterate over each column in the DataFrame
for column in Complete_df.columns:
    column_name = column                       # Name of the column
    column_unit = Complete_df[column].iloc[0]  # Unit from the first row
    column_dtype = Complete_df[column].dtype   # Data type of the column  
    print(f"Column: {column_name}")
    print(f"  Unit: {column_unit}")
    print(f"  Data Type: {column_dtype}")
    print("-" * 35)


Number of Rows and Columns:
Rows: 425309
Columns: 463
Column: datetime
  Unit: yyyy-mm-dd HH:MM
  Data Type: object
-----------------------------------
Column: DOY
  Unit: [ddd.ddd]
  Data Type: object
-----------------------------------
Column: VPD_f
  Unit: [Pa]
  Data Type: object
-----------------------------------
Column: station
  Unit: nan
  Data Type: object
-----------------------------------
Column: filename
  Unit: nan
  Data Type: object
-----------------------------------
Column: date
  Unit: [yyyy-mm-dd]
  Data Type: object
-----------------------------------
Column: time
  Unit: [HH:MM]
  Data Type: object
-----------------------------------
Column: daytime
  Unit: [1=daytime]
  Data Type: object
-----------------------------------
Column: file_records
  Unit: [#]
  Data Type: object
-----------------------------------
Column: used_records
  Unit: [#]
  Data Type: object
-----------------------------------
Column: Tau
  Unit: [kg+1m-1s-2]
  Data Type: object
-----------

In [None]:
#Complete_df.head(20)
#final_df.tail(5)

#### Extract and Merge data from all the indivdual CSV files and only of specific defined columns

In [4]:
directory_path = 'C:/Data_MSc_Thesis/EC_Tower_Data/Friesland_EC_Tower_Data'  # Specify the directory path
files = os.listdir(directory_path)  # Get a list of all files and directories in the specified directory
files = [f for f in files if os.path.isfile(os.path.join(directory_path, f))]  # Filter out directories and only list files

files_with_data = []  # Initialize lists to store files with and without required columns
files_without_data = []
data_list = []  # Initialize an empty list to store the data

# Define the list of required columns
required_columns = [
    'datetime','DOY', 'daytime',
    'SWCT_1_005', 'SWCT_1_015', 'SWCT_1_025', 'SWCT_1_035', 'SWCT_1_045', 
    'SWCT_1_055', 'SWCT_1_065', 'SWCT_1_075', 'SWCT_1_085', 'SWCT_1_095', 
    'SWCT_1_105', 'SWCT_1_115',
    'STMP_1_005', 'STMP_1_015', 'STMP_1_025', 'STMP_1_035', 'STMP_1_045', 
    'STMP_1_055', 'STMP_1_065', 'STMP_1_075', 'STMP_1_085', 'STMP_1_095', 
    'STMP_1_105', 'STMP_1_115',
    'WLEV_f', 'WTMP_f', 'ATMP_f', 'PAIR_f', 'WIND_f', 'WINS_f', 'RHUM_f', 
    'RAIN_f', 'VPD_f', 'SWIN_f', 'ET', 'NEE_CO2', 'NEE_CH4'
]

# Loop through each file
for file in files:
    file_path = os.path.join(directory_path, file)
    df = pd.read_csv(file_path, low_memory=False)  # Read the CSV file

    # Check if all required columns are present
    if set(required_columns).issubset(df.columns):
        df_filtered = df[required_columns].copy()  # Keep only required columns
        df_filtered['Source'] = file  # Add the file name as a new column
        data_list.append(df_filtered)
        files_with_data.append(file)
    else:
        files_without_data.append(file)

# Merge the data if available
if data_list:
    final_df = pd.concat(data_list, ignore_index=True)  
    print("Data successfully extracted & merged.")
else:
    print("No data available to merge.")   

# Print summary of files
print("\nFiles with required columns:")
print(files_with_data)
print("\nFiles missing required columns:")
print(files_without_data)
print(final_df.info()) 

Data successfully extracted & merged.

Files with required columns:
['ALB_MS.csv', 'ALB_RF.csv', 'AMM.csv', 'AMR.csv', 'BUO.csv', 'BUW.csv', 'HOC.csv', 'HOH.csv', 'LDC.csv', 'LDH.csv']

Files missing required columns:
[]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425309 entries, 0 to 425308
Data columns (total 41 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   datetime    425309 non-null  object 
 1   DOY         425201 non-null  object 
 2   daytime     141705 non-null  object 
 3   SWCT_1_005  390173 non-null  float64
 4   SWCT_1_015  405103 non-null  object 
 5   SWCT_1_025  405251 non-null  object 
 6   SWCT_1_035  405140 non-null  float64
 7   SWCT_1_045  405448 non-null  object 
 8   SWCT_1_055  405289 non-null  float64
 9   SWCT_1_065  405443 non-null  object 
 10  SWCT_1_075  405565 non-null  float64
 11  SWCT_1_085  405564 non-null  float64
 12  SWCT_1_095  391620 non-null  float64
 13  SWCT_1_105  405528 non-null  floa

In [None]:
#final_df.head(15)
#final_df.tail(15)

#### Dataframe pre-processing

In [5]:
final_df = final_df.iloc[1:].reset_index(drop=True)                          # Drop the first row (unit row) and reset index
final_df['datetime'] = pd.to_datetime(final_df['datetime'], errors='coerce') # Convert 'datetime' to proper datetime format
exclude_cols = ['Source', 'datetime', 'DOY', 'daytime']                      # Exclude 'Source', 'datetime', 'DOY', and 'daytime' from conversion
numeric_cols = [col for col in final_df.columns if col not in exclude_cols]

# Convert remaining object columns to float
for col in numeric_cols:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

# Move 'Source' column to be after 'daytime'
col_order = ['datetime', 'DOY', 'daytime', 'Source'] + [col for col in final_df.columns if col not in ['datetime', 'DOY', 'daytime', 'Source']]
final_df = final_df[col_order]

final_df['Source'] = final_df['Source'].str.replace('.csv', '', regex=False) # Remove '.csv' from 'Source' column
print(final_df.info())                                                       # Display final DataFrame info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425308 entries, 0 to 425307
Data columns (total 41 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   datetime    425299 non-null  datetime64[ns]
 1   DOY         425200 non-null  object        
 2   daytime     141704 non-null  object        
 3   Source      425308 non-null  object        
 4   SWCT_1_005  390173 non-null  float64       
 5   SWCT_1_015  405102 non-null  float64       
 6   SWCT_1_025  405250 non-null  float64       
 7   SWCT_1_035  405140 non-null  float64       
 8   SWCT_1_045  405447 non-null  float64       
 9   SWCT_1_055  405289 non-null  float64       
 10  SWCT_1_065  405442 non-null  float64       
 11  SWCT_1_075  405565 non-null  float64       
 12  SWCT_1_085  405564 non-null  float64       
 13  SWCT_1_095  391620 non-null  float64       
 14  SWCT_1_105  405528 non-null  float64       
 15  SWCT_1_115  405570 non-null  float64       
 16  ST

In [6]:
final_df.head(15)
#final_df.tail(5)

Unnamed: 0,datetime,DOY,daytime,Source,SWCT_1_005,SWCT_1_015,SWCT_1_025,SWCT_1_035,SWCT_1_045,SWCT_1_055,...,PAIR_f,WIND_f,WINS_f,RHUM_f,RAIN_f,VPD_f,SWIN_f,ET,NEE_CO2,NEE_CH4
0,2022-01-05 12:00:00,5,1.0,ALB_MS,55.65,51.47,50.47,51.53,52.48,52.53,...,1000.72,302.229,6.47057,92.4,0.1,171.216,141.710667,0.015105,,
1,2022-01-05 12:30:00,5,1.0,ALB_MS,55.66,51.47,50.47,51.54,52.48,52.53,...,1001.29663,310.186,6.57779,93.4,0.0,213.7,212.652333,-0.011354,,
2,2022-01-05 13:00:00,5,1.0,ALB_MS,55.65,51.48,50.46,51.53,52.49,52.53,...,1001.80994,314.715,6.85965,90.7,0.0,142.449,88.554333,-0.010789,,
3,2022-01-05 13:30:00,5,1.0,ALB_MS,55.63,51.48,50.47,51.54,52.49,52.53,...,1002.37,310.469,5.61747,92.8,0.0,160.811,56.626667,-0.010218,-9.385266,
4,2022-01-05 14:00:00,5,1.0,ALB_MS,55.59,51.48,50.46,51.55,52.49,52.53,...,1003.0233,310.229,5.74322,92.6,0.0,144.582,50.11,0.010835,-3.781026,
5,2022-01-05 14:30:00,5,1.0,ALB_MS,55.55,51.48,50.46,51.55,52.5,52.53,...,1003.6433,306.902,5.15789,94.0,0.2,150.085,41.320333,0.006891,,
6,2022-01-05 15:00:00,5,1.0,ALB_MS,55.54,51.48,50.46,51.58,52.49,52.54,...,1004.37,310.364,4.09531,92.0,0.0,130.157,0.951333,-0.001661,,
7,2022-01-05 15:30:00,5,0.0,ALB_MS,55.54,51.48,50.45,51.57,52.51,52.54,...,1005.0833,308.95,4.17604,94.8,0.0,137.67,48.624,-0.000203,-2.284103,
8,2022-01-05 16:00:00,5,0.0,ALB_MS,55.54,51.48,50.45,51.59,52.51,52.54,...,1005.66327,308.046,4.90247,93.8,0.0,159.624,44.213333,0.000531,,
9,2022-01-05 16:30:00,5,0.0,ALB_MS,55.54,51.47,50.45,51.6,52.52,52.54,...,1006.24,311.409,5.24477,92.5,0.0,163.405,1.226,0.001754,,


#### Export the filtered dataset

In [7]:
# Export the final dataframe to a CSV file
output_path = "C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V1.csv"  # Update the path as needed
final_df.to_csv(output_path, index=False)

print(f"DataFrame successfully saved to {output_path}")

DataFrame successfully saved to C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V1.csv
