Arborcheck Data Reading And Cleaning

In [9]:
import csv
import re
from pathlib import Path
from datetime import datetime

# Matches a whole field that is a single numeric token: an optional sign, a
# decimal number ("12", "12.", "12.5" or ".5") and an optional exponent.
# The number may be followed by "/<digits>"; that suffix is accepted but is not
# part of capture group 1, which holds only the number itself.
NUM_TOKEN = re.compile(r"^([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)(?:/\d+)?$")

def extract_date_from_name(path: Path) -> str:
    """Derive the measurement date from a result file's name.

    The file stem is split on "-"; the third, fourth and fifth pieces are read
    as day, month and two-digit year. Anything after the first "_" in the year
    piece is discarded.

    Args:
        path: Path whose stem contains at least five "-"-separated pieces.

    Returns:
        The date formatted as "DD/MM/YYYY".

    Raises:
        ValueError: If the stem has fewer than five pieces, or if the
            day/month/year pieces do not form a valid date.
    """
    tokens = path.stem.split('-')
    if len(tokens) < 5:
        raise ValueError(f"Unexpected filename format: {path.name}")

    day, month, year_token = tokens[2:5]
    # The year piece may carry a "_suffix"; keep only what precedes it.
    year = year_token.partition('_')[0]

    parsed = datetime.strptime("-".join((day, month, year)), "%d-%m-%y")
    return parsed.strftime("%d/%m/%Y")

def parse_metadata(line: str) -> dict:
    """Split a comma-separated metadata line into named fields.

    Each field is stripped of surrounding whitespace and then of surrounding
    double quotes. The first eight fields are assigned, in order, to the keys
    listed below; missing fields become empty strings and any further fields
    are ignored.

    Args:
        line: One raw metadata line.

    Returns:
        A dict with the keys TreeID, MD_Day, MD_Month, MD_Year, MD_Hour,
        MD_Min, Genus and Species (in that order), all string-valued.
    """
    field_names = (
        "TreeID",
        "MD_Day",
        "MD_Month",
        "MD_Year",
        "MD_Hour",
        "MD_Min",
        "Genus",
        "Species",
    )
    fields = [piece.strip().strip('"') for piece in line.split(",")]
    # Pad short lines so every key receives a value; zip drops any surplus.
    fields += [""] * (len(field_names) - len(fields))
    return dict(zip(field_names, fields))

def parse_res_file(res_path: Path):
    """Read one result file and pull out its numeric values and metadata.

    Blank lines are dropped and the remaining lines are stripped. The first
    and last of those lines are skipped. Among the lines in between:

    * a line with exactly two comma-separated fields that both match
      NUM_TOKEN contributes the number captured from its first field
      (kept as a string);
    * the first line with at least eight comma-separated fields is parsed
      with parse_metadata; later such lines are ignored.

    Args:
        res_path: Path to the result file; its name must be accepted by
            extract_date_from_name.

    Returns:
        A tuple (values_only, meta, date_str): the list of captured value
        strings in file order, the metadata dict (empty if no metadata line
        was found) and the date derived from the file name.
    """
    date_str = extract_date_from_name(res_path)

    with res_path.open("r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        lines = [text for text in stripped if text]

    values_only = []
    meta = {}
    for text in lines[1:-1]:
        fields = [field.strip() for field in text.split(",")]
        if len(fields) == 2:
            first, second = (NUM_TOKEN.match(field) for field in fields)
            if first is not None and second is not None:
                values_only.append(first.group(1))
                continue
        # Only the first metadata-shaped line is used.
        if not meta and len(fields) >= 8:
            meta = parse_metadata(text)

    return values_only, meta, date_str

# Column names for extracted values, keyed by the value's 1-based position.
# build_headers_values falls back to "Val<position>" for positions not listed.
VALUE_NAME_MAP = {
    1: "Efficiency",
    2: "Chlorophyll",
    3: "Si1",
    4: "Si2",
    5: "Si3",
    6: "Si4",
}

def build_headers_values(n_values: int):
    """Build the CSV header row for a table with ``n_values`` value columns.

    Args:
        n_values: Number of value columns to name.

    Returns:
        A list starting with "Date", "BaumID", "BaumArt", followed by one name
        per value position (1..n_values) taken from VALUE_NAME_MAP, or
        "Val<position>" when the position has no entry there.
    """
    value_columns = [
        VALUE_NAME_MAP.get(position, f"Val{position}")
        for position in range(1, n_values + 1)
    ]
    return ["Date", "BaumID", "BaumArt"] + value_columns

def row_from(values_only, date_str, meta, max_values):
    """Assemble one output row: date, tree id, tree species, then the values.

    Args:
        values_only: List of value strings for this row.
        date_str: Date string placed in the first column.
        meta: Metadata dict; "TreeID", "Genus" and "Species" are read and
            default to "" when absent.
        max_values: Number of value columns in the table; shorter value lists
            are right-padded with empty strings up to this width.

    Returns:
        A new list ``[date, tree id, "Genus Species", *values, *padding]``.
    """
    padding = [""] * (max_values - len(values_only))
    tree_id = meta.get("TreeID", "")
    genus = meta.get("Genus", "").strip()
    species = meta.get("Species", "").strip()
    # The outer strip removes the joining space when either part is empty.
    tree_species = " ".join((genus, species)).strip()
    leading = [date_str, tree_id, tree_species]
    return leading + values_only + padding

def convert_one(res_file: str, out_csv: str):
    """Convert a single result file into a two-line CSV (header + one row).

    Args:
        res_file: Path of the result file to parse.
        out_csv: Path of the CSV file to write (overwritten if it exists).
    """
    values_only, meta, date_str = parse_res_file(Path(res_file))
    n_values = len(values_only)
    table = [
        build_headers_values(n_values),
        row_from(values_only, date_str, meta, n_values),
    ]
    with open(out_csv, "w", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file).writerows(table)
    print(f"Saved 1 row to {out_csv}")

def convert_folder(res_folder: str, out_csv: str):
    """Convert every ``*.res`` file directly inside a folder into one CSV.

    Files are processed in sorted path order. The table is as wide as the file
    with the most values; rows with fewer values are padded with empty cells.

    Args:
        res_folder: Folder searched (non-recursively) for ``*.res`` files.
        out_csv: Path of the CSV file to write (overwritten if it exists).
    """
    parsed = [parse_res_file(res_path) for res_path in sorted(Path(res_folder).glob("*.res"))]

    # Width of the value section; 0 when the folder holds no result files.
    max_vals = max((len(values) for values, _meta, _date in parsed), default=0)

    header = build_headers_values(max_vals)
    out_rows = [
        row_from(values, date_str, meta, max_vals)
        for values, meta, date_str in parsed
    ]

    with open(out_csv, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(out_rows)
    print(f"Saved {len(out_rows)} rows to {out_csv}")


# Combine every *.res file found directly in /content into a single CSV.
convert_folder("/content", "/content/sample_data/all_res_combined.csv")


Saved 64 rows to /content/sample_data/all_res_combined.csv


Arborcheck Data Transformation

In [11]:
import pandas as pd
import numpy as np


# input_csv: the combined table written by the conversion step above.
# output_csv: where the same table is saved with the category columns added.
input_csv = "/content/sample_data/all_res_combined.csv"   # e.g. "drive/MyDrive/thesis_data.csv"
output_csv = "/content/sample_data/Arborcheck_with_categories.csv"

# -----------------------------
# Vitality (Efficiency & Chlorophyll)
def vitality_category(efficiency: float, chlorophyll: float) -> str:
    """Classify tree vitality from the efficiency and chlorophyll scores.

    Efficiency sets the base category:

    * below -4.0  -> "Critical reduction"
    * below -3.0  -> "Significant reduction"
    * below -2.0  -> "Slight reduction"
    * otherwise   -> "Good"

    A "Good" result is downgraded to "Slight reduction" when chlorophyll is
    present and below -2.5.

    Args:
        efficiency: Efficiency score; may be missing (NaN).
        chlorophyll: Chlorophyll score; may be missing (NaN).

    Returns:
        The category label, or ``None`` when efficiency is missing.
    """
    if pd.isna(efficiency):
        return None

    # Ordered from most to least severe; the first bound that holds wins.
    efficiency_bands = (
        (-4.0, "Critical reduction"),
        (-3.0, "Significant reduction"),
        (-2.0, "Slight reduction"),
    )
    label = "Good"
    for upper_bound, band_label in efficiency_bands:
        if efficiency < upper_bound:
            label = band_label
            break

    low_chlorophyll = not pd.isna(chlorophyll) and chlorophyll < -2.5
    if low_chlorophyll and label == "Good":
        label = "Slight reduction"

    return label


# -----------------------------
# Stress (Si1..Si4)
def _si_band(si: float, idx: int) -> int:
    if pd.isna(si):
        return 0
    if idx == 1:
        if si <= -2.5:
            return 3
        elif si <= -1.0:
            return 2
        elif si <= 0.0:
            return 1
        else:
            return 1
    else:
        m = abs(si)
        if m > 2.5:
            return 3
        elif m >= 1.0:
            return 2
        else:
            return 1

def stress_category(si1: float, si2: float, si3: float, si4: float) -> str:
    """Classify overall stress from the four stress indices Si1..Si4.

    Each index is graded with ``_si_band``; the worst band and the mean of the
    available (non-missing) indices decide the label:

    * worst band 3                 -> "Severe"
    * worst band 2, |mean| >= 1.5  -> "Moderate"
    * worst band 2, |mean| <  1.5  -> "Mild"
    * worst band 1, |mean| >= 0.5  -> "Mild"
    * worst band 1, |mean| <  0.5  -> "No Stress"

    Args:
        si1: Stress index 1; may be missing (NaN).
        si2: Stress index 2; may be missing (NaN).
        si3: Stress index 3; may be missing (NaN).
        si4: Stress index 4; may be missing (NaN).

    Returns:
        The stress label, or ``None`` when all four indices are missing, in
        line with ``vitality_category`` returning ``None`` for missing data.
    """
    readings = [si1, si2, si3, si4]

    # Without any measurement the stress level is unknown. Returning early also
    # keeps np.nanmean from warning about an all-NaN input, and stops such rows
    # from being reported as "No Stress".
    if all(pd.isna(value) for value in readings):
        return None

    worst = max(
        _si_band(value, position)
        for position, value in enumerate(readings, start=1)
    )
    # At least one reading is present here, so the NaN-ignoring mean is defined.
    mean_magnitude = abs(np.nanmean(readings))

    if worst == 3:
        return "Severe"
    if worst == 2:
        return "Moderate" if mean_magnitude >= 1.5 else "Mild"
    return "No Stress" if mean_magnitude < 0.5 else "Mild"


# -----------------------------
# Main workflow
# -----------------------------

# Reads input_csv, adds the two category columns, and writes output_csv.
# Both paths are assigned earlier in this cell, right after the imports.


df = pd.read_csv(input_csv)

# Normalize column name (in case "Genus " has a trailing space).
# This only matters for inputs that carry a "Genus " column; the combined CSV
# written by convert_folder has "BaumArt" instead, so the rename is a no-op there.
if "Genus " in df.columns and "Genus" not in df.columns:
    df = df.rename(columns={"Genus ": "Genus"})

# Add new categories, computed row by row.
# Requires the columns Efficiency, Chlorophyll and Si1..Si4 to be present.
df["VitalityCategory"] = [
    vitality_category(e, c) for e, c in zip(df["Efficiency"], df["Chlorophyll"])
]
df["StressCategory"] = [
    stress_category(a, b, c, d) for a, b, c, d in zip(df["Si1"], df["Si2"], df["Si3"], df["Si4"])
]

# Export (without the DataFrame index)
df.to_csv(output_csv, index=False)
print("Exported with categories ->", output_csv)

# Preview the first rows
df.head()


Exported with categories -> /content/sample_data/Arborcheck_with_categories.csv


Unnamed: 0,Date,BaumID,BaumArt,Efficiency,Chlorophyll,Si1,Si2,Si3,Si4,VitalityCategory,StressCategory
0,13/08/2025,444.0,Ilex ('Nellie Stevens'),0.0,0.0,0.0,-1.325158,-1.03713,-1.091668,Good,Mild
1,14/08/2025,444.0,Ilex aquifolium,0.0,0.0,0.0,-0.785857,-0.496203,-0.866566,Good,Mild
2,15/08/2025,444.0,Ilex aquifolium,0.0,0.0,0.0,-1.444538,-0.889301,-2.038358,Good,Mild
3,16/08/2025,444.0,Ilex aquifolium,0.0,0.0,0.0,-0.421391,-0.276731,0.832987,Good,No Stress
4,17/08/2025,444.0,Ilex aquifolium,0.0,0.0,0.0,-0.437503,-0.140664,-0.764343,Good,No Stress


Extract Planet and SAR data for the trees in Google Earth Engine

Data Merging with Planet and SAR data

In [None]:
# Input and output paths for the merge step.
# NOTE(review): the categorisation step above writes its result to
# /content/sample_data/Arborcheck_with_categories.csv, while csv_path_2 points
# at /content/ — confirm this is the intended file.
csv_path_1 = "/content/FraOst_Planet_Tree_Data.csv"  # Replace with the actual path to your first CSV
csv_path_2 = "/content/Arborcheck_with_categories.csv"  # Replace with the actual path to your second CSV
# Despite its name, this is the path of an Excel workbook (see to_excel below).
output_merged_csv = "Final_Data_For_Feature_Engineering.xlsx"

# Read the CSV files into pandas DataFrames
df1 = pd.read_csv(csv_path_1)
df2 = pd.read_csv(csv_path_2)

# Parse the Date columns so both frames hold datetimes before merging.
# Both files are expected to store dates as DD/MM/YYYY; parsing fails otherwise.
df1['Date'] = pd.to_datetime(df1['Date'], format='%d/%m/%Y') # Adjust format if needed
df2['Date'] = pd.to_datetime(df2['Date'], format='%d/%m/%Y') # Adjust format if needed

# Merge the dataframes on 'BaumID' and 'Date'
merged_df = pd.merge(df1, df2, on=['BaumID', 'Date'], how='inner') # 'inner' keeps only rows present in both files

# Export the merged dataframe to an Excel workbook (without the index)
merged_df.to_excel(output_merged_csv, index=False)

print("Merged data exported to:", output_merged_csv)

# Preview the merged dataframe
display(merged_df.head())

Merged data exported to: merged_data.xlsx


Unnamed: 0,Date,BaumID,FID,B1,B2,B3,B4,B5,B6,B7,...,Species,Efficiency,Chlorophyll,Si1,Si2,Si3,Si4,Interpretation Guide,VitalityCategory,StressCategory
0,2025-08-14,444,5,324.790179,315.330357,486.921131,559.465774,570.208333,470.489583,833.28125,...,aquifolium,0.0,0.0,0.0,-0.8,-0.5,-0.9,"Compared to the DBV, this tree shows good over...",Good,Mild
1,2025-08-15,444,5,297.495536,365.254464,505.120536,535.238095,560.581845,510.300595,856.178571,...,aquifolium,0.0,0.0,0.0,-1.4,-0.9,-2.0,"Compared to the DBV, this tree shows good over...",Good,Mild
2,2025-08-16,444,5,458.433036,473.422619,579.303571,587.599702,577.782738,564.464286,933.523809,...,aquifolium,0.0,0.0,0.0,-0.4,-0.3,0.8,"Compared to the DBV, this tree shows good over...",Good,No Stress
3,2025-08-17,444,5,286.166667,268.059524,454.342262,524.590774,490.357143,468.080357,848.732143,...,aquifolium,0.0,0.0,0.0,-0.4,-0.1,-0.8,"Compared to the DBV, this tree shows good over...",Good,No Stress
4,2025-08-18,444,5,245.675595,291.501488,434.13244,479.364583,415.897321,389.005952,800.571429,...,aquifolium,0.0,0.0,0.0,-0.4,-0.2,-0.8,"Compared to the DBV, this tree shows good over...",Good,No Stress
