In [47]:
%load_ext autoreload
%autoreload 2

import polars as pl

In [91]:
# We first load the data using Polars
with pl.StringCache():
    df_growth = pl.read_parquet("./data/raw/icpf-level2_growth.parquet")
    df_plots = pl.read_parquet("./data/raw/icpf-level2_plot-info.parquet")
    df_crown = pl.read_parquet("./data/raw/icpf-level2_crown-conditions.parquet")

print(f"Number of rows: {df_growth.height}")


assert "diameter" in df_growth.columns, (
    "The 'diameter' column is missing from the DataFrame."
)

# Drop all rows with null values in the 'diameter' column
df_growth = df_growth.drop_nulls(subset="diameter")
print(f"Number of rows after dropping nulls: {df_growth.height}")

# Drop all rows where country is 'Belgium' or 'Spain'
df_growth = df_growth.filter(~pl.col("country").is_in(["Belgium", "Spain"]))
print(f"Number of rows after dropping Belgium and Spain: {df_growth.height}")

# Drop all rows with:
# - diameter_quality_code is larger than 2 (implausible, https://icp-forests.org/documentation/Dictionaries/d_gr_quality_code.html)
# - or diameter_method_code is in [7] (estimated diameter, https://icp-forests.org/documentation/Dictionaries/d_diameter.html)
# - or removal_code is larger than 10 (dead tree, see https://icp-forests.org/documentation/Dictionaries/d_removal_mortality_ccgr.html)
# Keep null values in 'diameter_quality_code', 'diameter_method_code', and 'removal_code' for now

df_growth = df_growth.filter(
    pl.col("diameter_quality_code").is_null() | ~pl.col("diameter_quality_code").gt(2)
)
df_growth = df_growth.filter(
    pl.col("diameter_method_code").is_null()
    | ~pl.col("diameter_method_code").is_in([7])
)
df_growth = df_growth.filter(
    pl.col("removal_code").is_null() | ~pl.col("removal_code").gt(10)
)

print(f"Number of rows after dropping quality codes 3-9: {df_growth.height}")

# Drop rows with negative or zero diameter values
df_growth = df_growth.filter(pl.col("diameter").gt(0))
print(f"Number of rows after dropping negative diameters: {df_growth.height}")

# Compute the growth rate and relative growth rate between two censuses
df_growth = (
    df_growth.sort(by=["country_code", "plot_code", "tree_number", "date"])
    .with_columns(
        period_start=pl.col("date")
        .shift(1)
        .over("country", "plot_code", "tree_number"),
        period_end=pl.col("date"),
        diameter_start=pl.col("diameter")
        .shift(1)
        .over("country", "plot_code", "tree_number"),
        diameter_end=pl.col("diameter"),
        diameter_method_code_start=pl.col("diameter_method_code")
        .shift(1)
        .over("country", "plot_code", "tree_number"),
        diameter_method_code_end=pl.col("diameter_method_code"),
    )
    .with_columns(
        period_duration=pl.col("period_end") - pl.col("period_start"),
        growth=pl.col("diameter_end") - pl.col("diameter_start"),
    )
    .with_columns(
        period_duration_d=pl.col("period_duration").dt.total_days(),
        period_duration_y=pl.col("period_duration").dt.total_days() / 365.25,
    )
    .with_columns(
        growth_rate=pl.col("growth") / pl.col("period_duration_y"),
        growth_rel=pl.col("growth") / pl.col("diameter_start"),
    )
    .with_columns(
        growth_rate_rel=pl.col("growth_rel") / pl.col("period_duration_y"),
    )
    .select(
        "survey_year",
        "tree_id",
        "plot_id",
        "country_code",
        "country",
        "tree_species_code",
        "specie",
        "plot_code",
        "tree_number",
        "period_start",
        "period_end",
        "diameter_start",
        "diameter_end",
        "period_duration_d",
        "period_duration_y",
        "growth",
        "growth_rate",
        "growth_rel",
        "growth_rate_rel",
        "diameter_method_code_start",
        "diameter_method_code_end",
        pl.col("removal_code").alias("removal_code_end"),
        pl.col("diameter_quality").alias("diameter_quality_end"),
        pl.col("diameter_method").alias("diameter_method_end"),
        pl.col("removal_info").alias("removal_info_end"),
    )
    .drop_nulls(subset=["period_duration_y"])
)
print(f"Number of rows after computing growth rates: {df_growth.height}")

# Keep only growth periods between 4 and 6 yesrs included
df_growth = df_growth.filter(pl.col("period_duration_y").is_between(4.0, 6.0))
print(
    f"Number of rows after filtering growth periods between 4 and 6 years: {df_growth.height}"
)

# Keep only relative growth rates between 0 and 0.1 to avoid dead trees and implausible growth rates
df_growth = df_growth.filter(pl.col("growth_rate_rel").is_between(0, 0.1))
print(
    f"Number of rows after filtering relative growth rates between 0 and 0.1: {df_growth.height}"
)

Number of rows: 626456
Number of rows after dropping nulls: 605159
Number of rows after dropping Belgium and Spain: 549800
Number of rows after dropping quality codes 3-9: 525010
Number of rows after dropping negative diameters: 524956
Number of rows after computing growth rates: 280967
Number of rows after filtering growth periods between 4 and 6 years: 150278
Number of rows after filtering relative growth rates between 0 and 0.1: 142926


In [None]:
# Join to plot information
PLOT_COLS = [
    "plot_latitude",
    "plot_longitude",
    "plot_slope",
    "plot_orientation",
    "plot_altitude",
]

df_growth = df_growth.join(
    df_plots.select("plot_id", *PLOT_COLS),
    on="plot_id",
    how="left",
)

print(f"Number of rows after joining with plot information: {df_growth.height}")

Number of rows after joining with plot information: 142926


In [None]:
# Prepare the crown condition data
print(f"Number of rows in crown condition data: {df_crown.height}")

# Dropping all rows with null or negative defoliation
df_crown = df_crown.filter(
    pl.col("defoliation").is_not_null() & pl.col("defoliation").ge(0)
).with_columns(defoliation=pl.col("defoliation").cast(pl.Int32))
print(f"Number of rows with valid defoliation: {df_crown.height}")

# Merge the crown condition data with the growth data
df = (
    df_crown.sort(by="date")
    .join_asof(
        df_growth.with_columns(date=pl.col("period_end")).sort(by="date"),
        by=["tree_id"],
        on="date",
        strategy="forward",
        suffix="_gp",
    )
    .filter(pl.col("date").is_between(pl.col("period_start"), pl.col("period_end")))
    .drop_nulls(subset="period_end")
)
print(f"Number of rows after merging crown condition data: {df_growth.height}")

Number of rows in crown condition data: 740311
Number of rows with valid defoliation: 740311
Number of rows after merging crown condition data: 142926


In [None]:
df.sort("tree_id", "date").slice(12000, 10).select(
    "survey_year",
    "tree_id",
    "defoliation",
    "date",
    "period_start",
    "period_end",
    "growth_rate",
)

survey_year,tree_id,defoliation,date,period_start,period_end,growth_rate
i64,str,i32,datetime[ns],datetime[ns],datetime[ns],f64
2005,"""04.0705.00023""",60,2005-07-14 00:00:00,2004-12-07 00:00:00,2009-11-19 00:00:00,0.256564
2006,"""04.0705.00023""",45,2006-07-18 00:00:00,2004-12-07 00:00:00,2009-11-19 00:00:00,0.256564
2007,"""04.0705.00023""",40,2007-07-27 00:00:00,2004-12-07 00:00:00,2009-11-19 00:00:00,0.256564
2008,"""04.0705.00023""",55,2008-07-21 00:00:00,2004-12-07 00:00:00,2009-11-19 00:00:00,0.256564
2009,"""04.0705.00023""",50,2009-07-20 00:00:00,2004-12-07 00:00:00,2009-11-19 00:00:00,0.256564
2010,"""04.0705.00023""",45,2010-07-15 00:00:00,2009-11-19 00:00:00,2014-11-28 00:00:00,0.25677
2011,"""04.0705.00023""",45,2011-07-27 00:00:00,2009-11-19 00:00:00,2014-11-28 00:00:00,0.25677
2012,"""04.0705.00023""",40,2012-08-03 00:00:00,2009-11-19 00:00:00,2014-11-28 00:00:00,0.25677
2013,"""04.0705.00023""",45,2013-07-23 00:00:00,2009-11-19 00:00:00,2014-11-28 00:00:00,0.25677
2014,"""04.0705.00023""",30,2014-07-15 00:00:00,2009-11-19 00:00:00,2014-11-28 00:00:00,0.25677
