In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import polars as pl
import polars.selectors as cs
from typing import get_args

from data import load_data
from config import Species, TARGET, FEATURES_METADATA

# Load the data of every species listed in the `Species` type and stack the
# per-species frames vertically ("vertical_relaxed" lets polars reconcile
# differing column dtypes between frames)
all_species = get_args(Species)
df = pl.concat(
    (load_data(name) for name in all_species),
    how="vertical_relaxed",
)

In [None]:
# Row counts: overall, and rows having at least one non-null value among the
# "dep_*" and "ss_*" column groups respectively.
# NOTE(review): "dep_*" is labelled as deposition data here because the file
# also uses a "dep_ph" column — confirm against FEATURES_METADATA.
print("Number of rows in total:", df.height)
n_dep_rows = df.filter(pl.any_horizontal(cs.starts_with("dep_").is_not_null())).height
print("Number of rows with deposition data:", n_dep_rows)
n_ss_rows = df.filter(pl.any_horizontal(cs.starts_with("ss_").is_not_null())).height
print("Number of rows with soil solution data:", n_ss_rows)

# Distinct trees and plots, overall and per species
print("Number of unique trees:", df.select(pl.col("tree_id").n_unique()).item())
print("Number of unique plots:", df.select(pl.col("plot_id").n_unique()).item())

# One aggregation pass instead of filtering the frame twice per species;
# sorting makes the printed order reproducible between runs.
species_summary = (
    df.group_by("species")
    .agg(
        pl.col("tree_id").n_unique().alias("n_trees"),
        pl.col("plot_id").n_unique().alias("n_plots"),
    )
    .sort("species")
)
for sp, n_trees, n_plots in species_summary.iter_rows():
    print(f"- {sp.capitalize()}: {n_trees} trees, {n_plots} plots")

In [None]:
# Histogram of growth
import seaborn as sns
import matplotlib.pyplot as plt

# Per-species distribution of the relative growth rate, expressed in percent.
# Each species is normalised on its own (common_norm=False), so the curves are
# comparable even when the species have very different sample sizes.
growth_pct = df.select(pl.col("growth_rate_rel") * 100, "species")

fig, ax = plt.subplots(figsize=(8, 6))

sns.histplot(
    # Converted to pandas like the other seaborn calls in this notebook
    growth_pct.to_pandas(),
    x="growth_rate_rel",
    bins=50,
    ax=ax,
    hue="species",
    element="step",
    stat="density",
    common_norm=False,
    fill=False,
)
ax.set_title("Growth rate distribution by species")
ax.set_xlabel("Relative growth rate [%/year]")
# stat="density" plots a normalised density, not raw counts
ax.set_ylabel("Density")
plt.show()

In [None]:
# Print the feature metadata as a Markdown table: one row per feature, with
# the feature name in the first column followed by its metadata fields.
feature_rows = [
    {"feature": feature, **descr} for feature, descr in FEATURES_METADATA.items()
]

# The display options only apply inside the `with` block: show every row, hide
# the dtype header, and allow wide columns / long strings without truncation.
with pl.Config(
    tbl_formatting="MARKDOWN",
    tbl_hide_column_data_types=True,
    tbl_rows=-1,
    tbl_width_chars=200,
    fmt_str_lengths=200,
):
    print(pl.from_dicts(feature_rows))

In [None]:
# Summary statistics of the three growth measures side by side: absolute
# growth, growth rate, and growth rate relative to diameter
growth_columns = ["growth", "growth_rate", "growth_rate_rel"]
df.select(growth_columns).describe()

In [None]:
# Fail fast when a feature declared in FEATURES_METADATA has no matching
# column in the loaded data
expected_features = set(FEATURES_METADATA)
available_columns = set(df.columns)
missing_features = expected_features - available_columns
if len(missing_features) > 0:
    message = (
        f"Data is missing the following features: {missing_features}. "
        "Please check the data loading process."
    )
    raise ValueError(message)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the number of trees per plot.
# Distinct tree ids are counted (n_unique) rather than rows, so a tree that
# appears in several rows is counted once — the same definition of "trees"
# used for the unique-tree totals elsewhere in this notebook.
print("Distribution of the number of trees per plot:")
num_trees = df.group_by("plot_id").agg(
    pl.col("tree_id").n_unique().alias("num_trees")
)

print(f"# min = {num_trees['num_trees'].min()}")
print(f"# max = {num_trees['num_trees'].max()}")
print(f"# mean = {num_trees['num_trees'].mean()}")
n_single_tree_plots = num_trees.filter(pl.col("num_trees") == 1).height
print(f"# of single-tree plots = {n_single_tree_plots}")

fig, ax = plt.subplots()
sns.histplot(num_trees["num_trees"], bins=20, ax=ax)
ax.set_xlabel("# of trees")
ax.set_ylabel("# of plots")
ax.set_title("Distribution of the number of trees per plot")
plt.show()

In [None]:
# Box plots of the target variable, one box per plot_id.

# Only plots with at least this many distinct trees are shown.
# NOTE(review): confirm that 100 is the intended threshold.
MIN_TREES_PER_PLOT = 100

data = df.with_columns(
    pl.col("tree_id").n_unique().over("plot_id").alias("num_trees")
).filter(pl.col("num_trees") >= MIN_TREES_PER_PLOT)
sns.boxplot(x="plot_id", y=TARGET, data=data.to_pandas())
plt.xlabel("Plot ID")

# Rotate the x tick labels vertically; `_ =` hides the returned tick objects
_ = plt.xticks(rotation=90)

In [None]:
# Histograms of trees over plot latitude and plot altitude (one figure each;
# longitude is not plotted in this cell)
for column, axis_label, heading in (
    ("plot_latitude", "Latitude", "Distribution of latitude"),
    ("plot_altitude", "Altitude", "Distribution of altitude"),
):
    plt.figure(figsize=(5, 4))
    ax = sns.histplot(df[column], bins=20)
    ax.set_xlabel(axis_label)
    ax.set_ylabel("# of trees")
    ax.set_title(heading)

# Histogram of trees over plot orientation, with the species stacked per bar
plt.figure(figsize=(5, 4))
ax = sns.histplot(
    data=df.to_pandas(),
    x="plot_orientation",
    bins=20,
    hue="species",
    multiple="stack",
    stat="count",
)
ax.set_xlabel("Orientation")
ax.set_ylabel("# of trees")
ax.set_title("Distribution of plot orientation")
plt.xticks(rotation=90)

In [None]:
# Distribution of plots (not trees) across orientations.
# Reduce the data to one row per (species, plot) first, taking the first
# orientation value found for each plot. group_by does not guarantee an output
# order, so sort to keep the category and hue order stable between runs.
plot_orientations = (
    df.group_by("species", "plot_id")
    .agg(pl.first("plot_orientation").alias("plot_orientation"))
    .sort("species", "plot_id")
)

plt.figure(figsize=(5, 4))
sns.histplot(
    # Converted to pandas like the other seaborn calls in this notebook
    data=plot_orientations.to_pandas(),
    x="plot_orientation",
    bins=20,
    hue="species",
    multiple="dodge",
    shrink=0.8,
)
plt.xlabel("Orientation")
plt.ylabel("# of plots")
plt.title("Distribution of plots across orientations")

# Rotate the x tick labels vertically; `_ =` hides the returned tick objects
_ = plt.xticks(rotation=90)

In [None]:
# Distinct plots (id, country, coordinates) that have oak rows whose "dep_ph"
# lies between 5.25 and 5.33, bounds included
is_oak = pl.col("species") == "oak"
ph_in_band = pl.col("dep_ph").is_between(5.25, 5.33)
(
    df.filter(is_oak & ph_in_band)
    .select("plot_id", "country", "plot_latitude", "plot_longitude")
    .unique()
)

In [None]:
# Oak rows with "dep_ph" above 5.33: mean relative growth rate and the number
# of distinct plots they come from
oak_high_ph = df.filter((pl.col("species") == "oak") & (pl.col("dep_ph") > 5.33))
oak_high_ph.select(
    pl.col("growth_rate_rel").mean(),
    pl.col("plot_id").n_unique(),
)