In [33]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
from data import load_data, cat_to_codes
from common import Species, FEATURES, CATEGORICAL_COLUMNS, TARGET

# Species analysed throughout this notebook.
SPECIES: Species = "spruce"

# Load the frame for that species through the project's loader.
df = load_data(SPECIES)

# Report the frame's dimensions and its column names.
print(
    f"Data shape: {df.shape}",
    f"Data columns: {df.columns}",
    sep="\n",
)

# Disabled step: convert categorical features to codes.
# NOTE(review): the disabled call references CATEGORICAL_FEATURES, but only
# CATEGORICAL_COLUMNS is imported in this cell — confirm the intended name
# before re-enabling.
# df = cat_to_codes(df, CATEGORICAL_FEATURES)

In [None]:
# Headline counts for the loaded frame: total rows, plus the number of distinct
# countries, plots and trees.
n_records = len(df)
n_countries = df["country"].n_unique()
n_plots = df["plot_id"].n_unique()
n_trees = df["tree_id"].n_unique()

print(f"Species = {SPECIES}")
print(f"# of records = {n_records}")
print(f"# of countries = {n_countries}")
print(f"# of plots = {n_plots}")
print(f"# of trees = {n_trees}")

In [None]:
# Distribution of the number of trees per plot
print("Distribution of the number of trees per plot:")

# Count *distinct* trees in each plot. A plain count of tree_id would count
# rows, which over-counts whenever a tree contributes more than one record;
# n_unique matches how trees are counted in the basic-statistics and box-plot
# cells of this notebook.
num_trees = df.group_by("plot_id").agg(
    pl.col("tree_id").n_unique().alias("num_trees")
)
trees_per_plot = num_trees["num_trees"]
n_single_tree_plots = num_trees.filter(pl.col("num_trees") == 1).height

print(f"# min = {trees_per_plot.min()}")
print(f"# max = {trees_per_plot.max()}")
print(f"# mean = {trees_per_plot.mean()}")
print(f"# of single-tree plots = {n_single_tree_plots}")

# Histogram of plot sizes; label the Axes explicitly so the figure stands alone.
ax = sns.histplot(trees_per_plot, bins=20)
ax.set_xlabel("# of trees")
ax.set_ylabel("# of plots")
ax.set_title("Distribution of the number of trees per plot");

In [None]:
# Box plots of the target variable, one box per plot_id.

# Minimum number of distinct trees a plot must contain to be drawn.
# NOTE(review): the previous comment said "at least 10 trees" while the filter
# used 100. The filter value is kept so the figure's data is unchanged —
# confirm which threshold was intended.
MIN_TREES_PER_PLOT = 100

# Annotate every record with the number of distinct trees in its plot, then
# keep only records from sufficiently large plots.
data = df.with_columns(
    pl.col("tree_id").n_unique().over("plot_id").alias("num_trees")
).filter(pl.col("num_trees") >= MIN_TREES_PER_PLOT)

# seaborn is given a pandas frame here, hence the conversion.
ax = sns.boxplot(x="plot_id", y=TARGET, data=data.to_pandas())
ax.set_title(f"{TARGET} by plot (plots with >= {MIN_TREES_PER_PLOT} trees)");

In [None]:
# Display the column names of df as this cell's output.
df.columns

# Testing a linear model on defoliation features only

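Here we fit an ElasticNet regression that predicts the relative growth rate from the four defoliation summary features (mean, max, min and median), and evaluate it with 5-fold cross-validation grouped by plot.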

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold, GroupKFold, cross_validate
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# Train a very simple linear model to predict growth rate from defoliation.
# Features: the four defoliation summary columns. Target: "growth_rate_rel".
# NOTE(review): TARGET is imported from `common` and used for the box plot;
# confirm whether it should replace the hard-coded target column name here.
X = df.select(
    "defoliation_mean", "defoliation_max", "defoliation_min", "defoliation_median"
).to_numpy()
y = df["growth_rate_rel"].to_numpy()
groups = df["plot_id"].to_numpy()

# Materialise the grouped folds once, so the cross-validation scores and the
# out-of-fold predictions below are guaranteed to share the same train/test
# splits. Grouping by plot_id keeps all records of a plot on one side of a split.
splits = list(GroupKFold(n_splits=5).split(X, y, groups=groups))

# Cross-validation (one fitted estimator is kept per fold)
results = cross_validate(
    ElasticNet(alpha=0.1, l1_ratio=0.1),
    X,
    y,
    cv=splits,
    scoring="r2",
    return_estimator=True,
)

# Parameters of the model fitted on the first fold's training data.
estimator = results["estimator"][0]
print(f"Intercept = {estimator.intercept_}")
print(f"Coefficients = {estimator.coef_}")

# Mean of the per-fold test R2 scores.
print(f"R2 = {results['test_score'].mean():.2f}")

# Out-of-fold predictions: each sample is predicted by the estimator whose
# training data did NOT contain it. Predicting all of X with a single fold's
# estimator would mix training samples into the evaluation and inflate R2.
y_true = y.copy()
y_pred = np.empty_like(y_true, dtype=float)
for fold_estimator, (_train_idx, test_idx) in zip(results["estimator"], splits):
    y_pred[test_idx] = fold_estimator.predict(X[test_idx])

# Pooled out-of-fold R2 (can differ slightly from the mean per-fold R2 above).
r2 = r2_score(y_true, y_pred)

# Order the samples by the actual value so the red curve is monotone and the
# spread of the predictions around it is easy to read.
idx = np.argsort(y_true)
sample_pos = np.arange(len(idx))

fig, ax = plt.subplots(figsize=(10, 5))
# Labels are attached to the artists themselves so the legend cannot be
# mismatched by artist ordering.
ax.scatter(sample_pos, y_pred[idx], alpha=0.2, label="Prediction")
ax.plot(sample_pos, y_true[idx], color="red", label="Actual")
ax.legend()

ax.set_xlabel("Sample")
ax.set_ylabel("Growth rate")
ax.set_title(f"(species = {SPECIES}, R2 = {r2:.2f})");