In [None]:
# Reload edited project modules automatically so changes are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2


In [None]:
import sys
sys.path.append("..")

from component.script.project import Project
from component.script.dataset import Dataset

## Set user parameters

In [None]:
# Name of the project to work on; it is handed to Project.load below
project_name = "nuevo3"

# Load the project from JSON
project = Project.load(project_name=project_name)


## Define dataset

In [None]:
# Create the dataset object bound to the loaded project
dataset = Dataset(project=project)


In [None]:
# If no parameters are passed, the function will show all the available variables
dataset.set_target()

In [None]:
# Select the target variable by name (see the list printed by the previous cell)
dataset.set_target("forest_loss_2015_2020")

In [None]:
# Reference year used for the time-dependent variables
dataset.set_year(2020)

# Explanatory variables to include in the dataset
dataset.set_features(
    [
        "towns",
        "roads_edge",
        "centros_poblados_dist",
        "forest_gfc",
        "forest_loss",
        "protected_area",
        "rivers",
        "roads",
        "slope",
        "subj",
    ]
)

In [None]:
# Validate the dataset definition (the checks themselves live in Dataset.validate, not shown here)
dataset.validate()

In [None]:

from component.script.sampling import Sampling, SamplingStrategy


# Random sampling of a fixed number of points; the seed keeps it reproducible
sampling = Sampling(
    strategy=SamplingStrategy.random,
    n_samples=10_000,
    seed=33,
)

In [None]:
# Identifier embedded in the names of the model pickles and prediction outputs
model_identifier_name = "v1"
# Seed forwarded to the GLM training (used as the LogisticRegression random_state)
random_seed = 1

In [None]:
# Display the dataset (what is rendered is defined by Dataset.show, not shown here)
dataset.show()

In [None]:
# Build the sampled table from the dataset using the sampling configuration above.
# NOTE(review): the result is not stored in a variable, while the training cells below
# read their samples from a file — confirm how this output is meant to reach them.
dataset.to_dataframe(sampling=sampling)

## Training formula


In [None]:
from component.script.far_helpers import generate_patsy_formula

In [None]:
# Optional patsy formula that overrides the generated one. Leave it as None to use the
# generated formula. Example of a hand-written formula:
# user_formula = "I(1-fcc) + trial ~ scale(altitude) + scale(dist_edge) + scale(dist_river) + scale(dist_road) + scale(dist_town) + scale(slope) + C(pa)"
user_formula = None

In [None]:

# Formula generated from the dataset definition (presumably from the target and
# features set above — see far_helpers.generate_patsy_formula to confirm)
calculated_formula = generate_patsy_formula(dataset)

In [None]:
# A user-supplied formula wins; otherwise fall back to the generated one
training_formula = calculated_formula if user_formula is None else user_formula
training_formula


## Train the GLM for each period

In [None]:
import pickle

import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from component.script.far_helpers import extract_variables


def train_glm_from_formula(
    formula: str,
    dataset_file: str,
    out_file: str = "glm_model.pickle",
    random_state: int = 42,
    solver: str = "lbfgs",
    max_iter: int = 1000,
):
    """
    Train a logistic regression model from a formula and a CSV sample file.

    Preprocessing:
        - Reads the CSV file
        - Drops rows with a missing value in any column
        - Adds 'trial' = 1
        - Validates that every variable used in the formula exists
        - Filters the dataset to the variables used in the formula

    Parameters:
        formula (str): Patsy-style formula (e.g., 'target ~ var1 + C(var2)')
        dataset_file (str): Path to the input text file (CSV format)
        out_file (str): Path where the model dictionary is saved with pickle
        random_state (int): Random seed for reproducibility
        solver (str): Solver for LogisticRegression
        max_iter (int): Maximum iterations

    Returns:
        dict: Dictionary with the keys 'model', 'predictions', 'deviance',
        'formula', 'dataset_shape' and 'samples_path'. The same dictionary
        is pickled to ``out_file``.

    Raises:
        ValueError: If the file cannot be read, if no row is left after
            loading or after dropping missing values, if a variable of the
            formula is missing, or if the design matrices are misaligned.
    """
    # Read the dataset from the text file
    print(f"📊 Loading data from {dataset_file}")
    try:
        dataset = pd.read_csv(dataset_file)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback
        raise ValueError(f"Failed to read dataset file: {e}") from e

    if dataset.empty:
        raise ValueError("Dataset is empty after loading.")

    # Apply required preprocessing
    print("🧹 Preprocessing data: dropping missing values and adding 'trial' column...")
    # Rows are dropped on ALL columns, before the column filter below
    dataset = dataset.dropna(axis=0)
    if dataset.empty:
        # Fail here with a clear message instead of deep inside patsy/sklearn
        raise ValueError("Dataset is empty after dropping rows with missing values.")
    dataset = dataset.assign(trial=1)  # Add trial column as 1

    # Extract raw variable names used in the formula (ignoring I(), scale(), C())
    raw_variables = extract_variables(formula, "all")

    # `trial` is always required, even when the formula does not mention it
    required_vars = raw_variables | {"trial"}

    # Check which required variables are missing from dataset
    missing_vars = sorted(var for var in required_vars if var not in dataset.columns)
    if missing_vars:
        raise ValueError(f"Missing columns in dataset: {missing_vars}")

    # Keep only the relevant columns; sorting makes the column order (and the
    # log line below) deterministic instead of depending on set iteration order
    dataset = dataset[sorted(required_vars)]

    print(
        f"💾 Filtered dataset to {len(dataset.columns)} variables: {list(dataset.columns)}"
    )
    # Ensure consistent preprocessing
    print(formula, len(dataset))
    y, x = dmatrices(formula, data=dataset, NA_action="drop")
    # Confirm alignment of response and design matrix
    if len(y) != len(x):
        raise ValueError(
            f"Inconsistent sample sizes after dmatrices: Y={len(y)}, X={len(x)}"
        )

    # Only the first response column is used as the binary outcome
    Y = y[:, 0]
    X = x

    print(f"✅ Data aligned: {len(Y)} samples for training")

    # Fit GLM (Logistic Regression)
    # NOTE(review): with its defaults LogisticRegression applies L2
    # regularisation and fits its own intercept, while the patsy design matrix
    # normally already carries an Intercept column — confirm this is intended.
    model = LogisticRegression(
        solver=solver, max_iter=max_iter, random_state=random_state, n_jobs=-1
    )
    model.fit(X, Y)

    # Predicted probability of the positive class on the training samples
    pred_proba = model.predict_proba(X)[:, 1]

    # Compute deviance (twice the un-normalised log loss)
    deviance = 2 * log_loss(Y, pred_proba, normalize=False)

    # Everything needed later to reload the model and rebuild the design info
    model_data = {
        "model": model,
        "predictions": pred_proba,
        "deviance": deviance,
        "formula": formula,
        "dataset_shape": dataset.shape,
        "samples_path": dataset_file,
    }

    # Save model with pickle
    with open(out_file, "wb") as file:
        pickle.dump(model_data, file)

    print(f"✅ GLM trained and saved to: {out_file}")

    return model_data


In [None]:
def get_samples_for_period(period, sample_name: str = "sample.txt"):
    """Return the path of the sample file used to train the model of ``period``.

    The training period is looked up as
    ``period_dictionaries[period]["train_period"]`` and the result is
    ``sampling_folder / <train_period> / sample_name``. Both
    ``period_dictionaries`` and ``sampling_folder`` are read from the
    notebook's global namespace and must be defined before calling this.
    """
    train_period = period_dictionaries[period]["train_period"]
    return sampling_folder / train_period / sample_name


In [None]:
def train_glm_period(formula, period, sample_path, random_seed, model_id):
    """Train the GLM for one period and return the path of the saved model.

    The model is written to
    ``project.folders.glm_model_folder / period / glm_model_<model_id>.pickle``
    (``project`` is the notebook-level project object).

    :param formula: Patsy formula used for training.
    :param period: Name of the period; used as the output sub-folder.
    :param sample_path: Path to the CSV sample file.
    :param random_seed: Seed forwarded as ``random_state`` to the training.
    :param model_id: Identifier embedded in the pickle file name.
    :return: Path of the pickled model file.
    """
    # Create period folder
    period_output_folder = project.folders.glm_model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    # Set output file
    model_output = period_output_folder / f"glm_model_{model_id}.pickle"
    # Train GLM; the returned dictionary is not needed here because it is
    # also persisted to ``model_output``
    train_glm_from_formula(
        formula, sample_path, out_file=model_output, random_state=random_seed
    )
    return model_output


In [None]:
# Train the GLM for the calibration period
period_t = "calibration"

# Sample file of the training period associated with this period
samples = get_samples_for_period(period_t, "sample.txt")

# train_glm_period returns the path of the saved model pickle
glm_trined_calibration = train_glm_period(
    training_formula, period_t, samples, random_seed, model_identifier_name
)


In [None]:
# Train the GLM for the historical period
period_t = "historical"

# Sample file of the training period associated with this period
samples = get_samples_for_period(period_t, "sample.txt")

# train_glm_period returns the path of the saved model pickle
glm_trined_historical = train_glm_period(
    training_formula, period_t, samples, random_seed, model_identifier_name
)


## Apply the trained GLM model

In [None]:
# Standard library imports
import os
import sys
import uuid

# Third party imports
import numpy as np
from osgeo import gdal
import pandas as pd
from patsy.build import build_design_matrices

# Local application imports
from forestatrisk.misc import rescale, makeblock


# predict_raster
def predict_raster(
    model_pickle,
    _x_design_info,
    period_dict="data",
    output_file="predictions.tif",
    blk_rows=128,
    verbose=True,
):
    """Predict the spatial probability of deforestation from a
    statistical model.

    The explanatory rasters are stacked in an in-memory VRT aligned on the
    forest raster and processed block by block, so that large geographical
    areas can be handled. Predictions are written to ``output_file`` as a
    single-band UInt16 GeoTIFF whose nodata value is 0.

    :param model_pickle: Path to the pickled model dictionary. Its
        ``"model"`` entry must have a ``predict_proba()`` function and its
        ``"formula"`` entry is the formula the model was trained with.
    :param _x_design_info: Design matrix information from patsy.
    :param period_dict: Mapping from variable name to raster file. It must
        hold one entry for every predictor of the formula plus a
        ``"forest"`` entry. The default value is only a placeholder: a
        real mapping has to be supplied.
    :param output_file: Name of the output raster file for predictions.
    :param blk_rows: If > 0, number of rows for computation by block.
    :param verbose: Logical. Whether to print messages or not. Default
        to ``True``.
    :raises ValueError: If the forest raster or the virtual raster cannot
        be opened, or if an input band has no usable NoData value.
    :raises OSError: If the output raster cannot be created.

    """
    # Read model and extract data
    # NOTE(review): unpickling runs arbitrary code; only load trusted files.
    model_data = pd.read_pickle(model_pickle)
    model = model_data.get("model")
    formula = model_data.get("formula")
    predictors_variable = sorted(extract_variables(formula, "predictors"))

    # Retrieve the corresponding file paths based on the sorted keys
    raster_list = [period_dict[key] for key in predictors_variable]

    # Get forest layer from period dictionary
    input_forest_raster = period_dict["forest"]

    # Callers may pass a pathlib.Path; hand a plain string to GDAL
    output_file = os.fspath(output_file)

    # Mask on forest
    if verbose:
        print(f"Using {input_forest_raster} file")
    fmaskR = gdal.Open(input_forest_raster)
    if fmaskR is None:
        raise ValueError(f"Could not open forest raster {input_forest_raster}")
    fmaskB = fmaskR.GetRasterBand(1)

    # Landscape variables from forest raster
    gt = fmaskR.GetGeoTransform()
    ncol = fmaskR.RasterXSize
    nrow = fmaskR.RasterYSize
    Xmin = gt[0]
    Xmax = gt[0] + gt[1] * ncol
    Ymin = gt[3] + gt[5] * nrow
    Ymax = gt[3]

    # Column names of the per-block table: one per band, then the pixel
    # coordinates and the forest mask appended in the block loop.
    # A new list is built so the predictor list itself is not mutated.
    var_names = predictors_variable + ["X", "Y", "fmask"]

    # Make vrt with gdalbuildvrt
    if verbose:
        print("Make virtual raster with variables as raster bands")
    param = gdal.BuildVRTOptions(
        resolution="user",
        outputBounds=(Xmin, Ymin, Xmax, Ymax),
        xRes=gt[1],
        yRes=-gt[5],
        separate=True,
    )
    # Unique name so concurrent calls do not share the in-memory file
    rand_uuid = uuid.uuid4()
    vrt_file = f"/vsimem/var_{rand_uuid}.vrt"
    cback = gdal.TermProgress_nocb if verbose else 0
    gdal.BuildVRT(vrt_file, raster_list, options=param, callback=cback)
    stack = gdal.Open(vrt_file)
    if stack is None:
        raise ValueError("Could not build the virtual raster of predictors")
    nband = stack.RasterCount
    proj = stack.GetProjection()

    # List of nodata values
    bandND = np.zeros(nband)
    for k in range(nband):
        # Test the raw value BEFORE storing it: None cannot be written into
        # a float array, and an identity test against np.nan never matches
        nodata = stack.GetRasterBand(k + 1).GetNoDataValue()
        if nodata is None or np.isnan(nodata):
            raise ValueError(
                f"NoData value is not specified (or is NaN) for input raster file {k}"
            )
        bandND[k] = nodata
    bandND = bandND.astype(np.float32)

    # Make blocks
    blockinfo = makeblock(vrt_file, blk_rows=blk_rows)
    nblock = blockinfo[0]
    nblock_x = blockinfo[1]
    x = blockinfo[3]
    y = blockinfo[4]
    nx = blockinfo[5]
    ny = blockinfo[6]
    if verbose:
        print(f"Divide region in {nblock} blocks")

    # Raster of predictions
    if verbose:
        print("Create a raster file on disk for projections")
    driver = gdal.GetDriverByName("GTiff")
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass
    Pdrv = driver.Create(
        output_file,
        ncol,
        nrow,
        1,
        gdal.GDT_UInt16,
        ["COMPRESS=DEFLATE", "PREDICTOR=2", "BIGTIFF=YES"],
    )
    if Pdrv is None:
        raise OSError(f"Could not create output raster {output_file}")
    Pdrv.SetGeoTransform(gt)
    Pdrv.SetProjection(proj)
    Pband = Pdrv.GetRasterBand(1)
    Pband.SetNoDataValue(0)

    # Predict by block
    # Message
    if verbose:
        print("Predict deforestation probability by block")
    # Loop on blocks of data
    for b in range(nblock):
        # Position in 1D-arrays
        px = b % nblock_x
        py = b // nblock_x
        # Number of pixels
        npix = nx[px] * ny[py]
        # Data for one block of the stack (shape = (nband, nrow, ncol))
        data = stack.ReadAsArray(x[px], y[py], nx[px], ny[py])
        data = data.astype(float)  # From uint to float
        # Add a dimension if there is only one variable. This must come
        # before the nodata replacement: on a 2D array ``data[i]`` would
        # address a row instead of a band.
        if data.ndim == 2:
            data = data[np.newaxis, :, :]
        # Replace ND values with -9999
        for i in range(nband):
            data[i][data[i] == bandND[i]] = -9999
        # Coordinates of the center of the pixels of the block
        X_col = (
            gt[0] + x[px] * gt[1] + (np.arange(nx[px]) + 0.5) * gt[1]
        )  # +0.5 for center of pixels
        X = np.repeat(X_col[np.newaxis, :], ny[py], axis=0)
        X = X[np.newaxis, :, :]
        Y_row = (
            gt[3] + y[py] * gt[5] + (np.arange(ny[py]) + 0.5) * gt[5]
        )  # +0.5 for center of pixels
        Y = np.repeat(Y_row[:, np.newaxis], nx[px], axis=1)
        Y = Y[np.newaxis, :, :]
        # Forest mask: every pixel that is not forest (value 1) is flagged
        fmaskA = fmaskB.ReadAsArray(x[px], y[py], nx[px], ny[py])
        fmaskA = fmaskA.astype(float)  # From uint to float
        fmaskA[fmaskA != 1] = -9999
        fmaskA = fmaskA[np.newaxis, :, :]
        # Concatenate forest mask with stack
        data = np.concatenate((data, X, Y, fmaskA), axis=0)
        # Transpose and reshape to 2D array (one row per pixel)
        data = data.transpose(1, 2, 0)
        data = data.reshape(npix, nband + 3)
        # Observations without NA
        w = np.nonzero(~(data == -9999).any(axis=1))
        # Remove observations with NA
        data = data[w]
        # Transform into a pandas DataFrame
        df = pd.DataFrame(data)
        df.columns = var_names
        # Add fake cell column for _x_design_info
        df["cell"] = 0
        # Predict
        pred = np.zeros(npix)  # Initialize with nodata value (0)
        if len(w[0]) > 0:
            # Get X
            (x_new,) = build_design_matrices([_x_design_info], df)
            # Get predictions into an array
            p = model.predict_proba(x_new)[:, 1]
            # Rescale and return to pred
            pred[w] = rescale(p)
        # Assign prediction to raster
        pred = pred.reshape(ny[py], nx[px])
        Pband.WriteArray(pred, x[px], y[py])

    # Compute statistics
    if verbose:
        print("Compute statistics")
    Pband.FlushCache()  # Write cache data to disk
    Pband.ComputeStatistics(False)

    # Dereference the output so it is flushed and closed
    Pband = None
    del Pdrv

    # Release the inputs and free the in-memory VRT, which would otherwise
    # stay allocated for the lifetime of the kernel
    fmaskB = None
    fmaskR = None
    stack = None
    gdal.Unlink(vrt_file)


# End


In [None]:
import pickle
from patsy import dmatrices
import forestatrisk
from component.script.far_helpers import get_design_info


def apply_glm_period(
    period_dictionaries,
    period,
    model,
):
    """Apply a trained GLM to one period and compute its deforestation rates.

    Two files are written to ``project.folders.glm_model_folder / period``:
    the prediction raster ``glm_<period>_<model_identifier_name>.tif`` and the
    table ``defrate_cat_glm_<period name>_<model_identifier_name>.csv``.

    ``project``, ``model_identifier_name`` and ``forest_change_file`` are read
    from the notebook's global namespace.

    :param period_dictionaries: Mapping from period key to that period's
        dictionary, which is passed to ``predict_raster`` and must also
        hold the ``"time_interval"`` and ``"period"`` entries.
    :param period: Key of the period to predict.
    :param model: Path to the pickled model dictionary.
    :return: Path of the prediction raster.
    """
    period_dictionary = period_dictionaries[period]
    # Same base folder as the one train_glm_period writes the models to
    period_output_folder = project.folders.glm_model_folder / period
    period_output_folder.mkdir(parents=True, exist_ok=True)
    prediction_output = (
        period_output_folder / f"glm_{period}_{model_identifier_name}.tif"
    )

    # Load model
    # NOTE(review): unpickling runs arbitrary code; only load trusted files.
    model_f = pd.read_pickle(model)
    formula = model_f.get("formula")
    samples = model_f.get("samples_path")
    # Only the design info of the predictors is needed for prediction
    _, x_design_info = get_design_info(formula, samples)
    time_interval = period_dictionary["time_interval"]

    predict_raster(
        model,
        x_design_info,
        period_dictionary,
        str(prediction_output),  # GDAL is given a plain string path
        blk_rows=256,
        verbose=True,
    )

    # defrate_per_cat
    print("Calculate deforestation rate per cathegory")
    defrate_output = str(
        period_output_folder
        / f"defrate_cat_glm_{period_dictionary['period']}_{model_identifier_name}.csv"
    )
    # NOTE(review): forest_change_file is not defined in the visible part of
    # the notebook — make sure it is set before this function is called.
    forestatrisk.defrate_per_cat(
        forest_change_file,
        str(prediction_output),
        time_interval,
        period,
        defrate_output,
        256,
        False,
    )

    return prediction_output


In [None]:
def get_trained_model(period_dictionaries, period, model_name):
    """Return the path of the model trained for the training period of ``period``.

    The training period is ``period_dictionaries[period]["train_period"]`` and
    the model is looked up under ``project.folders.glm_model_folder``, the
    folder ``train_glm_period`` saves the models to (``project`` is the
    notebook-level project object).

    :param period_dictionaries: Mapping from period key to period dictionary.
    :param period: Key of the period to predict.
    :param model_name: File name of the pickled model.
    :return: Path of the pickled model file (its existence is not checked).
    """
    period_name = period_dictionaries[period]["train_period"]
    return project.folders.glm_model_folder / period_name / model_name


In [None]:
# Predict over the calibration period

period_c = "calibration"


# Path of the model trained for this period's training period
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

# Writes the prediction raster and the deforestation-rate table for this period
glm_predict_calibration = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


In [None]:
# Predict over the validation period

period_c = "validation"


# Path of the model trained for this period's training period
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

# Writes the prediction raster and the deforestation-rate table for this period
glm_predict_validation = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


In [None]:
# Predict over the historical period

period_c = "historical"


# Path of the model trained for this period's training period
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

# Writes the prediction raster and the deforestation-rate table for this period
glm_predict_historical = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


In [None]:
# Predict over the forecast period

period_c = "forecast"


# Path of the model trained for this period's training period
model = get_trained_model(
    period_dictionaries, period_c, f"glm_model_{model_identifier_name}.pickle"
)

# Writes the prediction raster and the deforestation-rate table for this period
glm_predict_forecast = apply_glm_period(
    period_dictionaries,
    period_c,
    model,
)


In [None]:
# Marker that the notebook ran through to its last cell
print("Done!")
