# Bayesian MBG Predictions (batched)

In [11]:
import functions

import pymc as pm

import os
import pickle
import numpy as np
import arviz as az
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import StandardScaler
import pytensor.tensor as at

import uuid
import pytensor.tensor as at
import scipy.stats as stats


## Geostatistical Modeling

### Parameters

In [12]:
#Load report
with open('temp_files/report/report.pkl', 'rb') as pickle_file:
    report = pickle.load(pickle_file)

In [13]:
#Define the directory where the pickle files are stored
pickle_dir = 'temp_files'

target_indicator = report['Target Indicator']

# Get the number of CPU cores to max out the machine in the traning stage
num_cores = os.cpu_count()

print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 8


### Load target and covariates

In [19]:
gdf = pd.read_pickle('temp_files/selected_features.pkl')

In [20]:
selected_features = gdf.columns.to_list()

# Remove target_values and others
remove_list = [target_indicator, 'geometry', 'grid_id']

# Remove elements in remove_list from main_list
selected_features = [item for item in selected_features if item not in remove_list]

In [21]:
# Extract coordinates from the geometry column (using centroid for Polygons)
coordinates = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf.geometry])

# Extract coordinates from the geometry column (using centroid for Polygons) only for observed rows
coordinates_observed = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf[~gdf[target_indicator].isnull()].geometry])

In [28]:
#Only rows with observed target indicator
df1 = gdf[~gdf[target_indicator].isnull()]

In [29]:
# Select and apply the best transformation
target_transformed, transform, lmda = functions.select_transformation(df1[[target_indicator]])

#Store important information for reversion

report['Transformation Applied'] = transform #Transformation applied to target
report['Lambda'] = lmda #Store lambda variable for some reversion processes

Skewness: [0.63607374], Kurtosis: [-0.1470957]
Applying square root transformation due to moderate positive skewness and non-positive values.


In [30]:
#Replace target for target_transformed in the df
df1[target_indicator] = target_transformed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [32]:
# Transformed target variable
y = df1[target_indicator].values

In [33]:
# Covariate matrix
X = df1[selected_features].values

In [34]:
# Standardize features and transformed y
X = (X - X.mean(axis=0)) / X.std(axis=0)
y = (y - y.mean()) / y.std()

### Recreate the saved model

In [35]:
# After fitting the model
trace_filename = 'model_trace.nc'

In [36]:
#Load the model
idata = az.from_netcdf(trace_filename)

# Re-create the model
with pm.Model() as model:
    # Re-create the model's priors and components
    beta = pm.Normal('beta', mu=0, sigma=1, shape=len(selected_features))
    sigma = pm.HalfNormal('sigma', sigma=1)
    ls = pm.HalfCauchy('ls', beta=1)

    D = np.sqrt(((coordinates_observed[:, None, :] - coordinates_observed[None, :, :])**2).sum(axis=-1))

    K = pm.gp.cov.Matern52(2, ls=ls)
    gp = pm.gp.Latent(cov_func=K)
    f = gp.prior('f', X=coordinates_observed)

    mu = pm.math.dot(X, beta) + f
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)

## Generating predictions for all grids

In [62]:
df2 = gdf[gdf[target_indicator].isnull()]

#df2 = df2.sample(400) #temporary to make some tests

df2.reset_index(drop=True, inplace=True)

# Covariate matrix
X_new = df2[selected_features].values

# Standardize the new data using the same scaler fitted on the observed data
X_new = (X_new - X_new.mean(axis=0)) / X_new.std(axis=0)

# Extract coordinates from the geometry column (using centroid for Polygons) only for unobserved rows
coordinates_new = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf[gdf[target_indicator].isnull()].geometry])

### Check if the covariance matrix is PSD
- PSD: Positive Semi-Definitive

In [63]:
#This function is key to diagnose what is going on inside the Gaussian process

def diagnose_covariance_matrix(cov, jitter=1e-6):
    """
    Diagnose potential issues with a covariance matrix and suggest possible remedies.

    Parameters:
    ----------
    cov : np.ndarray or pytensor.tensor
        The covariance matrix to diagnose.
    jitter : float, optional
        The amount of jitter to add to the diagonal of the covariance matrix for stabilization.

    Returns:
    -------
    None
    """

    # Convert pytensor tensor to numpy array for diagnosis if necessary
    if isinstance(cov, at.TensorVariable):
        # Use pm.draw to evaluate the tensor as a NumPy array
        cov = pm.draw(cov)

    # Check for symmetry
    if not np.allclose(cov, cov.T):
        print("Warning: Covariance matrix is not symmetric.")
    else:
        print("Covariance matrix is symmetric.")

    # Check for positive semi-definiteness using eigenvalues
    eigvals = np.linalg.eigvalsh(cov)
    if np.all(eigvals >= 0):
        print("Covariance matrix is positive semi-definite (PSD).")
    elif np.all(eigvals > 0):
        print("Covariance matrix is positive definite (PD).")
    else:
        print("Covariance matrix is not positive semi-definite (non-PSD).")
        print("Eigenvalues:")
        print(eigvals)

    # Check for small or negative eigenvalues
    if np.any(eigvals < 0):
        print("There are negative eigenvalues, indicating non-PSD matrix.")
    elif np.any(eigvals == 0):
        print("There are zero eigenvalues, indicating the matrix is singular or nearly singular.")
    if np.any(eigvals < jitter):
        print("Some eigenvalues are smaller than the jitter value. Consider increasing jitter.")

    # Check the condition number (ratio of max to min eigenvalue)
    cond_number = np.linalg.cond(cov)
    print(f"Condition number of the matrix: {cond_number:.2e}")
    if cond_number > 1e10:
        print("Warning: Covariance matrix is ill-conditioned (large condition number).")
        print("Consider regularization or using a different covariance function.")

    # Suggest adding jitter and re-check PSD
    cov_with_jitter = cov + jitter * np.eye(cov.shape[0])
    eigvals_with_jitter = np.linalg.eigvalsh(cov_with_jitter)
    if np.all(eigvals_with_jitter >= 0):
        print("Adding jitter made the covariance matrix positive semi-definite.")
    else:
        print("Even after adding jitter, the matrix is still not positive semi-definite.")

    # Check for numerical issues using Cholesky decomposition
    try:
        np.linalg.cholesky(cov)
        print("Cholesky decomposition succeeded: Covariance matrix is positive definite.")
    except np.linalg.LinAlgError:
        print("Cholesky decomposition failed: Covariance matrix is not positive definite.")

    print("\nDiagnosis Complete.")

def generate_predictions(model, coordinates_new, X_new, idata, initial_jitter=1e-6, max_attempts=5):
    """
    Generate predictions for new data using a Gaussian Process model.

    Parameters:
    ----------
    model : pm.Model
        The PyMC model object that contains the Gaussian Process.
    coordinates_new : np.ndarray
        An array of coordinates for the new data points where predictions are needed.
    X_new : np.ndarray
        The covariate matrix for the new data points.
    idata : az.InferenceData
        The InferenceData object containing posterior samples from the fitted model.
    initial_jitter : float, optional
        The initial jitter value to add to the covariance matrix to ensure positive definiteness.
    max_attempts : int, optional
        Maximum number of attempts to find a stable jitter value.

    Returns:
    -------
    np.ndarray
        An array of mean predictions for the new data points.
    """

    with model:
        for attempt in range(max_attempts):
            try:
                jitter = initial_jitter * (10 ** attempt)
                unique_name = "f_pred_" + str(uuid.uuid4())

                # Generate the conditional GP with added jitter to the covariance matrix
                f_pred = gp.conditional(unique_name, coordinates_new, jitter=jitter)

                # Compute the mean of the beta samples
                beta_mean = idata.posterior['beta'].mean(dim=("chain", "draw")).values

                # Predictive mean
                mu_pred = pm.math.dot(X_new, beta_mean) + f_pred

                # Create the covariance matrix using PyMC's Matern32
                cov = pm.gp.cov.Matern32(coordinates_new.shape[1], ls=1.0)(coordinates_new)

                # Add jitter using Pytensor's identity matrix
                cov += jitter * at.eye(cov.shape[0])

                # Symmetrize the covariance matrix to ensure symmetry
                cov = (cov + cov.T) / 2

                # Check cov_matrix before predictions
                diagnose_covariance_matrix(cov)
                
                # Check for positive definiteness using Cholesky decomposition
                _ = at.slinalg.cholesky(cov)

                # If successful, proceed with prediction
                pred_samples = pm.sample_posterior_predictive(idata, var_names=[unique_name], return_inferencedata=True)
                return pred_samples.posterior_predictive[unique_name].mean(axis=0)

            except Exception as e:
                if attempt == max_attempts - 1:
                    raise ValueError(f"The covariance matrix is not positive semi-definite even after {max_attempts} attempts with increasing jitter.") from e


In [64]:
#Dataframe to store predictions and other values for uncertainty calculations
df3 = pd.DataFrame(df2[['grid_id', 'mpi']].head(0))
begin_from_scratch = True

In [44]:
#Pick-up productions were we left of. 
with open('temp_files/predictions.pkl', 'rb') as pickle_file:
    df3 = pickle.load(pickle_file)

In [65]:
start_position = df3.index.max() + 1

#Zero if df3 has been just initialized
if begin_from_scratch == True:
    start_position = 0

In [66]:
start_position

0

In [67]:
# Assuming 95% confidence level, change alpha for different confidence levels
alpha = 0.05
z_score = stats.norm.ppf(1 - alpha/2)

In [68]:
step = 100

In [None]:
for i in range(start_position, len(df2), step):

    r = range(i, i+step)

    print(r)
    
    X_new_i = X_new[r]
    
    coordinates_new_i = coordinates_new[r]
    
    predictions_i = generate_predictions(model, coordinates_new_i, X_new_i, idata)
    
    # Extracting relevant data
    dfi = df2.loc[r][['grid_id', target_indicator]]
    dfpi = pd.DataFrame(predictions_i)
    
    # Calculating mean, standard deviation, and other statistics
 
    dfi[target_indicator] = dfpi.mean().values  # Mean prediction for each observation
    dfi['std'] = dfpi.std().values              # Standard deviation for each observation
    dfi['max'] = dfpi.max().values              # Max prediction for each observation
    dfi['min'] = dfpi.min().values              # Min prediction for each observation
     
    # Calculating the margin of error for the confidence interval
    dfi['standard_error'] = dfi['std'] / (len(dfpi)**0.5)
    dfi['margin_of_error'] = z_score * dfi['standard_error']
    
    # Calculating the confidence interval
    dfi['ci_lower'] = dfi[target_indicator] - dfi['margin_of_error']
    dfi['ci_upper'] = dfi[target_indicator] + dfi['margin_of_error']
    
    df3 = pd.concat([df3, dfi])

    #Save predictions so far
    df3.to_pickle('temp_files/predictions.pkl')


range(0, 100)
Covariance matrix is symmetric.
Covariance matrix is positive semi-definite (PSD).
Condition number of the matrix: 1.96e+06
Adding jitter made the covariance matrix positive semi-definite.
Cholesky decomposition succeeded: Covariance matrix is positive definite.

Diagnosis Complete.


Sampling: [f_pred_c813c011-427d-4b10-b134-cd791ad303ad]


Output()

Covariance matrix is symmetric.
Covariance matrix is positive semi-definite (PSD).
Condition number of the matrix: 1.48e+06
Adding jitter made the covariance matrix positive semi-definite.
Cholesky decomposition succeeded: Covariance matrix is positive definite.

Diagnosis Complete.


Sampling: [f_pred_3b30daa5-abf4-45f4-9939-e5c965e85b98]


Output()

Covariance matrix is symmetric.
Covariance matrix is positive semi-definite (PSD).
Condition number of the matrix: 4.35e+05
Adding jitter made the covariance matrix positive semi-definite.
Cholesky decomposition succeeded: Covariance matrix is positive definite.

Diagnosis Complete.


Sampling: [f_pred_41e55111-f828-4f24-b0d5-c43e0d4f1c9d]


Output()

Covariance matrix is symmetric.
Covariance matrix is positive semi-definite (PSD).
Condition number of the matrix: 5.39e+04
Adding jitter made the covariance matrix positive semi-definite.
Cholesky decomposition succeeded: Covariance matrix is positive definite.

Diagnosis Complete.


Sampling: [f_pred_4c8b28ff-71da-46cc-a58b-f93dc582e44e]


Output()