# Bayesian MBG Predictions (batched)

In [23]:
import functions

import pymc as pm

import os
import pickle
import numpy as np
import arviz as az
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import StandardScaler
import pytensor.tensor as at

import uuid
import pytensor.tensor as at
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


## Geostatistical Modeling

### Parameters

In [24]:
#Load report
with open('temp_files/report/report.pkl', 'rb') as pickle_file:
    report = pickle.load(pickle_file)

In [25]:
#Define the directory where the pickle files are stored
pickle_dir = 'temp_files'

target_indicator = report['Target Indicator']

# Get the number of CPU cores to max out the machine in the traning stage
num_cores = os.cpu_count()

print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 8


### Load target and covariates

In [26]:
gdf = pd.read_pickle('temp_files/selected_features.pkl')

In [27]:
selected_features = gdf.columns.to_list()

# Remove target_values and others
remove_list = [target_indicator, 'geometry', 'grid_id']

# Remove elements in remove_list from main_list
selected_features = [item for item in selected_features if item not in remove_list]

In [28]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Extract coordinates from the geometry column (using centroid for Polygons)
coordinates = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf.geometry])

#Standardize
coordinates = scaler.fit_transform(coordinates)

# Extract coordinates from the geometry column (using centroid for Polygons) only for observed rows
coordinates_observed = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf[~gdf[target_indicator].isnull()].geometry])

# Standardize
coordinates_observed = scaler.fit_transform(coordinates_observed)

In [29]:
#Only rows with observed target indicator
df1 = gdf[~gdf[target_indicator].isnull()]

In [30]:
# Select and apply the best transformation
target_transformed, transform, lmda = functions.select_transformation(df1[[target_indicator]])

#Store important information for reversion

report['Transformation Applied'] = transform #Transformation applied to target
report['Lambda'] = lmda #Store lambda variable for some reversion processes

Skewness: [0.70480152], Kurtosis: [0.03634722]
Applying square root transformation due to moderate positive skewness and non-positive values.


In [31]:
#Replace target for target_transformed in the df
df1[target_indicator] = target_transformed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [32]:
# Transformed target variable
y = df1[target_indicator].values

In [33]:
# Covariate matrix
X = df1[selected_features].values

In [34]:
# Standardize features and transformed y

scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)

scaler_y = StandardScaler()
y = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

In [35]:
# Save the scaler to a pickle file
with open('scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)

### Recreate the saved model

In [36]:
# After fitting the model
trace_filename = 'model_trace.nc'

## Predictions

### Loading the trained model

In [37]:
# Load the model
idata = az.from_netcdf(trace_filename)

# Re-create the model
with pm.Model() as model:
    # Re-create the model's priors and components
    beta = pm.Normal('beta', mu=report['Target mean'], sigma=report['Target std'], shape=len(selected_features))
    sigma = pm.HalfNormal('sigma', sigma=report['Target std'])
    ls = pm.HalfCauchy('ls', beta=report['Variogram Range'])

    # Distance matrix for the GP kernel
    D = np.sqrt(((coordinates_observed[:, None, :] - coordinates_observed[None, :, :])**2).sum(axis=-1))

    # Define the covariance function (Matern52 with jitter for numerical stability)
    K = pm.gp.cov.Matern52(2, ls=ls)
    
    # Define the GP latent model
    gp = pm.gp.Latent()
    f = gp.prior('f', X=coordinates_observed)

    # Linear combination of covariates and GP
    mu = pm.math.dot(X, beta) + f
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)



In [38]:
def generate_predictions_no_noise(model, coordinates_new, X_new, idata):
    """
    Generate predictions for new data using a Gaussian Process model, leveraging
    the posterior mean for making predictions. This version does NOT add noise to the predictions.
    
    Returns:
    - mean_predictions: The predicted mean for each data point (without noise).
    """
    with model:
        unique_name = "f_pred_" + str(uuid.uuid4())

        # Generate the conditional GP for the new locations (provides mean and variance)
        f_pred = gp.conditional(unique_name, coordinates_new)

        # Extract the posterior mean for all variables
        posterior_mean = {var: idata.posterior[var].mean(dim=["chain", "draw"]).values for var in idata.posterior.data_vars}

        # Generate posterior predictive samples for the GP predictions
        pred_samples = pm.sample_posterior_predictive(idata, var_names=[unique_name], return_inferencedata=True)
        
        # Get the mean and variance for the GP predictions
        f_pred_mean = pred_samples.posterior_predictive[unique_name].mean(axis=0)
        f_pred_var = pred_samples.posterior_predictive[unique_name].var(axis=0)
        
        # Compute the mean of the beta samples (from posterior)
        beta_mean = posterior_mean['beta']

        print("beta_mean:", beta_mean)  # Should be around the scale of the standardized covariates

        # Compute the linear term for the new data points
        lin_pred = np.dot(X_new, beta_mean)

        # Add the linear term to the GP prediction mean
        mean_predictions = lin_pred + f_pred_mean

        return mean_predictions


### Load the new oobservations

In [39]:
#gdf2 = gdf[gdf[target_indicator].isnull()] #Activate this to exclude the 

#DF with all observations
gdf2 = gdf.copy(deep=True)
df2 = gdf2.copy(deep=True)
df2.reset_index(drop=True, inplace=True)

# Extract coordinates from the geometry column (using centroid for Polygons) only for unobserved rows
coordinates_new = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf2.geometry])

# Standardize
coordinates_new = scaler.fit_transform(coordinates_new)

# Covariate matrix
X_new = df2[selected_features].values

# Standardize the new data using the same scaler fitted on the observed data
X_new = scaler_x.transform(X_new)

### Predictions

In [40]:
begin_from_scratch = True
    
if begin_from_scratch == True: 
    #Dataframe to store predictions and other values for uncertainty calculations
    df3 = pd.DataFrame(df2[['grid_id', 'mpi']].head(0))
    start_position=0

else:
    #Pick-up productions were we left of. 
    with open('temp_files/predictions.pkl', 'rb') as pickle_file:
        df3 = pickle.load(pickle_file)
    start_position = df3.index.max() + 1

In [41]:
start_position

0

In [42]:
# Assuming 95% confidence level, change alpha for different confidence levels
alpha = 0.05
z_score = stats.norm.ppf(1 - alpha/2)

In [43]:
#step = 434
step = 100

In [44]:
for i in range(start_position, len(df2)-1, step):

    r = range(i, i+step)
    
    print(r)
    
    X_new_i = X_new[r]
    
    coordinates_new_i = coordinates_new[r]
    
    #predictions_i = generate_predictions(model, coordinates_new_i, X_new_i, idata, max_attempts=2)

    predictions_i = generate_predictions_no_noise(model, coordinates_new_i, X_new_i, idata)
    
    # Extracting relevant data
    dfi = df2.loc[r][['grid_id', target_indicator]]
    dfpi = pd.DataFrame(predictions_i)
    
    # Calculating mean, standard deviation, and other statistics
 
    dfi[target_indicator] = dfpi.mean().values  # Mean prediction for each observation
    dfi['std'] = dfpi.std().values              # Standard deviation for each observation
    dfi['max'] = dfpi.max().values              # Max prediction for each observation
    dfi['min'] = dfpi.min().values              # Min prediction for each observation
     
    # Calculating the margin of error for the confidence interval
    dfi['standard_error'] = dfi['std'] / (len(dfpi)**0.5)
    dfi['margin_of_error'] = z_score * dfi['standard_error']
    
    # Calculating the confidence interval
    dfi['ci_lower'] = dfi[target_indicator] - dfi['margin_of_error']
    dfi['ci_upper'] = dfi[target_indicator] + dfi['margin_of_error']
    
    df3 = pd.concat([df3, dfi])

    #Save predictions so far
    df3.to_pickle('temp_files/predictions.pkl')


range(0, 100)


Sampling: [f_pred_e9202ca6-fce5-4943-bfbd-e39a1bb927c4]


Output()

Sampling: [f_pred_63acf817-ae7d-4521-9228-e4509db82cba]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(100, 200)


Sampling: [f_pred_f3a1dd26-b07b-47cb-8980-2bddf62e664e]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(200, 300)


beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(300, 400)


Sampling: [f_pred_e73b6d54-38bb-4be1-87c4-dbddfe49dac6]


Output()

Sampling: [f_pred_59dc75b1-7885-4da8-8b36-ce6527986a4f]


Output()

Sampling: [f_pred_14d574a3-c8af-4263-85a3-37d5592151e0]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(500, 600)


Sampling: [f_pred_c51d3b2f-bbdf-474f-94cc-62c0dc7814df]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(600, 700)


Sampling: [f_pred_9a0ea6e2-49a9-415f-9ec9-c86a2562746e]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(700, 800)


Sampling: [f_pred_c1a605c0-5e7a-4a59-97ae-9299fbfde133]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(800, 900)


Sampling: [f_pred_34842120-40f8-4c88-a328-2ea1f35f010f]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(900, 1000)


Sampling: [f_pred_85393c43-baa4-44f8-b31a-e566af360ed5]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1000, 1100)


Sampling: [f_pred_d387f0ae-a204-4795-8bbc-e074cb9bf08f]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1100, 1200)


Sampling: [f_pred_33666d38-fbfc-49dd-95dc-ab2d71c30750]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1200, 1300)


Sampling: [f_pred_9d38bb35-069f-41c1-b5fc-2a965bb43b26]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1300, 1400)


Sampling: [f_pred_c9612cf3-a97d-412e-9556-cfe11d7f5695]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1400, 1500)


Sampling: [f_pred_7cca12ba-1d74-4d11-aab4-be33eb1fbd10]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1500, 1600)


Sampling: [f_pred_5313dcf3-8791-40b1-9f49-121f663323ba]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1600, 1700)


Sampling: [f_pred_95b5498c-27a6-4c11-9c71-6ec6b7158416]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1700, 1800)


Sampling: [f_pred_482d397b-40da-49ac-8e53-8a4d6cee313d]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1800, 1900)


Sampling: [f_pred_f9a34c2c-28eb-4491-b012-c515e9100c90]


Output()

beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
range(1900, 2000)


beta_mean: [-0.05784193  0.17113525  0.18322051  0.01784215  0.09669733  0.05336988
  0.18378545 -0.12905568 -0.01680348  0.05687868]
