# Bayesian MBG Predictions (batched)

In [1]:
import functions

import pymc as pm

import os
import pickle
import numpy as np
import arviz as az
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import StandardScaler
import pytensor.tensor as at

import uuid
import pytensor.tensor as at
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


## Geostatistical Modeling

### Parameters

In [2]:
#Load report
with open('temp_files/report/report.pkl', 'rb') as pickle_file:
    report = pickle.load(pickle_file)

In [3]:
#Define the directory where the pickle files are stored
pickle_dir = 'temp_files'

target_indicator = report['Target Indicator']

# Get the number of CPU cores to max out the machine in the traning stage
num_cores = os.cpu_count()

print(f"Number of CPU cores: {num_cores}")

Number of CPU cores: 8


### Load target and covariates

In [4]:
gdf = pd.read_pickle('temp_files/selected_features.pkl')
gdf_full = pd.read_pickle('temp_files/selected_features_full.pkl')

In [5]:
selected_features = gdf.columns.to_list()

# Remove target_values and others
remove_list = [target_indicator, 'geometry', 'grid_id']

# Remove elements in remove_list from main_list
selected_features = [item for item in selected_features if item not in remove_list]

In [6]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Extract coordinates from the geometry column (using centroid for Polygons)
coordinates = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf.geometry])

#Standardize
coordinates = scaler.fit_transform(coordinates)

# Extract coordinates from the geometry column (using centroid for Polygons) only for observed rows
coordinates_observed = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf[~gdf[target_indicator].isnull()].geometry])

# Standardize
coordinates_observed = scaler.fit_transform(coordinates_observed)

In [7]:
#Only rows with observed target indicator
df1 = gdf[~gdf[target_indicator].isnull()]

In [8]:
# Select and apply the best transformation
target_transformed, transform, lmda = functions.select_transformation(df1[[target_indicator]])

#Store important information for reversion

report['Transformation Applied'] = transform #Transformation applied to target
report['Lambda'] = lmda #Store lambda variable for some reversion processes

Skewness: [0.71940191], Kurtosis: [0.03423027]
Applying square root transformation due to moderate positive skewness and non-positive values.


In [9]:
#Replace target for target_transformed in the df
df1[target_indicator] = target_transformed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [10]:
# Transformed target variable
y = df1[target_indicator].values

In [11]:
# Covariate matrix
X = df1[selected_features].values

In [12]:
# Standardize features and transformed y

scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)

scaler_y = StandardScaler()
y = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

In [13]:
# Save the scaler to a pickle file
with open('scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)

### Recreate the saved model

In [14]:
# After fitting the model
trace_filename = 'model_trace.nc'

## Predictions

### Loading the trained model

In [15]:
# Load the model
idata = az.from_netcdf(trace_filename)

# Re-create the model
with pm.Model() as model:
    # Re-create the model's priors and components
    #beta = pm.Normal('beta', mu=report['Target mean'], sigma=report['Target std'], shape=len(selected_features))
    beta = pm.Normal('beta', mu=0, sigma=1, shape=len(selected_features))    
    
    #sigma = pm.HalfNormal('sigma', sigma=report['Target std'])
    sigma = pm.HalfNormal('sigma', sigma=1)
    
    #ls = pm.HalfCauchy('ls', beta=report['Suggested ls beta'])
    ls = pm.HalfCauchy('ls', beta=0.3)

    # Distance matrix for the GP kernel
    D = np.sqrt(((coordinates_observed[:, None, :] - coordinates_observed[None, :, :])**2).sum(axis=-1))

    # Define the covariance function
    K = pm.gp.cov.Matern52(2, ls=ls)
    
    # Define the GP latent model
    gp = pm.gp.Latent()
    f = gp.prior('f', X=coordinates_observed)

    # Linear combination of covariates and GP
    mu = pm.math.dot(X, beta) + f
    y_obs = pm.Normal('y_obs', mu=mu, sigma=sigma, observed=y)



In [16]:
def generate_predictions_no_noise(model, coordinates_new, X_new, idata):
    """
    Generate predictions for new data using a Gaussian Process model, leveraging
    the posterior mean for making predictions. This version does NOT add noise to the predictions.
    
    Returns:
    - mean_predictions: The predicted mean for each data point (without noise).
    """
    with model:
        unique_name = "f_pred_" + str(uuid.uuid4())

        # Generate the conditional GP for the new locations (provides mean and variance)
        f_pred = gp.conditional(unique_name, coordinates_new, jitter=1e-3)

        # Extract the posterior mean for all variables
        posterior_mean = {var: idata.posterior[var].mean(dim=["chain", "draw"]).values for var in idata.posterior.data_vars}

        # Generate posterior predictive samples for the GP predictions
        pred_samples = pm.sample_posterior_predictive(idata, var_names=[unique_name, 'beta'], return_inferencedata=True)
        
        # Get the mean and variance for the GP predictions
        f_pred_mean = pred_samples.posterior_predictive[unique_name].mean(axis=0)

        
        f_pred_var = pred_samples.posterior_predictive[unique_name].var(axis=0)
        
        # Compute the mean of the beta samples (from posterior)
        beta_mean = posterior_mean['beta']

        # Compute the linear term for the new data points
        lin_pred = np.dot(X_new, beta_mean)

        # Add the linear term to the GP prediction mean
        mean_predictions = lin_pred + f_pred_mean

        return mean_predictions


In [17]:
#DF with all observations
gdf2 = gdf_full.copy(deep=True) #This line to make predictions for all observations
df2 = gdf2.copy(deep=True)
df2.reset_index(drop=True, inplace=True)

# Extract coordinates from the geometry column (using centroid for Pcolygons)
coordinates_new = np.array([(geom.centroid.x, geom.centroid.y) for geom in gdf2.geometry])

# Standardize
coordinates_new = scaler.fit_transform(coordinates_new)

# Covariate matrix
X_new = df2[selected_features].values

# Standardize the new data using the same scaler fitted on the observed data
X_new = scaler_x.transform(X_new)

### Predictions

In [18]:
begin_from_scratch = True
    
if begin_from_scratch == True: 
    #Dataframe to store predictions and other values for uncertainty calculations
    df3 = pd.DataFrame(df2[['grid_id', 'mpi']].head(0))
    start_position=0

else:
    #Pick-up productions were we left of. 
    with open('temp_files/predictions.pkl', 'rb') as pickle_file:
        df3 = pickle.load(pickle_file)
    start_position = df3.index.max() + 1

In [19]:
start_position

0

In [20]:
# Assuming 95% confidence level, change alpha for different confidence levels
alpha = 0.05
z_score = stats.norm.ppf(1 - alpha/2)

In [21]:
step = 100

In [None]:
# Step is the number of observations to select in each iteration
num_observations = len(X_new)
all_indices = np.arange(num_observations)

for i in range(start_position, num_observations, step):
    
    # Randomly select 'step' indices without replacement for each iteration
    if step <= len(all_indices):
        r = np.random.choice(all_indices, size=step, replace=False)
        all_indices = np.setdiff1d(all_indices, r)  # Remove the selected indices from the pool
    else:
        # If fewer than 'step' indices remain, select all remaining indices
        r = all_indices
        all_indices = np.array([])

    print(f"Remaining indices: {len(all_indices)}")

    X_new_i = X_new[r]
    coordinates_new_i = coordinates_new[r]

    predictions_i = generate_predictions_no_noise(model, coordinates_new_i, X_new_i, idata)
    
    # Extracting relevant data
    dfi = df2.loc[r][['grid_id', target_indicator]]
    dfpi = pd.DataFrame(predictions_i)

    dfpi.to_pickle('temp_files/dfpi.pkl')
    
    # Calculating mean, standard deviation, and other statistics
    dfi[target_indicator] = dfpi.mean().values  # Mean prediction for each observation
    dfi['std'] = dfpi.std().values              # Standard deviation for each observation
    dfi['max'] = dfpi.max().values              # Max prediction for each observation
    dfi['min'] = dfpi.min().values              # Min prediction for each observation
    
    # Calculating the margin of error for the confidence interval
    dfi['standard_error'] = dfi['std'] / (len(dfpi)**0.5)
    dfi['margin_of_error'] = z_score * dfi['standard_error']
    
    # Calculating the confidence interval
    dfi['ci_lower'] = dfi[target_indicator] - dfi['margin_of_error']
    dfi['ci_upper'] = dfi[target_indicator] + dfi['margin_of_error']
    
    # Combine with previous results
    df3 = pd.concat([df3, dfi])

    # Save predictions so far
    df3.to_pickle('temp_files/predictions.pkl')

    # Break the loop if no more indices are left to process
    if len(all_indices) == 0:
        break
        

Remaining indices: 7567


Sampling: [beta, f_pred_b011a849-5750-4124-8d94-8f8bae4412a5]


Output()

# Useful code

In [37]:
def generate_predictions_no_noise(model, coordinates_new, X_new, idata):
    """
    Generate predictions for new data using a Gaussian Process model, leveraging
    the posterior mean for making predictions. This version does NOT add noise to the predictions.
    
    Returns:
    - mean_predictions: The predicted mean for each data point (without noise).
    """
    with model:
        unique_name = "f_pred_" + str(uuid.uuid4())

        # Generate the conditional GP for the new locations (provides mean and variance)
        f_pred = gp.conditional(unique_name, coordinates_new, jitter=1e-3)

        # Extract the posterior mean for all variables
        posterior_mean = {var: idata.posterior[var].mean(dim=["chain", "draw"]).values for var in idata.posterior.data_vars}

        # Generate posterior predictive samples for the GP predictions
        pred_samples = pm.sample_posterior_predictive(idata, var_names=[unique_name, 'beta'], return_inferencedata=True)
        
        # Get the mean and variance for the GP predictions
        f_pred_mean = pred_samples.posterior_predictive[unique_name].mean(axis=0)

        
        f_pred_var = pred_samples.posterior_predictive[unique_name].var(axis=0)
        
        # Compute the mean of the beta samples (from posterior)
        beta_mean = posterior_mean['beta']

        # Compute the linear term for the new data points
        lin_pred = np.dot(X_new, beta_mean)

        # Add the linear term to the GP prediction mean
        mean_predictions = lin_pred + f_pred_mean

        return f_pred_mean


In [38]:
# Step is the number of observations to select in each iteration
num_observations = len(X_new)
all_indices = np.arange(num_observations)

r = np.random.choice(all_indices, size=100, replace=False)
all_indices = np.setdiff1d(all_indices, r)  # Remove the selected indices from the pool

X_new_i = X_new[r]
coordinates_new_i = coordinates_new[r]

predictions_i = generate_predictions_no_noise(model, coordinates_new_i, X_new_i, idata)

Sampling: [beta, f_pred_7b3b3c37-8e71-40b1-b4dd-b01312203506]


Output()