In [1]:
import os

# os.chdir("/home/dahala/Transformer-Gorkha/")
import tensorflow as tf
import json
import numpy as np
import seaborn as sns
from src import preparedata
from src import transformermodel
from src import traintransformer
import matplotlib.pyplot as plt
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from tqdm.notebook import tqdm


In [2]:
# NOTE(review): the original reminder on this cell read "put inference true" -
# presumably an inference flag in params/params.json has to be enabled before
# running this notebook; TODO confirm which key that is.

# Read the run configuration. The context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("params/params.json", "r") as params_file:
    params = json.load(params_file)

# Build the dataset wrapper from the data-preparation arguments and populate
# it; later cells read dataset.Xt (temporal input) and dataset.Xc (static input).
dataset = preparedata.readTransformerData(params["dataprepinargs"])
dataset.preparedata()

In [None]:
# Restore the trained model from its saved .keras checkpoint.
# Previous weights-only variant, kept for reference:
#landslidehazard.model.load_weights("checkpoints/DS_daily_75.h5")
checkpoint_path = "checkpoints/DS_daily_75.keras"
model = tf.keras.models.load_model(checkpoint_path)

In [None]:
# Print the layer-by-layer summary of the loaded model.
# Previous call on the wrapper object, kept for reference:
#landslidehazard.model.summary()
model.summary()

In [None]:
# Display the shape of the temporal input array; later cells index it as
# (instances, days, features).
dataset.Xt.shape

## Attention weights and Grad-AAM

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Grad-AAM: weight every head's attention map by the gradient of the prediction
# with respect to that map, then average the weighted maps over the heads.

# Assuming 'multi_head_attention' is the name of your attention layer
attention_layer = model.get_layer("multi_head_attention")

# Inputs to the model
input_Xt = dataset.Xt  # Temporal input
# Static input; NaNs are zero-filled as in the other attribution cells.
# `nan=` is passed by keyword so dataset.Xc itself is not modified.
input_Xc = np.nan_to_num(dataset.Xc, nan=0.0)

with tf.GradientTape() as tape:
    # Standalone forward pass through the attention layer, using the temporal
    # input as query, key and value (self-attention).
    query = input_Xt
    key = input_Xt
    value = input_Xt
    _, attention_weights = attention_layer(
        query=query, key=key, value=value, return_attention_scores=True
    )

    # Watch attention weights
    tape.watch(attention_weights)

    # Forward pass through the full model
    prediction = model([input_Xt, input_Xc])

# Compute gradients of the prediction with respect to attention weights
gradients = tape.gradient(prediction, attention_weights)

# NOTE(review): `attention_weights` comes from a separate call to the layer and
# `prediction` from an independent forward pass of `model`; nothing in this
# cell links the two, so tape.gradient is expected to return None and the guard
# below to fire. Real gradients need the attention scores that the model itself
# computes during its forward pass (e.g. a model variant that returns them) -
# TODO.
if gradients is None:
    raise ValueError("Gradients could not be computed. Check if attention_weights are connected to the prediction.")

# Head-averaged attention map, kept for inspection: (batch, query, key).
avg_attention_weights = tf.reduce_mean(attention_weights, axis=1)

# Combine gradients with attention weights per head. Both tensors are
# (batch, heads, query, key), so the product is element-wise. The previous
# version multiplied the head-averaged map by the per-head gradients, which
# either fails to broadcast or mixes axes, and then averaged over axis 1 a
# second time.
grad_aam_scores = attention_weights * gradients

# Average Grad-AAM scores across heads -> (batch, query, key)
avg_grad_aam_scores = tf.reduce_mean(grad_aam_scores, axis=1).numpy()

# Plot the Grad-AAM scores
plt.figure(figsize=(10, 8))
plt.imshow(avg_grad_aam_scores[0], cmap="viridis", aspect="auto")  # Use 0th sample
plt.colorbar(label="Grad-AAM Score")
plt.title("Grad-AAM Heatmap")
plt.xlabel("Key Timestep")
plt.ylabel("Query Timestep")
plt.show()


## Scores from integrated gradients (IG)

In [None]:
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

# Integrated Gradients (IG) for the temporal input, computed batch by batch
# against an all-zeros baseline. The path-averaged gradient ("sensitivity")
# that IG multiplies by (input - baseline) is kept as a second result.
num_features = dataset.Xt.shape[2]
num_instances = dataset.Xt.shape[0]
all_ig_scores = []
all_sensitivity_scores = []

batch_size = 200
batches = int(np.ceil(num_instances / batch_size))  # Ensure we process all instances
num_steps = 50  # Number of steps for the IG approximation

for i in tqdm(range(batches)):
    start = batch_size * i
    end = min(batch_size * (i + 1), num_instances)  # Ensure we do not exceed the array bounds
    dyn = dataset.Xt[start:end, :, :]
    # `nan=` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, so the former `nan_to_num(x, 0)` meant
    # copy=False and overwrote dataset.Xc in place through the slice view.
    stt = np.nan_to_num(dataset.Xc[start:end], nan=0.0)

    # Baseline input (all zeros)
    bb = np.zeros_like(dyn)

    # Convert inputs to tensors
    stt_tensor = tf.convert_to_tensor(stt, dtype=tf.float32)
    dyn_tensor = tf.convert_to_tensor(dyn, dtype=tf.float32)
    bb_tensor = tf.convert_to_tensor(bb, dtype=tf.float32)

    # Running sum of the gradients taken along the baseline -> input path
    total_gradients = np.zeros_like(dyn)

    for alpha in np.linspace(0, 1, num_steps):
        # Interpolate between the baseline and the input
        interpolated_input = bb_tensor + alpha * (dyn_tensor - bb_tensor)

        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            output = model([interpolated_input, stt_tensor])
        # Taken outside the `with` block so the tape does not also record the
        # gradient computation itself; the resulting values are the same.
        gradients = tape.gradient(output, interpolated_input)

        # Accumulate gradients for each step
        total_gradients += gradients.numpy()

    # Calculate average sensitivity (mean gradient over steps)
    average_sensitivity = total_gradients / num_steps

    # Calculate Integrated Gradients for each day (non-cumulative)
    integrated_gradients = (dyn - bb) * average_sensitivity

    # Append both results to separate lists
    all_ig_scores.append(integrated_gradients)
    all_sensitivity_scores.append(average_sensitivity)

# Concatenate all scores to form single arrays
all_ig_scores = np.concatenate(all_ig_scores, axis=0)
all_sensitivity_scores = np.concatenate(all_sensitivity_scores, axis=0)

# Calculate cumulative IG scores
#cumulative_ig_scores = np.cumsum(all_ig_scores, axis=0)

# Verify the shapes again
print(f"Shape of dataset.Xt: {dataset.Xt.shape}")
print(f"Shape of all_ig_scores: {all_ig_scores.shape}")
#print(f"Shape of cumulative_ig_scores: {cumulative_ig_scores.shape}")
print(f"Shape of all_sensitivity_scores: {all_sensitivity_scores.shape}")

# Save the computed scores if the shapes match
# NOTE(review): the file names say "RS" while the loaded checkpoint is
# "DS_daily_75" - confirm these are the intended output names.
if all_ig_scores.shape == dataset.Xt.shape and all_sensitivity_scores.shape == dataset.Xt.shape:
    os.makedirs("ExplainedGradients", exist_ok=True)  # np.save does not create directories
    np.save("ExplainedGradients/all_scores_RS_daily_75_IG.npy", all_ig_scores)  # IG scores per day
    np.save("ExplainedGradients/all_scores_RS_daily_75_IG_sensitivity.npy", all_sensitivity_scores)  # Sensitivity scores
    #np.save("ExplainedGradients/all_scores_DF_daily_75_cumulative_IG.npy", cumulative_ig_scores)  # Cumulative IG scores
else:
    print("Mismatch in total number of elements. Check the computation of the scores.")


In [None]:
# Inspect the IG scores of a single instance (row 52032 of all_ig_scores).
row_index = 52032
total_rows = all_ig_scores.shape[0]

# Guard first: report an out-of-range index instead of indexing past the end.
if not row_index < total_rows:
    print(f"Row index {row_index} is out of bounds. The total number of rows is {all_ig_scores.shape[0]}.")
else:
    # Scores for the requested row
    print(all_ig_scores[row_index])

In [None]:
# IG computation on the cumulative (running-sum over days) temporal input.
# The baseline is all zeros: the "previous-day cumulative" baseline is left
# commented out below, so it is NOT currently in effect.
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

# Assuming dataset.Xt is structured as (num_instances, num_days, num_features)
num_features = dataset.Xt.shape[2]
num_instances = dataset.Xt.shape[0]
num_days = dataset.Xt.shape[1]

all_ig_scores = []

batch_size = 200
batches = int(np.ceil(num_instances / batch_size))
num_steps = 50  # Number of steps for the IG approximation

# Set to True to print a sample of the interpolated input, model output and
# gradients at every interpolation step. Off by default: this emits three
# lines per step for every batch and floods the notebook output.
debug_prints = False

for i in tqdm(range(batches)):
    start = batch_size * i
    end = min(batch_size * (i + 1), num_instances)
    dyn = dataset.Xt[start:end, :, :]
    # `nan=` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, so the former `nan_to_num(x, 0)` meant
    # copy=False and overwrote dataset.Xc in place through the slice view.
    stt = np.nan_to_num(dataset.Xc[start:end], nan=0.0)

    # Compute cumulative values for `dyn` to create `dyn_cumulative`
    dyn_cumulative = np.cumsum(dyn, axis=1)

    # All-zeros baseline. Uncomment the shift below to use the cumulative value
    # of the previous day as the baseline instead.
    baseline_cumulative = np.zeros_like(dyn_cumulative)
    #baseline_cumulative[:, 1:, :] = dyn_cumulative[:, :-1, :]

    # Convert `dyn_cumulative` and `baseline_cumulative` to tensors
    dyn_tensor = tf.convert_to_tensor(dyn_cumulative, dtype=tf.float32)
    bb_tensor = tf.convert_to_tensor(baseline_cumulative, dtype=tf.float32)
    stt_tensor = tf.convert_to_tensor(stt, dtype=tf.float32)

    # Initialize gradient accumulation
    total_gradients = np.zeros_like(dyn_cumulative)

    for alpha in np.linspace(0, 1, num_steps):
        # Interpolate between the baseline and cumulative input values
        interpolated_input = bb_tensor + alpha * (dyn_tensor - bb_tensor)

        with tf.GradientTape() as tape:
            tape.watch(interpolated_input)
            output = model([interpolated_input, stt_tensor])
        # Taken outside the `with` block so the tape does not also record the
        # gradient computation itself; the resulting values are the same.
        gradients = tape.gradient(output, interpolated_input)

        if debug_prints:
            print(f"Alpha: {alpha}, Interpolated Input Sample:", interpolated_input[0, :5].numpy())  # Sample output for debug
            print(f"Output Sample (Alpha={alpha}):", output.numpy()[0])  # Sample output for debug
            print("Gradient Sample:", gradients[0, :5].numpy())  # Sample gradients for debug

        # Accumulate gradients for each step
        total_gradients += gradients.numpy()

    # Calculate the average gradient and Integrated Gradients for each day
    average_sensitivity = total_gradients / num_steps
    integrated_gradients = (dyn_cumulative - baseline_cumulative) * average_sensitivity

    # Append the IG scores for the batch
    all_ig_scores.append(integrated_gradients)

# Concatenate all IG scores into a single array
all_ig_scores = np.concatenate(all_ig_scores, axis=0)

# Verify the shape
print(f"Shape of dataset.Xt: {dataset.Xt.shape}")
print(f"Shape of all_ig_scores: {all_ig_scores.shape}")

# Save the computed IG scores
if all_ig_scores.shape == dataset.Xt.shape:
    os.makedirs("ExplainedGradients", exist_ok=True)  # np.save does not create directories
    np.save("ExplainedGradients/all_scores_DS_daily_75_IG_cum.npy", all_ig_scores)
else:
    print("Mismatch in total number of elements. Check the computation of IG scores.")

## Gradients to obtain the SHAP equivalent (old, originally proposed method; not working)

In [None]:
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

# Gradient-ratio attribution for the temporal input:
#   (x - baseline) * grad(x) / (grad(x) - grad(baseline) + 1e-10)
# evaluated batch by batch with an all-zeros baseline.
num_features = dataset.Xt.shape[2]
num_instances = dataset.Xt.shape[0]
all_scores = []

batch_size = 200
batches = int(np.ceil(num_instances / batch_size))  # Ensure we process all instances

for i in tqdm(range(batches)):
    start = batch_size * i
    end = min(batch_size * (i + 1), num_instances)  # Ensure we do not exceed the array bounds
    dyn = dataset.Xt[start:end, :, :]
    # `nan=` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, so the former `nan_to_num(x, 0)` meant
    # copy=False and overwrote dataset.Xc in place through the slice view.
    stt = np.nan_to_num(dataset.Xc[start:end], nan=0.0)

    bb = tf.convert_to_tensor(np.zeros(dyn.shape), dtype=tf.float32)
    # Author's note: set the static tensor to 0 and use bb = dyn (non-zero) to
    # evaluate the influence of the static predictors instead.
    stt_tensor = tf.convert_to_tensor(stt, dtype=tf.float32)

    # Gradient of the model output at the baseline
    with tf.GradientTape() as tape_baseline:
        tape_baseline.watch(bb)
        output_baseline = model([bb, stt_tensor])  # Use the tensor version of stt
    gradients_baseline = tape_baseline.gradient(output_baseline, bb)

    xx = tf.convert_to_tensor(dyn, dtype=tf.float32)

    # Gradient of the model output at the actual input
    with tf.GradientTape() as tape_instance:
        tape_instance.watch(xx)
        output = model([xx, stt_tensor])  # Use the tensor version of stt
    gradients_instances = tape_instance.gradient(output, xx)

    # NOTE(review): the 1e-10 term is added to a signed difference, so it does
    # not keep the denominator away from zero; scores can become very large
    # wherever the two gradients nearly coincide.
    scoresv2 = (xx - bb) * gradients_instances / (gradients_instances - gradients_baseline + 1e-10)
    del gradients_instances, gradients_baseline
    all_scores.append(scoresv2.numpy())
    del scoresv2

# Concatenate all scores to form a single array
all_scores = np.concatenate(all_scores, axis=0)

# Verify the shapes again
print(f"Shape of dataset.Xt: {dataset.Xt.shape}")
print(f"Shape of all_scores: {all_scores.shape}")
print(f"Total elements in dataset.Xt: {np.prod(dataset.Xt.shape)}")
print(f"Total elements in all_scores: {np.prod(all_scores.shape)}")

# Reshape and save the computed scores if the shapes match
# NOTE(review): the file name says "RS_hourly" while the loaded checkpoint is
# "DS_daily_75" - confirm this is the intended output name.
if all_scores.shape == dataset.Xt.shape:
    all_scores_3d = all_scores.reshape(dataset.Xt.shape)
    os.makedirs("ExplainedGradients", exist_ok=True)  # np.save does not create directories
    np.save("ExplainedGradients/all_scores_RS_hourly.npy", all_scores_3d)
else:
    print("Mismatch in total number of elements. Check the computation of all_scores.")


## Gradient from SHAP equivalent for static predictors

In [None]:
import tensorflow as tf
import numpy as np
from tqdm.notebook import tqdm

# Gradient-ratio attribution for the static input:
#   (x - baseline) * grad(x) / (grad(x) - grad(baseline) + 1e-10)
# evaluated batch by batch with an all-zeros static baseline; the temporal
# input is passed through unchanged.

# Number of static features
num_static_features = dataset.Xc.shape[1]
num_instances = dataset.Xt.shape[0]
all_scores_static = []

batch_size = 200
batches = int(np.ceil(num_instances / batch_size))  # Ensure we process all instances

for i in tqdm(range(batches)):
    start = batch_size * i
    end = min(batch_size * (i + 1), num_instances)  # Ensure we do not exceed the array bounds

    # Dynamic features (time series) - keep unchanged
    dyn = dataset.Xt[start:end, :, :]

    # Static features - original and baseline (all zeros tensor).
    # `nan=` must be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, so the former `nan_to_num(x, 0)` meant
    # copy=False and overwrote dataset.Xc in place through the slice view.
    stt_original = np.nan_to_num(dataset.Xc[start:end], nan=0.0)
    stt_baseline = tf.convert_to_tensor(np.zeros(stt_original.shape), dtype=tf.float32)  # Set to zeros as a baseline

    dyn_tensor = tf.convert_to_tensor(dyn, dtype=tf.float32)

    # Gradient of the model output at the static baseline
    with tf.GradientTape() as tape_baseline:
        tape_baseline.watch(stt_baseline)
        output_baseline = model([dyn_tensor, stt_baseline])  # Use the baseline static context
    gradients_baseline = tape_baseline.gradient(output_baseline, stt_baseline)

    stt_tensor = tf.convert_to_tensor(stt_original, dtype=tf.float32)

    # Gradient of the model output at the actual static input
    with tf.GradientTape() as tape_instance:
        tape_instance.watch(stt_tensor)
        output = model([dyn_tensor, stt_tensor])  # Use the original static context
    gradients_instances = tape_instance.gradient(output, stt_tensor)

    # NOTE(review): the 1e-10 term is added to a signed difference, so it does
    # not keep the denominator away from zero; scores can become very large
    # wherever the two gradients nearly coincide.
    scores_static = (stt_tensor - stt_baseline) * gradients_instances / (gradients_instances - gradients_baseline + 1e-10)
    del gradients_instances, gradients_baseline
    all_scores_static.append(scores_static.numpy())
    del scores_static

# Concatenate all scores to form a single array
all_scores_static = np.concatenate(all_scores_static, axis=0)

# Verify the shapes again
print(f"Shape of dataset.Xc: {dataset.Xc.shape}")
print(f"Shape of all_scores_static: {all_scores_static.shape}")
print(f"Total elements in dataset.Xc: {np.prod(dataset.Xc.shape)}")
print(f"Total elements in all_scores_static: {np.prod(all_scores_static.shape)}")

# Save the computed scores if the shapes match
# NOTE(review): the file name says "RS" while the loaded checkpoint is
# "DS_daily_75" - confirm this is the intended output name.
if all_scores_static.shape == dataset.Xc.shape:
    os.makedirs("ExplainedGradients", exist_ok=True)  # np.save does not create directories
    np.save("ExplainedGradients/all_scores_static_RS_daily_75.npy", all_scores_static)
else:
    print("Mismatch in total number of elements. Check the computation of all_scores_static.")


## SHAP GradientExplainer (new method using the `shap` Python package; works)

In [None]:
import shap

# SHAP GradientExplainer (expected gradients) over both model inputs.

# Combine inputs into a list. NaNs in the static input are zero-filled, as in
# the other attribution cells; `nan=` is passed by keyword so that dataset.Xc
# itself is left untouched.
inputs = [dataset.Xt, np.nan_to_num(dataset.Xc, nan=0.0)]

# Plain default-parameter variant, kept for reference:
#explainer = shap.GradientExplainer(model, inputs)
#shap_values = explainer.shap_values(inputs)

# Define the explainer with optimized parameters for your setup.
# The full input set doubles as the background data.
explainer = shap.GradientExplainer(
    model,
    inputs,
    batch_size=150,          # Leverage good computational resources
    local_smoothing=0.1    # Slightly increase smoothing for better stability
)

# Compute SHAP values with adjusted settings, including variance output
shap_values, shap_variances = explainer.shap_values(
    inputs,
    nsamples=200,            # Increase samples slightly for better accuracy
    ranked_outputs=None,     # Explain all outputs
    rseed=0,                 # Fixed sampling seed so re-runs give the same values
    return_variances=True    # Include uncertainty estimates
)

# Check shapes and lengths of SHAP values
print("Shape of SHAP values for Xt (time series):", shap_values[0].shape)
print("Shape of SHAP values for Xc (constant terrain):", shap_values[1].shape)
#print("Shape of SHAP variances for Xt (time series):", shap_variances[0].shape)
#print("Shape of SHAP variances for Xc (constant terrain):", shap_variances[1].shape)

# Save SHAP values and variances for each input separately
os.makedirs("ExplainedGradients", exist_ok=True)  # np.save does not create directories
np.save("ExplainedGradients/DS_shap_ge_values_Xt.npy", shap_values[0])  # SHAP values for Xt
np.save("ExplainedGradients/DS_shap_ge_values_Xc.npy", shap_values[1])  # SHAP values for Xc
#np.save("ExplainedGradients/DS_shap_ge_variances_Xt.npy", shap_variances[0])  # Variances for Xt
#np.save("ExplainedGradients/DS_shap_ge_variances_Xc.npy", shap_variances[1])  # Variances for Xc

print("SHAP values saved successfully for Xt and Xc.")

# Drop singleton axes from the per-input SHAP arrays.
shap_values_Xt = np.squeeze(shap_values[0])  # Shape: (69159, 31) after squeezing
shap_values_Xc = np.squeeze(shap_values[1])  # Shape: (69159, 32)

# Per-day summary over all instances: mean plus the 10th/90th percentile band.
mean_shap = shap_values_Xt.mean(axis=0)
p10_shap, p90_shap = np.percentile(shap_values_Xt, [10, 90], axis=0)

# Day indices start at 1 (1 to 31)
n_days_plotted = shap_values_Xt.shape[1]
days = np.arange(1, n_days_plotted + 1)

# Line for the mean, shaded band for the 10-90 percentile range.
fig_xt, ax_xt = plt.subplots(figsize=(10, 6))
ax_xt.plot(days, mean_shap, label="Mean SHAP Value", color="blue", linewidth=2)
ax_xt.fill_between(days, p10_shap, p90_shap, color="blue", alpha=0.3, label="10-90 Percentile")

# Titles, axis labels, legend and grid
ax_xt.set_title("Mean SHAP Values with 10th and 90th Percentiles Over Time (Xt)")
ax_xt.set_xlabel("Days")
ax_xt.set_ylabel("SHAP Values")
ax_xt.legend(loc="upper right")
ax_xt.grid(True)

# Render the time-series figure
fig_xt.tight_layout()
plt.show()

# Box plot of the SHAP values of every static predictor (outliers hidden).
box_style = dict(
    showmeans=True,
    meanline=True,
    whiskerprops=dict(color="black"),
    boxprops=dict(color="blue", linewidth=2),
    capprops=dict(color="black"),
    flierprops=dict(marker=".", markersize=0),  # Remove outliers by making them invisible
    showfliers=False,  # Explicitly turn off outliers
)

fig_xc, ax_xc = plt.subplots(figsize=(12, 6))
ax_xc.boxplot(shap_values_Xc, **box_style)

# Titles and axis labels
ax_xc.set_title("SHAP Values for Static Predictors")
ax_xc.set_xlabel("Static Predictors")
ax_xc.set_ylabel("SHAP Values")

# One tick per static predictor, labelled "Feature 1" ... "Feature N"
n_static = shap_values_Xc.shape[1]
tick_positions = np.arange(1, n_static + 1)
tick_labels = [f"Feature {i}" for i in range(1, n_static + 1)]
ax_xc.set_xticks(tick_positions)
ax_xc.set_xticklabels(tick_labels, rotation=90)

ax_xc.grid(axis="y", linestyle="--", alpha=0.7)

# Render the box-plot figure
fig_xc.tight_layout()
plt.show()


In [None]:
# WITH DEEP EXPLAINER

# The error occurs because DeepExplainer swaps each TensorFlow op's gradient for its own handler registered under a "shap_"-prefixed name, and it has no handler for the LeakyRelu op used in this model; TensorFlow therefore raises a LookupError for shap_LeakyRelu.

# Why This Happens:
# DeepExplainer relies on the TensorFlow backend to compute gradients for all operations in your model. If your model contains unsupported operations (e.g., a custom activation function or operation), TensorFlow raises this LookupError.

# Possible Solutions:
# Use GradientExplainer Instead: GradientExplainer (expected gradients) relies on TensorFlow's ordinary backpropagated gradients and does not need the "shap_"-prefixed per-op handlers, so it avoids this issue. If you don't need DeepExplainer specifically, revert to GradientExplainer.

# python:
# explainer = shap.GradientExplainer(model, inputs)
# shap_values = explainer.shap_values(inputs)

# Replace Unsupported Layers/Activations: If you want to use DeepExplainer, ensure all custom operations are replaced with TensorFlow-supported alternatives. For example:
# Replace custom shap_LeakyRelu with TensorFlow's tf.keras.layers.LeakyReLU.
    
# python:
# from tensorflow.keras.layers import LeakyReLU
# # Replace shap_LeakyRelu in your model definition with LeakyReLU
# model.add(LeakyReLU(alpha=0.1))

# Custom Gradient for Unsupported Operations: If you cannot replace the unsupported operations, you can define custom gradients for those operations. However, this is more advanced and requires modifying the TensorFlow backend.
# Use KernelExplainer or Other SHAP Explainers: If your input data is not large and you can precompute a subset of background data, you can use KernelExplainer, which doesn't rely on gradients:

# python:
# explainer = shap.KernelExplainer(model.predict, background_data)
# shap_values = explainer.shap_values(inputs)

# Check TensorFlow Version Compatibility: Ensure that the TensorFlow version is compatible with SHAP. If you are using an older or newer version of TensorFlow, consider downgrading/upgrading.
# Recommended Path:
# Given the issue with unsupported operations (shap_LeakyRelu), the most straightforward solution is to switch to GradientExplainer or replace unsupported operations with standard TensorFlow operations if you need DeepExplainer

import shap
import numpy as np
import matplotlib.pyplot as plt

# SHAP DeepExplainer over both model inputs.
# NOTE(review): per the notes at the top of this cell, this explainer raised a
# LookupError (shap_LeakyRelu) for this model; the cell is expected to keep
# failing until that is resolved.

# Combine inputs into a list. NaNs in the static input are zero-filled, as in
# the other attribution cells; `nan=` is passed by keyword so that dataset.Xc
# itself is left untouched.
inputs = [dataset.Xt, np.nan_to_num(dataset.Xc, nan=0.0)]

# DeepExplainer integrates over every background sample for every explained
# sample, so the SHAP documentation recommends a background of roughly
# 100-1000 random samples rather than the whole dataset. Draw a fixed-seed
# subset so the result is reproducible.
n_background = 100
background_rng = np.random.default_rng(0)
background_idx = background_rng.choice(
    inputs[0].shape[0],
    size=min(n_background, inputs[0].shape[0]),
    replace=False,
)
background = [arr[background_idx] for arr in inputs]

# Create SHAP explainer using DeepExplainer
explainer = shap.DeepExplainer(model, background)

# Compute SHAP values
shap_values = explainer.shap_values(inputs)

# Check shapes and lengths of SHAP values
print("Shape of SHAP values for Xt (time series):", shap_values[0].shape)
print("Shape of SHAP values for Xc (constant terrain):", shap_values[1].shape)

# Save SHAP values for each input separately
os.makedirs("ExplainedGradients", exist_ok=True)  # np.save does not create directories
np.save("ExplainedGradients/DS_shap_DE_values_Xt.npy", shap_values[0])  # SHAP values for Xt
np.save("ExplainedGradients/DS_shap_DE_values_Xc.npy", shap_values[1])  # SHAP values for Xc

print("SHAP values saved successfully for Xt and Xc.")

# Remove singleton axes from the DeepExplainer outputs.
shap_values_Xt = np.squeeze(shap_values[0])  # Shape: (69159, 31) after squeezing
shap_values_Xc = np.squeeze(shap_values[1])  # Shape: (69159, 32)

# Summary statistics per day, taken over all instances.
mean_shap = shap_values_Xt.mean(axis=0)  # Mean for each day
p10_shap, p90_shap = np.percentile(shap_values_Xt, [10, 90], axis=0)  # 10th / 90th percentile for each day

# Day indices (1 to 31)
days = np.arange(shap_values_Xt.shape[1]) + 1

# Mean curve with the 10-90 percentile envelope.
fig_days, ax_days = plt.subplots(figsize=(10, 6))
ax_days.fill_between(days, p10_shap, p90_shap, color="blue", alpha=0.3, label="10-90 Percentile")
ax_days.plot(days, mean_shap, label="Mean SHAP Value", color="blue", linewidth=2)

# Legend entries keep the original order: mean line first, band second.
handles, labels = ax_days.get_legend_handles_labels()
ax_days.legend(handles[::-1], labels[::-1], loc="upper right")

# Annotate the axes
ax_days.set_title("Mean SHAP Values with 10th and 90th Percentiles Over Time (Xt)")
ax_days.set_xlabel("Days")
ax_days.set_ylabel("SHAP Values")
ax_days.grid(True)

# Show the first plot
fig_days.tight_layout()
plt.show()

# Distribution of SHAP values per static predictor, drawn as a box plot with
# outliers suppressed.
fig_static, ax_static = plt.subplots(figsize=(12, 6))
ax_static.boxplot(
    shap_values_Xc,
    showmeans=True,
    meanline=True,
    whiskerprops={"color": "black"},
    boxprops={"color": "blue", "linewidth": 2},
    capprops={"color": "black"},
    flierprops={"marker": ".", "markersize": 0},  # Remove outliers by making them invisible
    showfliers=False,  # Explicitly turn off outliers
)

# Annotate the axes
ax_static.set_title("SHAP Values for Static Predictors")
ax_static.set_xlabel("Static Predictors")
ax_static.set_ylabel("SHAP Values")

# Ticks 1..N, one per static predictor, with rotated "Feature i" labels
feature_numbers = range(1, shap_values_Xc.shape[1] + 1)
ax_static.set_xticks(np.arange(1, shap_values_Xc.shape[1] + 1))
ax_static.set_xticklabels([f"Feature {i}" for i in feature_numbers], rotation=90)

ax_static.grid(axis="y", linestyle="--", alpha=0.7)

# Show the second plot
fig_static.tight_layout()
plt.show()
