<a href="https://colab.research.google.com/github/samaneh-m/TU-simulation-base-inference/blob/main/Untitled30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "bayesflow>=2.0"
!pip install tensorflow  # CPU version is fine; GPU optional



In [None]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import bayesflow

INFO:bayesflow:Using backend 'tensorflow'


In [None]:
import numpy as np
import tensorflow as tf
import bayesflow as bf

from bayesflow.types import Shape
from bayesflow.utils import tree_concatenate
from bayesflow.utils.decorators import allow_batch_size

In [None]:
class HiddenStateSimulator:
    """Two-state hidden Markov model that emits amino-acid sequences.

    The hidden chain moves between the states ``'alpha'`` and ``'other'``.
    At every position the current state emits one of 20 one-letter
    amino-acid codes according to that state's emission distribution, then
    the chain transitions to the next state.

    Parameters
    ----------
    seq_len : int
        Number of positions in every simulated sequence.
    seed : int or None
        Seed for a private random generator. With ``None`` (the default) the
        global ``np.random`` state is used, so code that relies on
        ``np.random.seed`` keeps behaving exactly as before.
    """

    def __init__(self, seq_len=50, seed=None):
        self.seq_len = seq_len
        self.states = ['alpha', 'other']
        self.amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
                            'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

        # Emission probabilities, ordered like ``self.amino_acids``.
        # Each row is given in percent and sums to 100 before the division.
        self.emissions = {
            'alpha': np.array([
                12, 6, 3, 5, 1, 9, 5, 4, 2, 7,
                12, 6, 3, 4, 2, 5, 4, 1, 3, 6
            ]) / 100,
            'other': np.array([
                6, 5, 5, 6, 2, 5, 3, 9, 3, 5,
                8, 6, 2, 4, 6, 7, 6, 1, 4, 7
            ]) / 100
        }

        # Transition matrix: outer key is the from-state, inner key the to-state
        self.transitions = {
            'alpha': {'alpha': 0.90, 'other': 0.10},
            'other': {'alpha': 0.05, 'other': 0.95}
        }

        # The np.random module and RandomState expose the same ``choice``
        # API and, for equal seeds, the same random stream.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def sample(self, batch_shape=1):
        """Simulate ``batch_shape`` independent sequences.

        Every chain starts in the ``'other'`` state.

        Parameters
        ----------
        batch_shape : int
            Number of sequences to simulate.

        Returns
        -------
        dict
            ``'observed_sequence'``: list of ``batch_shape`` lists holding
            ``seq_len`` amino-acid letters each (plain ``str``).
            ``'hidden_states'``: list of the matching state paths
            (``'alpha'`` / ``'other'``, plain ``str``).
        """
        rng = self._rng

        # Transition probabilities in the order of ``self.states`` so they
        # can be passed to ``choice`` directly.
        next_state_probs = {
            src: [self.transitions[src][dst] for dst in self.states]
            for src in self.states
        }

        samples = []
        states = []

        for _ in range(batch_shape):
            seq = []
            st_seq = []

            current_state = 'other'
            for _ in range(self.seq_len):
                # Emit amino acid; str() turns the NumPy scalar into a
                # plain Python string.
                aa = rng.choice(self.amino_acids, p=self.emissions[current_state])
                seq.append(str(aa))
                st_seq.append(current_state)

                # Transition to next state (also drawn after the final
                # position, which keeps the random stream unchanged).
                current_state = str(
                    rng.choice(self.states, p=next_state_probs[current_state])
                )

            samples.append(seq)
            states.append(st_seq)

        return {
            'observed_sequence': samples,
            'hidden_states': states
        }

In [None]:
# Fix the global NumPy seed so the simulated data set is the same on every
# "Restart & Run All".
np.random.seed(42)

sim = HiddenStateSimulator(seq_len=50)
data = sim.sample(batch_shape=1000)

# Show the first simulated sequence together with its hidden-state path.
print("Amino acid sequence:")
print(data['observed_sequence'][0])
print("\nHidden states:")
print(data['hidden_states'][0])

Amino acid sequence:
['S', 'S', 'L', 'D', 'T', 'A', 'C', 'N', 'S', 'P', 'S', 'V', 'Q', 'K', 'Y', 'T', 'P', 'P', 'F', 'G', 'A', 'P', 'G', 'K', 'C', 'K', 'Q', 'E', 'P', 'I', 'R', 'L', 'P', 'T', 'D', 'V', 'T', 'S', 'V', 'L', 'A', 'Y', 'D', 'F', 'A', 'L', 'R', 'F', 'R', 'L']

Hidden states:
['other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'alpha', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other']


### Mapping data
We map the simulated data into a numeric form suitable for BayesFlow.
Amino acids are encoded as integers from 0 to 19, and hidden states as 0 (other) or 1 (alpha).

In [None]:
# One-letter amino-acid codes; a letter's position in this list is its
# integer code (0 to 19).
amino_acids = list("ARNDCEQGHILKMFPSTWYV")
aa_to_int = dict(zip(amino_acids, range(len(amino_acids))))

# Integer codes for the two hidden states.
state_to_int = dict(alpha=1, other=0)

In [None]:
def encode_sequences(data):
    """Convert simulated letter and state sequences into integer arrays.

    Parameters
    ----------
    data : dict
        Must provide ``'observed_sequence'`` (sequences of amino-acid
        letters) and ``'hidden_states'`` (sequences of state names).

    Returns
    -------
    tuple of np.ndarray
        ``(x, theta)``: amino-acid codes looked up in ``aa_to_int`` and
        state codes looked up in ``state_to_int``, one row per sequence.
    """
    x_rows = []
    for letters in data['observed_sequence']:
        x_rows.append([aa_to_int[letter] for letter in letters])

    theta_rows = []
    for path in data['hidden_states']:
        theta_rows.append([state_to_int[name] for name in path])

    return np.array(x_rows), np.array(theta_rows)

# Integer-encode the simulated batch: x_train holds the amino-acid codes and
# theta_train the matching hidden-state codes, one row per sequence.
x_train, theta_train = encode_sequences(data)

In [None]:
# Keras helper that expands integer class labels into one-hot vectors.
from tensorflow.keras.utils import to_categorical

# One-hot encode the amino-acid codes; 20 classes, one per amino-acid letter.
x_train_onehot = to_categorical(x_train, num_classes=20)  # shape: (N, seq_len, 20)

In [None]:
# Unpack the one-hot array's dimensions: number of sequences, sequence
# length, and alphabet size (the 20 one-hot classes).
N, seq_len, AA = x_train_onehot.shape

# 3. Define a simple simulator that draws from your pre-simulated data
def replay_simulator(batch_size):
    """Pick a random, duplicate-free subset of the pre-simulated data.

    Returns a pair of dicts, ``{"hidden_states": ...}`` and
    ``{"observations": ...}``, both indexed by the same randomly chosen rows
    of ``theta_train`` and ``x_train_onehot``.
    """
    chosen = np.random.choice(N, size=batch_size, replace=False)
    parameters = {"hidden_states": theta_train[chosen]}
    observables = {"observations": x_train_onehot[chosen]}
    return parameters, observables

# 4. Create summary and inference (coupling flow) networks
# The summary network is meant to compress each (seq_len, AA) one-hot
# sequence into a fixed-size representation for the inference network.
# NOTE(review): input_shape, summary_variables, layers and rnn_units do not
# look like documented TimeSeriesNetwork arguments in BayesFlow 2.x (the
# documented ones include summary_dim and recurrent_dim) — confirm they are
# not being silently ignored.
summary_net = bf.networks.TimeSeriesNetwork(
    input_shape=(seq_len, AA),
    summary_variables=["observations"],
    layers=[128, 64],
    rnn_units=64
)

# The inference network models the hidden-state vector (one entry per
# sequence position) given the summary of the observations.
# NOTE(review): n_parameters, context_summary, coupling_layers and
# hidden_sizes likewise do not look like documented CouplingFlow arguments
# in BayesFlow 2.x (documented: depth, subnet_kwargs) — confirm.
# NOTE(review): the targets are binary 0/1 state labels, whereas a coupling
# flow models a continuous density; the steadily falling negative training
# loss may be a symptom of that mismatch — confirm this is intended.
inference_net = bf.networks.CouplingFlow(
    n_parameters=seq_len,
    context_summary=["observations"],
    coupling_layers=6,
    hidden_sizes=[128, 128]
)

In [None]:
class ReplaySimulator:
    """Simulator that replays rows from a fixed, pre-simulated data set.

    Parameters
    ----------
    x_data : array-like
        Observations; the first axis indexes the simulated data sets.
    theta_data : array-like
        Hidden states aligned row-by-row with ``x_data``.
    seed : int or None
        Seed for a private random generator. With ``None`` (the default) the
        global ``np.random`` state is used, as before.

    Raises
    ------
    ValueError
        If ``x_data`` and ``theta_data`` hold a different number of rows.
    """

    def __init__(self, x_data, theta_data, seed=None):
        # np.asarray leaves NumPy arrays untouched and lets plain lists work.
        self.x = np.asarray(x_data)
        self.theta = np.asarray(theta_data)
        if self.x.shape[0] != self.theta.shape[0]:
            raise ValueError(
                "x_data and theta_data must have the same number of rows, "
                f"got {self.x.shape[0]} and {self.theta.shape[0]}"
            )
        self.n = self.theta.shape[0]
        # The np.random module and RandomState expose the same ``choice``
        # API and, for equal seeds, the same random stream.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def sample(self, batch_size):
        """Draw a random batch of aligned (hidden_states, observations) rows.

        Parameters
        ----------
        batch_size : int or tuple of int
            Number (or shape) of rows to draw.

        Returns
        -------
        dict
            ``"hidden_states"`` and ``"observations"`` arrays selected with
            the same random row indices.
        """
        requested = int(np.prod(batch_size))
        # Rows are drawn without replacement whenever the pool is large
        # enough; a larger request falls back to drawing with replacement
        # instead of raising.
        with_replacement = requested > self.n
        idx = self._rng.choice(self.n, size=batch_size, replace=with_replacement)
        return {
            "hidden_states": self.theta[idx],
            "observations": self.x[idx]}

In [None]:
# Wrap the one-hot observations and integer-coded hidden states so that
# training batches can be drawn from the pre-simulated data.
simulator = ReplaySimulator(x_train_onehot, theta_train)

In [None]:
# Bundle the simulator and both networks into one workflow. The
# "hidden_states" entry of a simulated batch is the inference target and the
# "observations" entry is routed to the summary network.
workflow = bf.BasicWorkflow(
    inference_network=inference_net,
    summary_network=summary_net,
    inference_variables=["hidden_states"],
    summary_variables=["observations"],
    simulator=simulator
)

In [None]:
# Train for 10 epochs of 200 batches with 64 sequences each; every batch is
# requested from the workflow's simulator. The return value is kept in
# `history`.
# NOTE: the simulator replays the 1000 stored sequences, so the batches are
# resampled from that fixed pool rather than freshly simulated.
history = workflow.fit_online(
    epochs=10,
    batch_size=64,
    num_batches_per_epoch=200
)

INFO:bayesflow:Fitting on dataset instance of OnlineDataset.
INFO:bayesflow:Building on a test batch.


Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 149ms/step - loss: 30.9250
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 167ms/step - loss: -25.7514
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 143ms/step - loss: -45.2053
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 147ms/step - loss: -57.5122
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 147ms/step - loss: -68.2395
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 142ms/step - loss: -77.1832
Epoch 7/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 141ms/step - loss: -87.0690
Epoch 8/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 143ms/step - loss: -93.8766
Epoch 9/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 148ms/step - loss: -100.8535
Epoch 10/10
[1m200/200[0m [32m━━━━━━━━━━━━━

### Making posterior predictions
We draw posterior samples from the trained model in order to evaluate it.

In [None]:
# Evaluate the trained workflow on a random subset of the simulated data.
# NOTE: these sequences are taken from the training pool (x_train_onehot /
# theta_train), so they are not truly unseen; simulate fresh sequences with
# `sim.sample(...)` for a genuinely held-out set.

# 1. Pick the evaluation sequences
num_test_sequences = 100
test_indices = np.random.choice(N, size=num_test_sequences, replace=False)

x_test_observations = x_train_onehot[test_indices].astype("float32")
theta_test_true = theta_train[test_indices]

# 2. Draw posterior samples through the workflow.
#    Calling `workflow.summary_network` on a dict raises
#    "The structure of `inputs` doesn't match the expected structure".
#    `workflow.sample` instead takes the raw observations as a dict keyed by
#    variable name through its keyword-only `conditions` argument, together
#    with `num_samples` (draws per sequence).
num_posterior_samples = 500
posterior_samples = workflow.sample(
    conditions={"observations": x_test_observations},
    num_samples=num_posterior_samples,
)

# posterior_samples is a dict keyed by inference variable name; the expected
# shape is (num_test_sequences, num_posterior_samples, seq_len) — TODO confirm.
print(f"Shape of posterior samples: {posterior_samples['hidden_states'].shape}")

ValueError: Exception encountered when calling Sequential.call().

[1mThe structure of `inputs` doesn't match the expected structure.
Expected: keras_tensor
Received: inputs={'observations': 'Tensor(shape=(100, 50, 20))'}[0m

Arguments received by Sequential.call():
  • inputs={'observations': 'tf.Tensor(shape=(100, 50, 20), dtype=float32)'}
  • training=False
  • mask={'observations': 'None'}
  • kwargs=<class 'inspect._empty'>

In [None]:
# 1. Generate a new test sample
test_sample = sim.sample(batch_shape=1)
x_test, theta_test = encode_sequences(test_sample)
x_test_onehot = to_categorical(x_test, num_classes=20)

# 2. Sample from the posterior through the workflow.
#    `BasicWorkflow.sample` takes the keyword-only arguments `num_samples`
#    and `conditions` (not batch_size / n_samples / observed). The raw
#    observations are passed in, so no separate summary-network call is
#    needed here.
posterior_samples = workflow.sample(
    num_samples=100,
    conditions={"observations": x_test_onehot},
)

# 3. Only one sequence was passed in, so every leading axis (data set and/or
#    sample axis) is collapsed into a single sample axis.
#    Shape: (n_samples, seq_len)
posterior_array = np.asarray(posterior_samples["hidden_states"]).reshape(
    -1, theta_test.shape[1]
)

# 4. Estimate posterior probabilities (P(alpha-helix) = mean over samples)
posterior_probs = np.mean(posterior_array, axis=0)

# 5. Convert to hard predictions
predicted_states = (posterior_probs >= 0.5).astype(int)
true_states = theta_test[0]

TypeError: BasicWorkflow.sample() missing 2 required keyword-only arguments: 'num_samples' and 'conditions'

In [None]:
# Use first sample for now: encode its state path as 1 for 'alpha', 0 otherwise
true_hidden_states = np.array(
    [int(state == 'alpha') for state in data['hidden_states'][0]]
)