<a href="https://colab.research.google.com/github/samaneh-m/TU-simulation-base-inference/blob/main/Untitled30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install "bayesflow>=2.0"
!pip install tensorflow  # CPU version is fine; GPU optional



In [13]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import bayesflow

In [10]:
!pip install bayesflow



In [15]:
import numpy as np

from bayesflow.types import Shape
from bayesflow.utils import tree_concatenate
from bayesflow.utils.decorators import allow_batch_size

In [4]:
class HiddenStateSimulator:
    def __init__(self, seq_len=50):
        self.seq_len = seq_len
        self.states = ['alpha', 'other']
        self.amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
                            'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

        # Emission probabilities
        self.emissions = {
            'alpha': np.array([
                12, 6, 3, 5, 1, 9, 5, 4, 2, 7,
                12, 6, 3, 4, 2, 5, 4, 1, 3, 6
            ]) / 100,
            'other': np.array([
                6, 5, 5, 6, 2, 5, 3, 9, 3, 5,
                8, 6, 2, 4, 6, 7, 6, 1, 4, 7
            ]) / 100
        }

        # Transition matrix: rows are from-states, columns are to-states
        self.transitions = {
            'alpha': {'alpha': 0.90, 'other': 0.10},
            'other': {'alpha': 0.05, 'other': 0.95}
        }

    def sample(self, batch_shape=1):
        samples = []
        states = []

        for _ in range(batch_shape):
            seq = []
            st_seq = []

            current_state = 'other'
            for _ in range(self.seq_len):
                # Emit amino acid
                probs = self.emissions[current_state]
                aa = np.random.choice(self.amino_acids, p=probs)
                seq.append(aa)
                st_seq.append(current_state)

                # Transition to next state
                next_state = np.random.choice(self.states, p=[
                    self.transitions[current_state]['alpha'],
                    self.transitions[current_state]['other']
                ])
                current_state = next_state

            samples.append(seq)
            states.append(st_seq)

        return {
            'observed_sequence': samples,
            'hidden_states': states
        }

In [5]:
sim = HiddenStateSimulator(seq_len=50)
data = sim.sample(batch_shape=1000)

print("Amino acid sequence:")
print(data['observed_sequence'][0])
print("\nHidden states:")
print(data['hidden_states'][0])
# len(data['observed_sequence'][0])

Amino acid sequence:
['V', 'K', 'E', 'I', 'D', 'L', 'N', 'Q', 'D', 'S', 'L', 'D', 'S', 'K', 'G', 'P', 'S', 'V', 'E', 'E', 'G', 'P', 'G', 'D', 'G', 'E', 'L', 'V', 'E', 'W', 'S', 'I', 'K', 'M', 'N', 'L', 'S', 'G', 'K', 'T', 'P', 'T', 'D', 'N', 'T', 'E', 'L', 'I', 'S', 'L']

Hidden states:
['other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'alpha', 'alpha', 'alpha', 'alpha', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'alpha', 'alpha', 'alpha', 'alpha']


###Mapping data
Mapping our data to make it suitable for BayesFlow algorithm.
We need to map the amino acids into numbers (0,19) and also the states (0,1)

In [6]:
# Map amino acids to integers (0 to 19)
amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
               'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
aa_to_int = {aa: i for i, aa in enumerate(amino_acids)}

# Map states to integers
state_to_int = {'alpha': 1, 'other': 0}

In [7]:
def encode_sequences(data):
    x_encoded = [[aa_to_int[aa] for aa in seq] for seq in data['observed_sequence']]
    theta_encoded = [[state_to_int[state] for state in seq] for seq in data['hidden_states']]
    return np.array(x_encoded), np.array(theta_encoded)

x_train, theta_train = encode_sequences(data)

In [8]:
from tensorflow.keras.utils import to_categorical

x_train_onehot = to_categorical(x_train, num_classes=20)  # shape: (N, seq_len, 20)

In [16]:
# from tensorflow.keras import layers, models

# def create_summary_net(seq_len, input_dim, output_dim=64):
#     inp = layers.Input(shape=(seq_len, input_dim))
#     x = layers.Bidirectional(layers.LSTM(128))(inp)
#     x = layers.Dense(output_dim, activation='relu')(x)
#     return models.Model(inp, x)

# summary_net = create_summary_net(seq_len=x_train_onehot.shape[1], input_dim=20, output_dim=64)

In [19]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

# 1. Standard imports
import numpy as np
import tensorflow as tf
import bayesflow as bf

N, seq_len, AA = x_train_onehot.shape

# 3. Define a simple simulator that draws from your pre-simulated data
def replay_simulator(batch_size):
    idx = np.random.choice(N, size=batch_size, replace=False)
    return {"hidden_states": theta_train[idx]}, {"observations": x_train_onehot[idx]}

# 4. Create summary and inference (coupling flow) networks
summary_net = bf.networks.TimeSeriesNetwork(
    input_shape=(seq_len, AA),
    summary_variables=["observations"],
    layers=[128, 64],
    rnn_units=64
)

inference_net = bf.networks.CouplingFlow(
    n_parameters=seq_len,
    context_summary=["observations"],
    coupling_layers=6,
    hidden_sizes=[128, 128]
)

# 5. Wrap it all into BasicWorkflow
workflow = bf.BasicWorkflow(
    inference_network=inference_net,
    summary_network=summary_net,
    inference_variables=["hidden_states"],
    summary_variables=["observations"],
    simulator=replay_simulator
)

# 6. Train the workflow
history = workflow.fit_online(
    epochs=10,
    batch_size=64,
    num_batches_per_epoch=200
)


INFO:bayesflow:Fitting on dataset instance of OnlineDataset.
INFO:bayesflow:Building on a test batch.


AttributeError: 'function' object has no attribute 'sample'

In [24]:
class ReplaySimulator:
    def __init__(self, x_data, theta_data):
        self.x = x_data
        self.theta = theta_data
        self.n = theta_data.shape[0]

    def sample(self, batch_size):
        idx = np.random.choice(self.n, size=batch_size, replace=False)
        return {
            "hidden_states": self.theta[idx],
            "observations": self.x[idx]}

In [25]:
simulator = ReplaySimulator(x_train_onehot, theta_train)

In [26]:
workflow = bf.BasicWorkflow(
    inference_network=inference_net,
    summary_network=summary_net,
    inference_variables=["hidden_states"],
    summary_variables=["observations"],
    simulator=simulator
)

In [27]:
history = workflow.fit_online(
    epochs=10,
    batch_size=64,
    num_batches_per_epoch=200
)

INFO:bayesflow:Fitting on dataset instance of OnlineDataset.
INFO:bayesflow:Building on a test batch.


Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 199ms/step - loss: 33.0517
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 149ms/step - loss: -25.6146
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 147ms/step - loss: -45.5819
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 157ms/step - loss: -57.8135
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 154ms/step - loss: -68.8453
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 167ms/step - loss: -78.5484
Epoch 7/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 175ms/step - loss: -89.0275
Epoch 8/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 177ms/step - loss: -97.0033
Epoch 9/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 162ms/step - loss: -102.4441
Epoch 10/10
[1m200/200[0m [32m━━━━━━━━━━━━━