# In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

# Step 1: Environment Setup

class BuckConverterEnvironment:
    """Simplified buck-converter simulation with a reset/step interface.

    The state is the triple ``(vout, il, vc)`` and the action is a single
    duty cycle bounded by ``action_low`` / ``action_high``.

    NOTE(review): the update equations in ``step`` are a placeholder model
    (``l`` is stored but never used by them) -- confirm against the intended
    converter dynamics before relying on the numbers.
    """

    def __init__(self, dt=1e-5, vin=24, r=10, l=1e-3, c=10e-6, load=10, max_steps=1000):
        """Store the circuit parameters and put the environment in its initial state.

        Args:
            dt: Time step used in the ``vc`` update.
            vin: Input voltage.
            r: Load resistance.
            l: Inductor value (stored only; not used by ``step``).
            c: Capacitor value.
            load: Load value used in the ``vc`` update.
            max_steps: Number of steps after which an episode is marked done.
        """
        self.dt = dt  # Time step
        self.vin = vin  # Input voltage
        self.r = r  # Load resistance
        self.l = l  # Inductor value
        self.c = c  # Capacitor value
        self.load = load  # Load value
        self.max_steps = max_steps

        self.state_dim = 3
        self.action_dim = 1
        self.action_low = np.array([0])
        self.action_high = np.array([1])
        self.state_low = np.array([0, 0, 0])
        self.state_high = np.array([vin, vin, vin])

        self.state = np.zeros(self.state_dim)
        self.done = False
        self.step_count = 0

    def reset(self):
        """Return the environment to the all-zero state and clear the episode flags.

        Returns:
            The new state as a NumPy array of shape ``(state_dim,)``.
        """
        self.state = np.zeros(self.state_dim)
        self.done = False
        self.step_count = 0
        return self.state

    def step(self, duty_cycle):
        """Apply one duty-cycle action and advance the simulation by one step.

        Args:
            duty_cycle: A single value (Python number, 0-d array or any
                size-1 array). It is clipped to ``[action_low, action_high]``.

        Returns:
            ``(next_state, reward, done, info)`` where ``next_state`` is a
            list of three floats, ``reward`` is a float, ``done`` is a bool
            and ``info`` is an empty dict.

        Raises:
            ValueError: If ``duty_cycle`` does not hold exactly one value.
        """
        duty = np.asarray(duty_cycle, dtype=float)
        if duty.size != 1:
            raise ValueError(
                f"duty_cycle must contain exactly one value, got shape {duty.shape}"
            )
        # Collapse to a plain float so the state always keeps shape (state_dim,),
        # whatever array shape the caller used for the action.
        duty = float(np.clip(duty.item(), self.action_low[0], self.action_high[0]))

        # Only the previous capacitor value feeds into the update below.
        vc_prev = self.state[2]

        # Calculate new state variables
        vout = self.vin * duty
        il = (self.vin - vout) / self.r
        vc = (self.dt / self.c) * (self.load - il) + vc_prev

        # Update state
        self.state = np.array([vout, il, vc], dtype=float)

        # Calculate reward
        reward = self.calculate_reward(vout, il, vc)

        # Increment step count
        self.step_count += 1

        # Terminate the episode if the maximum number of steps is reached
        if self.step_count >= self.max_steps:
            self.done = True

        # Hand back a plain list of floats (a copy, so callers cannot mutate
        # the internal state through it).
        next_state = self.state.tolist()

        return next_state, reward, self.done, {}

    def calculate_reward(self, vout, il, vc):
        """Return the placeholder reward for a ``(vout, il, vc)`` triple as a float.

        The reward is the negated sum of ``|vout - vin|``, ``0.01 * |il|`` and
        ``0.01 * |vc - vin|``.

        NOTE(review): this penalises distance from ``vin``, whereas
        ``condition_met`` compares against a 12 V target -- confirm which
        set-point is intended.
        """
        total = (
            -np.abs(vout - self.vin)
            - 0.01 * np.abs(il)
            - 0.01 * np.abs(vc - self.vin)
        )
        # .item() turns both scalars and size-1 arrays into a Python float.
        return np.asarray(total, dtype=float).item()

    def condition_met(self):
        """Return True when ``vout`` is more than 1 away from the 12 V example target.

        This check is not called by ``step``; callers must invoke it themselves.
        """
        target_voltage = 12  # Example target voltage
        vout = self.state[0]
        return bool(np.abs(vout - target_voltage) > 1)

# Build the converter simulation and read the network input/output sizes from it
env = BuckConverterEnvironment()
state_dim, action_dim = env.state_dim, env.action_dim

# Step 2: Deep Q-Network (DQN) Setup

def create_dqn_model(state_dim, action_dim):
    """Build an uncompiled Keras MLP: two 32-unit ReLU layers, then a linear head.

    Args:
        state_dim: Size of the input feature vector.
        action_dim: Number of units in the final (activation-free) layer.

    Returns:
        A ``tf.keras.Sequential`` model.
    """
    hidden_units = 32
    layer_stack = [
        Dense(hidden_units, input_shape=(state_dim,), activation='relu'),
        Dense(hidden_units, activation='relu'),
        Dense(action_dim),
    ]
    return tf.keras.Sequential(layer_stack)

# Two networks with the same architecture: the target network is created
# first, then the online network that the optimizer trains.
target_network, dqn_model = (
    create_dqn_model(state_dim, action_dim) for _ in range(2)
)
optimizer = Adam(learning_rate=1e-3)
loss_fn = tf.keras.losses.MeanSquaredError()

# Step 3: DQN Training

# Number of training episodes. This name was used by the loop below without
# ever being defined; 100 is a placeholder value -- tune as needed.
num_episodes = 100
gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995  # Decay rate for exploration rate
epsilon_min = 0.01  # Minimum exploration rate
batch_size = 64  # Mini-batch size
target_update_interval = 10  # Episodes between target-network syncs

replay_buffer = []

# Start both networks from identical weights so the first bootstrapped
# targets do not come from an unrelated random initialisation.
target_network.set_weights(dqn_model.get_weights())

# NOTE(review): the network has a single output that is used both as the
# bootstrapped value and, in the greedy branch, as the duty cycle itself, and
# the loss multiplies that output by the continuous action. This is kept as
# written; a conventional DQN would use one output per discretised duty-cycle
# level and pick the argmax -- confirm the intended formulation.
for episode in range(num_episodes):
    state = np.asarray(env.reset(), dtype=np.float32)
    done = False

    while not done:
        if np.random.rand() < epsilon:
            raw_action = np.random.uniform(env.action_low, env.action_high)
        else:
            # Calling the model directly avoids predict()'s per-call overhead
            # for a single sample.
            raw_action = dqn_model(state[np.newaxis, :], training=False).numpy()
        # Both branches yield size-1 arrays of different shapes; collapse to a
        # plain float inside the action bounds so the stored action equals the
        # one actually applied and replay batches stack cleanly.
        action = float(
            np.clip(np.squeeze(raw_action), env.action_low[0], env.action_high[0])
        )

        next_state, reward, done, _ = env.step(action)
        next_state = np.asarray(next_state, dtype=np.float32)

        replay_buffer.append((state, action, reward, next_state, done))

        # Advance the episode; without this every transition would start from
        # the reset state.
        state = next_state

        if len(replay_buffer) > batch_size:
            batch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = np.array(states, dtype=np.float32)
            # Column vector so it lines up with the (batch, 1) network output
            # instead of broadcasting to (batch, batch).
            actions = np.array(actions, dtype=np.float32).reshape(-1, 1)
            rewards = np.array(rewards, dtype=np.float32).reshape(-1)
            next_states = np.array(next_states, dtype=np.float32)
            dones = np.array(dones, dtype=np.float32)

            q_values_next = target_network(next_states, training=False).numpy()
            # (1 - dones) zeroes the bootstrap term on terminal transitions.
            targets = rewards + gamma * np.max(q_values_next, axis=1) * (1 - dones)

            with tf.GradientTape() as tape:
                q_values = dqn_model(states)
                loss = loss_fn(targets, tf.reduce_sum(q_values * actions, axis=1))
            gradients = tape.gradient(loss, dqn_model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, dqn_model.trainable_variables))

    # Sync once per `target_update_interval` episodes, after the episode ends,
    # rather than on every step of every matching episode.
    if (episode + 1) % target_update_interval == 0:
        target_network.set_weights(dqn_model.get_weights())

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Step 4: Using the Trained DQN

# Run one episode using only the network output (no exploration).
state = env.reset()
done = False

while not done:
    observation = np.asarray(state, dtype=np.float32)[np.newaxis, :]
    # Direct call instead of predict(): one sample per step does not need
    # predict()'s batching machinery or progress output.
    q_values = dqn_model(observation, training=False).numpy()
    # The network output is unbounded; keep the duty cycle inside the
    # environment's declared action range before applying it.
    action = float(
        np.clip(np.squeeze(q_values), env.action_low[0], env.action_high[0])
    )

    next_state, reward, done, _ = env.step(action)

    state = next_state


# Notebook output recorded at export time: NameError: ignored