<a href="https://colab.research.google.com/github/satyaveer-negi/Colab/blob/main/Buck_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

# Step 1: Environment Setup

class BuckConverterEnvironment:
    """Simplified buck-converter environment with a reset/step interface.

    The state is the triple ``[vout, il, vc]`` and the action is a single
    duty-cycle value bounded by ``action_low`` and ``action_high``.

    NOTE(review): the update equations in ``step`` are a simplified algebraic
    model rather than an integration of the converter dynamics; ``l`` is
    stored but never read, and ``load`` enters the capacitor update as
    ``load - il``. Confirm the intended physics before relying on the
    numerical values.
    """

    def __init__(self, dt=1e-5, vin=24, r=10, l=1e-3, c=10e-6, load=10, max_steps=1000,
                 target_voltage=12, voltage_tolerance=1):
        """Store the model parameters and put the environment in its initial state.

        Args:
            dt: Time step used in the capacitor-voltage update.
            vin: Input voltage.
            r: Resistance used to compute ``il`` from ``vin - vout``.
            l: Inductor value (stored; not used by the current equations).
            c: Capacitor value.
            load: Load value used in the capacitor-voltage update.
            max_steps: Number of steps after which an episode is ended.
            target_voltage: Output voltage checked by ``condition_met``.
            voltage_tolerance: Allowed absolute deviation of ``vout`` from
                ``target_voltage`` before ``condition_met`` reports True.
        """
        self.dt = dt  # Time step
        self.vin = vin  # Input voltage
        self.r = r  # Load resistance
        self.l = l  # Inductor value
        self.c = c  # Capacitor value
        self.load = load  # Load value
        self.max_steps = max_steps
        self.target_voltage = target_voltage
        self.voltage_tolerance = voltage_tolerance

        # Define the state and action space dimensions
        self.state_dim = 3
        self.action_dim = 1
        self.action_low = np.array([0])
        self.action_high = np.array([1])
        self.state_low = np.array([0, 0, 0])
        self.state_high = np.array([vin, vin, vin])

        # Initialize the state, done flag, and step count
        self.state = np.zeros(self.state_dim)
        self.done = False
        self.step_count = 0

    def reset(self):
        """Return the environment to the all-zero state and return that state."""
        self.state = np.zeros(self.state_dim)
        self.done = False
        self.step_count = 0
        return self.state

    def step(self, duty_cycle):
        """Advance the environment by one step.

        Args:
            duty_cycle: Duty cycle to apply. May be a Python number, a 0-d
                array, or any array holding exactly one element (for example
                the shape-(1,) result of sampling between ``action_low`` and
                ``action_high``). The value is clipped to
                ``[action_low, action_high]``.

        Returns:
            A tuple ``(next_state, reward, done, info)`` where ``next_state``
            is a list of three floats ``[vout, il, vc]``, ``reward`` is a
            float, ``done`` is a bool and ``info`` is an empty dict.

        Raises:
            ValueError: If ``duty_cycle`` does not hold exactly one value.
        """
        # Collapse the action to a plain scalar. Doing the arithmetic on a
        # shape-(1,) array would turn the state into shape (3, 1) and the
        # reward into an array, which cannot be stacked with the shape-(3,)
        # state returned by reset().
        duty = np.asarray(duty_cycle, dtype=float)
        if duty.size != 1:
            raise ValueError(
                f"duty_cycle must contain exactly one value, got shape {duty.shape}"
            )
        duty = float(np.clip(duty.reshape(-1)[0], self.action_low[0], self.action_high[0]))

        # Only the previous capacitor voltage feeds into the update.
        vc_prev = float(self.state[2])

        # Calculate the new state values based on the duty cycle
        vout = self.vin * duty
        il = (self.vin - vout) / self.r
        vc = (self.dt / self.c) * (self.load - il) + vc_prev

        # Update the state with the new values
        self.state = np.array([vout, il, vc], dtype=float)

        reward = float(self.calculate_reward(vout, il, vc))

        self.step_count += 1

        # End the episode on the step limit or when condition_met() fires.
        if self.step_count >= self.max_steps or self.condition_met():
            self.done = True

        return self.state.tolist(), reward, self.done, {}

    def calculate_reward(self, vout, il, vc):
        """Return the (non-positive) reward for the given state values.

        The reward is the negated sum of ``|vout - vin|``, ``0.01 * |il|`` and
        ``0.01 * |vc - vin|``.

        NOTE(review): the reward tracks ``vin`` while ``condition_met`` checks
        ``target_voltage``; confirm whether the reward should reference the
        target instead.
        """
        reward = -np.abs(vout - self.vin) - 0.01 * np.abs(il) - 0.01 * np.abs(vc - self.vin)
        return reward

    def condition_met(self):
        """Return True when ``vout`` is more than ``voltage_tolerance`` away from ``target_voltage``."""
        vout = self.state[0]
        return bool(np.abs(vout - self.target_voltage) > self.voltage_tolerance)

# Step 2: Deep Q-Network (DQN) Setup

def create_dqn_model(state_dim, action_dim):
    """Build the fully connected network used for Q-value estimation.

    Args:
        state_dim: Number of input features.
        action_dim: Number of output units.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model with two ReLU layers of
        32 units followed by a linear output layer of ``action_dim`` units.
    """
    hidden_units = 32
    layers = [
        # First layer also declares the input shape.
        Dense(hidden_units, input_shape=(state_dim,), activation='relu'),
        Dense(hidden_units, activation='relu'),
        # No activation on the output layer.
        Dense(action_dim),
    ]
    return tf.keras.Sequential(layers)

# Create target and main DQN models
#
# DQN selects actions with an argmax over Q-values, so it needs a finite
# action set. The duty-cycle range exposed by the environment is therefore
# discretized into evenly spaced levels and each network outputs one Q-value
# per level.
env = BuckConverterEnvironment()
state_dim = env.state_dim
num_actions = 11  # Number of discrete duty-cycle levels
duty_levels = np.linspace(float(env.action_low[0]), float(env.action_high[0]), num_actions)

target_network = create_dqn_model(state_dim, num_actions)
dqn_model = create_dqn_model(state_dim, num_actions)
# Start both networks from identical weights.
target_network.set_weights(dqn_model.get_weights())

# Define the optimizer and loss function for DQN training
optimizer = Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.MeanSquaredError()


def greedy_action_index(model, state):
    """Return the index of the duty-cycle level with the highest Q-value."""
    state_batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
    # Call the model directly: predict() adds per-call overhead that is
    # wasteful for a single sample.
    q_values = model(state_batch, training=False).numpy()
    return int(np.argmax(q_values[0]))


# Step 3: DQN Training

gamma = 0.99  # Discount factor
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995  # Decay rate for exploration rate
epsilon_min = 0.01  # Minimum exploration rate
batch_size = 64  # Mini-batch size
target_update_interval = 10  # Number of episodes between target-network syncs
replay_capacity = 100_000  # Maximum number of stored transitions

replay_buffer = []
replay_index = 0  # Next slot to overwrite once the buffer is full

num_episodes = 1000  # Number of training episodes

for episode in range(num_episodes):
    # Sync the target network once per interval, not on every step of the
    # matching episodes.
    if episode % target_update_interval == 0:
        target_network.set_weights(dqn_model.get_weights())

    state = env.reset()
    done = False

    while not done:
        # Exploration vs. Exploitation: epsilon-greedy over the discrete levels
        if np.random.rand() < epsilon:
            action_index = np.random.randint(num_actions)
        else:
            action_index = greedy_action_index(dqn_model, state)

        # Take a step in the environment with the chosen duty cycle
        next_state, reward, done, _ = env.step(float(duty_levels[action_index]))

        # Store the transition, overwriting the oldest one when full
        transition = (state, action_index, reward, next_state, done)
        if len(replay_buffer) < replay_capacity:
            replay_buffer.append(transition)
        else:
            replay_buffer[replay_index] = transition
        replay_index = (replay_index + 1) % replay_capacity

        if len(replay_buffer) >= batch_size:
            # Perform DQN update using a mini-batch of experiences
            batch = random.sample(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            states = np.asarray(states, dtype=np.float32)
            actions = np.asarray(actions, dtype=np.int32)
            rewards = np.asarray(rewards, dtype=np.float32)
            next_states = np.asarray(next_states, dtype=np.float32)
            dones = np.asarray(dones, dtype=np.float32)

            # Compute the TD targets; terminal transitions do not bootstrap
            q_values_next = target_network(next_states, training=False).numpy()
            targets = rewards + gamma * np.max(q_values_next, axis=1) * (1.0 - dones)

            with tf.GradientTape() as tape:
                # The forward pass must be a direct model call so the tape can
                # record it; predict() returns NumPy arrays with no gradient.
                q_values = dqn_model(states, training=True)
                actions_one_hot = tf.one_hot(actions, depth=num_actions)
                q_values_selected = tf.reduce_sum(q_values * actions_one_hot, axis=1)
                loss = loss_fn(targets, q_values_selected)
            gradients = tape.gradient(loss, dqn_model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, dqn_model.trainable_variables))

        state = next_state

    # Decay exploration rate after each episode
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Step 4: Using the Trained DQN

state = env.reset()
done = False

while not done:
    # Apply the greedy policy of the trained DQN to the environment
    action = float(duty_levels[greedy_action_index(dqn_model, state)])

    next_state, reward, done, _ = env.step(action)

    state = next_state

    # Render the environment
    print(f"State: {state}, Action: {action}, Reward: {reward}")

