In [None]:
!pip install tensorflow numpy matplotlib gym

In [None]:
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
import random
import matplotlib.pyplot as plt

In [None]:
# Define custom traffic signal environment
class TrafficSignalEnv(gym.Env):
    """Minimal demonstration environment backed by a 3x3 grid of uint8 counters.

    Each of the four discrete actions increments one cell of the top-left 2x2
    corner of the grid; the reward of a step is the sum of all cells after the
    update. The dynamics are deliberately simplified for demonstration purposes.

    The class uses the classic Gym calling convention: ``reset()`` returns only
    the observation and ``step()`` returns ``(observation, reward, done, info)``.

    Args:
        max_steps: Number of steps after which ``step()`` reports ``done=True``.
            Pass ``None`` to never terminate; callers must then bound their
            own interaction loop.
    """

    # Upper bound of the observation space; counters saturate here.
    _MAX_COUNT = 255

    def __init__(self, max_steps=200):
        super(TrafficSignalEnv, self).__init__()
        # Define state space
        self.observation_space = gym.spaces.Box(low=0, high=self._MAX_COUNT, shape=(3, 3), dtype=np.uint8)
        # Define action space
        self.action_space = gym.spaces.Discrete(4)
        # Episode length limit and the number of steps taken since the last reset
        self.max_steps = max_steps
        self.steps_taken = 0
        # Initialize state
        self.state = np.zeros((3, 3), dtype=np.uint8)

    def reset(self):
        """Zero the grid and the step counter; return a copy of the initial observation."""
        self.state = np.zeros((3, 3), dtype=np.uint8)
        self.steps_taken = 0
        # Return a copy so later in-place updates cannot rewrite the caller's array.
        return self.state.copy()

    def _increment(self, row, col):
        """Add one to a grid cell, saturating at the observation-space maximum."""
        # Plain uint8 arithmetic would wrap 255 -> 0 and make the reward collapse.
        if self.state[row, col] < self._MAX_COUNT:
            self.state[row, col] += 1

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Actions 0-3 increment cells (0,0), (0,1), (1,0) and (1,1) respectively;
        any other value leaves the grid unchanged. ``done`` becomes ``True``
        once ``max_steps`` steps have been taken since the last ``reset()``.
        """
        # Update state based on action (simplified for demonstration purposes)
        if action == 0:
            self._increment(0, 0)
        elif action == 1:
            self._increment(0, 1)
        elif action == 2:
            self._increment(1, 0)
        elif action == 3:
            self._increment(1, 1)

        self.steps_taken += 1
        reward = np.sum(self.state)
        done = self.max_steps is not None and self.steps_taken >= self.max_steps
        info = {}
        # Hand out a snapshot: the internal array keeps being mutated in place,
        # so returning it directly would make every stored transition alias
        # the same (latest) state.
        return self.state.copy(), reward, done, info

In [None]:
# DQN agent
class DQNAgent:
    """Small DQN-style learner: a dense Q-network plus a bounded replay memory.

    Args:
        state_shape: Shape of a single observation (used as the network input shape).
        action_space: Discrete action space; only its ``n`` attribute is read.
        learning_rate: Step size passed to the Adam optimizer.
        epsilon: Exploration rate. Stored on the instance; no method of this
            class reads it.
        epsilon_decay: Decay factor for ``epsilon``. Stored on the instance; no
            method of this class reads it.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self, state_shape, action_space, learning_rate=0.001, epsilon=1.0, epsilon_decay=0.995, gamma=0.99):
        self.state_shape = state_shape
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.model = self.build_model()
        # Oldest transitions are dropped automatically once 1000 are stored.
        self.memory = deque(maxlen=1000)

    def build_model(self):
        """Build and compile the Q-network: Flatten -> Dense(24) -> Dense(24) -> one output per action."""
        model = Sequential([
            Flatten(input_shape=self.state_shape),
            Dense(24, activation='relu'),
            Dense(24, activation='relu'),
            Dense(self.action_space.n)
        ])
        # Use the Adam class imported at the top of the notebook; the name `tf`
        # is never imported, so `tf.keras.optimizers.Adam` raised NameError.
        optimizer = Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer, loss='mse')
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Observations are copied so that an environment which mutates and
        re-returns the same array cannot rewrite transitions already stored.
        """
        self.memory.append((np.array(state), action, reward, np.array(next_state), done))

    def replay(self, batch_size):
        """Fit the Q-network on one random minibatch of stored transitions.

        Does nothing when ``batch_size`` is not positive or when fewer than
        ``batch_size`` transitions have been stored. For each sampled
        transition the target of the taken action is ``reward`` if the
        transition is terminal, otherwise
        ``reward + gamma * max_a Q(next_state, a)``; targets of the other
        actions are the network's current predictions.
        """
        if batch_size <= 0 or len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = np.array(states, dtype=np.float32)
        next_states = np.array(next_states, dtype=np.float32)
        actions = np.array(actions, dtype=np.intp)
        rewards = np.array(rewards, dtype=np.float32)
        dones = np.array(dones, dtype=bool)

        # Two batched forward passes instead of two per sample; verbose=0 keeps
        # Keras from printing a progress bar on every call.
        next_q = self.model.predict(next_states, verbose=0)
        targets = self.model.predict(states, verbose=0)

        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions have no future value to bootstrap from.
        td_targets = np.where(dones, rewards, bootstrapped)
        # targets keeps shape (batch, n_actions) so it matches the network output.
        targets[np.arange(batch_size), actions] = td_targets

        self.model.fit(states, targets, epochs=1, verbose=0)

In [None]:
# Create environment and agent
# The agent's network input and output sizes are taken from the environment's spaces.
env = TrafficSignalEnv()
agent = DQNAgent(
    state_shape=env.observation_space.shape,
    action_space=env.action_space,
)

In [None]:
# Train agent
num_episodes = 1
batch_size = 1
# Hard cap on steps per episode: without it the loop runs forever whenever the
# environment never reports `done`.
max_steps_per_episode = 200
rewards = []
for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    for _step in range(max_steps_per_episode):
        action = np.random.randint(0, env.action_space.n)  # Random action for demonstration
        next_state, reward, done, _ = env.step(action)
        # Store the transition, then train on a sampled minibatch.
        agent.remember(state, action, reward, next_state, done)
        agent.replay(batch_size)
        state = next_state
        total_reward += reward
        if done:
            break
    rewards.append(total_reward)
    print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

Streaming output truncated to the last 5000 lines.


In [None]:
# Plotting rewards over episodes
# Derive the x axis from `rewards` itself so both sequences always have the same
# length, even if training was interrupted before `num_episodes` completed.
episodes = range(1, len(rewards) + 1)
# The marker keeps a single-episode run visible (a one-point line draws nothing).
plt.plot(episodes, rewards, marker='o')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode')
plt.grid(True)
plt.show()