In [None]:
# Jupyter Cell 1: Imports and Setup (Updated)
import pygame
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
import math
from collections import deque # Efficient queue for replay buffer
import matplotlib.pyplot as plt
import time
import os

# --- Pygame Initialization ---
# Pygame is brought up here even though training does not render: the evaluation
# cell later in the notebook relies on PYGAME_INITIALIZED and FONT.
try:
    pygame.init()
    pygame.font.init()
    FONT = pygame.font.SysFont("Arial", 20)
except pygame.error as init_error:
    # Record the failure instead of aborting the cell so headless training can continue.
    print(f"Pygame initialization failed: {init_error}")
    PYGAME_INITIALIZED = False
    FONT = None
else:
    print("Pygame initialized successfully.")
    PYGAME_INITIALIZED = True

# --- Device Selection (GPU/CPU) ---
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"CUDA available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available. Using CPU.")

# --- Configuration / Hyperparameters ---
# Simulation: window size and colours (CHECKPOINT_COLOR carries a fourth, alpha, component).
WIDTH, HEIGHT = 1000, 700
TRACK_COLOR = (100, 100, 100)
GRASS_COLOR = (0, 150, 0)
CAR_COLOR = (255, 0, 0)
SENSOR_COLOR = (0, 255, 255)
CHECKPOINT_COLOR = (255, 255, 0, 100)
OBSTACLE_COLOR = (0, 0, 255)

# Car geometry and motion limits.
CAR_WIDTH, CAR_HEIGHT = 15, 30
MAX_SPEED = 7
MIN_SPEED = 0
ACCELERATION = 0.15
BRAKE_DECEL = 0.3
FRICTION = 0.04
STEERING_ANGLE = 10

# Distance sensors.
NUM_SENSORS = 9
SENSOR_RANGE = 180

# RL Agent (DQN)
STATE_SIZE = NUM_SENSORS + 1  # one reading per sensor plus one extra feature
ACTION_SIZE = 4
HIDDEN_SIZE = 256
LEARNING_RATE = 0.00025  # Slightly lower learning rate
GAMMA = 0.99
EPSILON_START = 1.0
EPSILON_END = 0.01

# Slower epsilon decay.
# Epsilon is multiplied by EPSILON_DECAY once per training step, so the number of
# steps N needed to go from EPSILON_START to EPSILON_END satisfies
#     0.01 = 1.0 * decay**N   =>   N = log(0.01) / log(decay)
# Example: N = 800000 steps (1000 episodes * 800 steps each) needs decay ~= 0.999994.
# The value below gives N ~= 460k steps, i.e. roughly 500k steps
# (perhaps 600-700 episodes on average).
EPSILON_DECAY = 0.99999  # Much slower decay per training step

BUFFER_SIZE = 100000
BATCH_SIZE = 64
TARGET_UPDATE_FREQ = 10  # Update target network every N episodes

# Training
NUM_EPISODES = 1000  # Keep 1000, might need more
MAX_STEPS_PER_EPISODE = 2000

# Visualization & Saving
# Rendering during training is disabled: 0 means "never render in the training loop".
RENDER_EVERY_N_EPISODES = 0
PLOT_UPDATE_FREQ = 50
MODEL_SAVE_FREQ = 50
MODEL_SAVE_DIR = "dqn_car_models_complex_duel"  # New directory
PLOT_SAVE_DIR = "dqn_car_plots_complex_duel"    # New directory

# Create the output directories up front; exist_ok makes re-running this cell harmless.
for _out_dir in (MODEL_SAVE_DIR, PLOT_SAVE_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Echo the effective configuration so it is visible in the cell output.
print(f"State Size: {STATE_SIZE}, Action Size: {ACTION_SIZE}")
print(f"Using device: {device}")
print(f"Rendering during training: {'Enabled' if RENDER_EVERY_N_EPISODES > 0 else 'DISABLED'}")
print(f"Epsilon decay rate: {EPSILON_DECAY} (per training step)")

In [None]:
# Jupyter Cell 2: Track Definition (Unchanged)

# Track boundaries: two closed polygons given as (x, y) vertex lists.
# The last vertex of each list connects back to the first.
OUTER_TRACK_POINTS = [
    (50, 150),
    (150, 50),
    (400, 50),
    (550, 100),
    (700, 100),
    (850, 50),
    (950, 150),
    (950, 550),
    (850, 650),
    (700, 600),
    (550, 600),
    (400, 650),
    (150, 650),
    (50, 550),
]

INNER_TRACK_POINTS = [
    (150, 250),
    (250, 150),
    (400, 150),
    (500, 200),
    (650, 200),
    (750, 150),
    (850, 250),
    (850, 450),
    (750, 550),
    (650, 500),
    (500, 500),
    (400, 550),
    (250, 550),
    (150, 450),
]

# Static obstacles, each a pygame.Rect(left, top, width, height).
OBSTACLES = [
    pygame.Rect(300, 300, 20, 100),  # Vertical obstacle
    pygame.Rect(600, 400, 100, 20),  # Horizontal obstacle
    pygame.Rect(700, 200, 50, 50),   # Square obstacle near a turn
]

# Checkpoints for this track; the list index is the checkpoint number.
chk_width = 150
chk_height = 10
CHECKPOINTS = [
    pygame.Rect(150, 50, chk_width, chk_height + 100),                            # 0
    pygame.Rect(550, 50, chk_height + 50, chk_width),                             # 1
    pygame.Rect(950 - chk_height - 100, 250, chk_height, chk_width),              # 2
    pygame.Rect(850 - chk_width, 650 - chk_height - 100, chk_width, chk_height),  # 3
    pygame.Rect(150, 500, chk_height, chk_width),                                 # 4
    pygame.Rect(50 + 100, 200, chk_height, chk_width),                            # 5
]

# Starting pose of the car on this track.
START_POS_X, START_POS_Y = 100, 200  # Start position from previous fix
START_ANGLE = 0

# --- Helper function for line segment intersection (Unchanged) ---
def line_intersection(p1, p2, p3, p4):
    """Return the intersection point of segments p1-p2 and p3-p4, or None.

    Each argument is an (x, y) pair. The segments are written parametrically as
    p1 + t*(p2 - p1) and p3 + u*(p4 - p3); they intersect when both t and u
    lie in [0, 1] (endpoints included).

    Returns:
        (x, y) tuple of the intersection point, or None when the segments are
        parallel/collinear (zero denominator) or do not touch.
    """
    (x1, y1), (x2, y2) = p1, p2
    (x3, y3), (x4, y4) = p3, p4

    den = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
    if den == 0:
        # Parallel or collinear segments: no single intersection point is reported.
        return None

    t = ((x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4)) / den
    u = -((x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)) / den
    if not (0 <= t <= 1 and 0 <= u <= 1):
        return None
    return (x1 + t * (x2 - x1), y1 + t * (y2 - y1))

# Summary output for this cell.
print("Complex track, obstacles, and checkpoints defined.")
print(f"Start Position: ({START_POS_X}, {START_POS_Y})")
# PYGAME_INITIALIZED is set by an earlier cell; the counts are only printed when it is truthy.
if PYGAME_INITIALIZED:
    print(f"Number of checkpoints: {len(CHECKPOINTS)}")
    print(f"Number of obstacles: {len(OBSTACLES)}")

In [None]:
# Jupyter Cell 3: Car Environment Class (Updated Reward)
# Shapely is optional (pip install shapely): CarEnv only uses it for an extra
# point-in-polygon check and leaves its polygons as None when they cannot be built.
# Bind the name to None when the package is missing so this cell still runs.
try:
    import shapely.geometry
except ImportError:
    shapely = None

class CarEnv:
    """2D top-down driving environment with ray sensors, checkpoints and obstacles.

    State:  NUM_SENSORS ray distances divided by SENSOR_RANGE, followed by the
            car speed divided by MAX_SPEED (float32 array).
    Actions: 0 = accelerate, 1 = brake, 2 = steer by -STEERING_ANGLE,
             3 = steer by +STEERING_ANGLE.
    An episode ends on any collision or after MAX_STEPS_PER_EPISODE steps.
    """

    def __init__(self, render_mode='human'):
        """Build the collision geometry and, in 'human' mode, open the Pygame window.

        Args:
            render_mode: 'human' opens a window (only if Pygame initialised);
                any other value keeps the environment headless and makes
                render() a no-op.
        """
        self.render_mode = render_mode
        self.screen = None
        self.clock = None
        # Only create a window when rendering was explicitly requested for this instance.
        if self.render_mode == 'human' and PYGAME_INITIALIZED:
            try:
                self.screen = pygame.display.set_mode((WIDTH, HEIGHT))
                pygame.display.set_caption("2D Self-Driving Car Sim - Complex")
                self.clock = pygame.time.Clock()
            except pygame.error as e:
                # Keep render_mode as requested; render() checks self.screen itself.
                print(f"Error creating Pygame window in CarEnv: {e}")
                self.screen = None
                self.clock = None

        # --- Collision geometry must exist before reset() casts the first rays ---
        self.track_segments = []
        self.obstacle_segments = []
        self.all_collision_segments = []
        self.outer_polygon = None  # Optional Shapely polygon of the outer boundary
        self.inner_polygon = None  # Optional Shapely polygon of the inner boundary

        if 'OUTER_TRACK_POINTS' in globals() and 'INNER_TRACK_POINTS' in globals():
            # Each boundary is a closed loop: the last vertex connects back to the first.
            for boundary in (OUTER_TRACK_POINTS, INNER_TRACK_POINTS):
                for i in range(len(boundary)):
                    self.track_segments.append((boundary[i], boundary[(i + 1) % len(boundary)]))
            self.all_collision_segments.extend(self.track_segments)
            try:
                self.outer_polygon = shapely.geometry.Polygon(OUTER_TRACK_POINTS)
                self.inner_polygon = shapely.geometry.Polygon(INNER_TRACK_POINTS)
            except Exception as e:
                # Best effort: the polygons only back an extra containment check.
                self.outer_polygon = None
                self.inner_polygon = None
                print(f"Shapely track polygons unavailable ({e}); using edge-intersection collision checks only.")
        else:
            print("Error: Track points not found.")

        if 'OBSTACLES' in globals():
            self.obstacles_rects = OBSTACLES
            for obs_rect in self.obstacles_rects:
                tl, tr = obs_rect.topleft, obs_rect.topright
                bl, br = obs_rect.bottomleft, obs_rect.bottomright
                self.obstacle_segments.extend([(tl, tr), (tr, br), (br, bl), (bl, tl)])
            self.all_collision_segments.extend(self.obstacle_segments)
        else:
            self.obstacles_rects = []

        self.car_rect = None
        self.last_action = None
        # reset() also initialises the checkpoint bookkeeping.
        self.reset()

    def reset(self):
        """Put the car back at the start pose, clear episode state, return the first state."""
        self.car_x, self.car_y = START_POS_X, START_POS_Y
        self.car_angle = START_ANGLE
        self.car_speed = 0.0
        if self.car_rect is None:
            self.car_rect = pygame.Rect(0, 0, CAR_WIDTH, CAR_HEIGHT)
        self.car_rect.center = (self.car_x, self.car_y)
        self.steps_taken = 0
        self.total_reward = 0
        self.done = False
        self.last_action = None
        self.checkpoints_passed = set()
        self.last_checkpoint_index = -1
        return self._get_state()

    def _get_state(self):
        """Return the observation: normalised sensor distances followed by normalised speed."""
        sensor_readings = self._cast_rays()
        normalized_sensors = [dist / SENSOR_RANGE for dist in sensor_readings]
        normalized_speed = self.car_speed / MAX_SPEED if MAX_SPEED != 0 else 0
        return np.array(normalized_sensors + [normalized_speed], dtype=np.float32)

    def _cast_rays(self):
        """Return, per sensor, the distance to the nearest collision segment.

        Rays fan out over 180 degrees centred on the heading (from -90 to +90
        relative to car_angle). A ray that hits nothing reports SENSOR_RANGE.
        """
        distances = []
        car_center_x, car_center_y = self.car_x, self.car_y
        angle_step = 180.0 / (NUM_SENSORS - 1) if NUM_SENSORS > 1 else 0
        base_relative_angle = -90
        for i in range(NUM_SENSORS):
            relative_angle = base_relative_angle + (i * angle_step)
            ray_angle_rad = math.radians(self.car_angle + relative_angle)
            end_x = car_center_x + math.cos(ray_angle_rad) * SENSOR_RANGE
            end_y = car_center_y + math.sin(ray_angle_rad) * SENSOR_RANGE
            ray_segment = ((car_center_x, car_center_y), (end_x, end_y))
            min_dist = SENSOR_RANGE
            if hasattr(self, 'all_collision_segments'):
                for segment in self.all_collision_segments:
                    intersection_point = line_intersection(ray_segment[0], ray_segment[1], segment[0], segment[1])
                    if intersection_point:
                        dist = math.dist((car_center_x, car_center_y), intersection_point)
                        if dist < min_dist:
                            min_dist = dist
            distances.append(min_dist)
        return distances

    def _update_physics(self, action):
        """Apply one action: update speed (friction, then throttle/brake), heading and position."""
        steer = 0
        acceleration = 0
        if action == 0:
            acceleration = ACCELERATION
        elif action == 1:
            acceleration = -BRAKE_DECEL
        elif action == 2:
            steer = -STEERING_ANGLE
        elif action == 3:
            steer = STEERING_ANGLE

        # Friction first, snapping very small speeds to zero, then the commanded change.
        if self.car_speed > 0:
            self.car_speed -= FRICTION
        if abs(self.car_speed) < FRICTION:
            self.car_speed = 0
        self.car_speed += acceleration
        self.car_speed = max(MIN_SPEED, min(MAX_SPEED, self.car_speed))

        # Steering only takes effect while the car is actually moving.
        if abs(self.car_speed) > 0.05:
            self.car_angle += steer
        self.car_angle %= 360

        angle_rad = math.radians(self.car_angle)
        self.car_x += math.cos(angle_rad) * self.car_speed
        self.car_y += math.sin(angle_rad) * self.car_speed
        if self.car_rect is not None:
            self.car_rect.center = (self.car_x, self.car_y)

    def _check_collision(self):
        """Return 'obstacle', 'track' or None for the car's current pose.

        The car is modelled as a rectangle rotated by car_angle. Its long side
        (CAR_HEIGHT) lies along the heading, matching how render() draws the
        car; previously the short side was placed along the heading, so the
        collision box was perpendicular to the visible car.
        """
        if self.car_rect is None:
            return None
        center_x, center_y = self.car_rect.center
        angle_rad = math.radians(self.car_angle)
        cos_a, sin_a = math.cos(angle_rad), math.sin(angle_rad)
        # Local x is the heading direction, local y is sideways.
        half_len, half_wid = CAR_HEIGHT / 2, CAR_WIDTH / 2
        relative_corners = [(-half_len, -half_wid), (half_len, -half_wid), (-half_len, half_wid), (half_len, half_wid)]
        corners = [
            (center_x + rx * cos_a - ry * sin_a, center_y + rx * sin_a + ry * cos_a)
            for rx, ry in relative_corners
        ]
        car_edges = [
            (corners[0], corners[1]),
            (corners[1], corners[3]),
            (corners[3], corners[2]),
            (corners[2], corners[0]),
        ]

        # Obstacles are tested before track edges, so 'obstacle' wins when both are hit.
        if hasattr(self, 'obstacle_segments'):
            for car_edge in car_edges:
                for obs_seg in self.obstacle_segments:
                    if line_intersection(car_edge[0], car_edge[1], obs_seg[0], obs_seg[1]):
                        return 'obstacle'
        if hasattr(self, 'track_segments'):
            for car_edge in car_edges:
                for track_seg in self.track_segments:
                    if line_intersection(car_edge[0], car_edge[1], track_seg[0], track_seg[1]):
                        return 'track'
        # Optional Shapely check: catches a car whose centre has left the track
        # without any body edge crossing a boundary segment on this step.
        if self.outer_polygon is not None and self.inner_polygon is not None:
            car_center_point = shapely.geometry.Point(self.car_x, self.car_y)
            if not self.outer_polygon.contains(car_center_point) or self.inner_polygon.contains(car_center_point):
                return 'track'
        return None

    def _calculate_reward(self, collision_type, action):
        """Compute the step reward and set self.done on a collision.

        Collisions end the episode (-150 obstacle, -100 track). Otherwise the
        reward is: +60 for reaching the next checkpoint in order, +250 when the
        last checkpoint of a lap is reached, +0.1 * speed, a proximity penalty
        when the nearest sensor reading is below 40% of CAR_HEIGHT, and a
        constant -0.1 time penalty.
        """
        reward = 0
        self.done = False
        if collision_type == 'obstacle':
            self.done = True
            return -150
        elif collision_type == 'track':
            self.done = True
            return -100

        checkpoint_reward = 0
        current_checkpoint_index = -1
        car_center_tuple = (self.car_x, self.car_y)
        num_checkpoints = len(CHECKPOINTS) if 'CHECKPOINTS' in globals() else 0
        if num_checkpoints > 0:
            # Find the first checkpoint rectangle containing the car centre, if any.
            for i, chkpt_rect in enumerate(CHECKPOINTS):
                if isinstance(chkpt_rect, pygame.Rect) and chkpt_rect.collidepoint(car_center_tuple):
                    current_checkpoint_index = i
                    break
            if current_checkpoint_index != -1:
                # Only the next checkpoint in sequence counts, and only once per lap.
                expected_next_checkpoint = (self.last_checkpoint_index + 1) % num_checkpoints
                if current_checkpoint_index == expected_next_checkpoint:
                    if current_checkpoint_index not in self.checkpoints_passed:
                        checkpoint_reward = 60
                        self.checkpoints_passed.add(current_checkpoint_index)
                        self.last_checkpoint_index = current_checkpoint_index
                        if len(self.checkpoints_passed) == num_checkpoints:
                            reward += 250
                            self.checkpoints_passed = set()
                            self.last_checkpoint_index = -1
                            print(f"Lap Completed! Bonus: +250")  # Keep this print for feedback

        speed_reward = self.car_speed * 0.1

        # --- Reward shaping: proximity penalty ---
        # Read the raw ray distances directly rather than un-normalising the
        # float32 state vector; this keeps the reward a plain Python float.
        proximity_penalty = 0
        min_sensor_dist = min(self._cast_rays())
        prox_threshold = CAR_HEIGHT * 0.4
        if min_sensor_dist < prox_threshold:
            # Quadratic ramp from 0 at the threshold to -0.8 at zero distance.
            proximity_penalty = -0.8 * (1 - (min_sensor_dist / prox_threshold)) ** 2

        time_penalty = -0.1  # Keep encouraging efficiency
        reward += checkpoint_reward + speed_reward + proximity_penalty + time_penalty
        self.last_action = action
        return reward

    def step(self, action):
        """Advance the simulation by one action.

        Returns:
            (next_state, reward, done, info), where info['collision'] is
            'obstacle', 'track' or None. Calling step() on a finished episode
            returns the current state with reward 0 and an empty info dict.
        """
        if self.done:
            return self._get_state(), 0, self.done, {}
        self._update_physics(action)
        collision_type = self._check_collision()
        reward = self._calculate_reward(collision_type, action)
        self.steps_taken += 1
        # Apply the timeout penalty before accumulating so that total_reward
        # matches the sum of the rewards actually returned to the caller.
        if not self.done and self.steps_taken >= MAX_STEPS_PER_EPISODE:
            self.done = True
            reward -= 20
        self.total_reward += reward
        next_state = self._get_state()
        info = {'collision': collision_type}
        return next_state, reward, self.done, info

    def render(self):
        """Draw the current frame (evaluation only).

        Returns:
            True when the frame was drawn or rendering is disabled; False when
            the window was closed by the user or a Pygame error occurred.
        """
        if self.render_mode != 'human' or not PYGAME_INITIALIZED or self.screen is None:
            return True
        try:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    self.close()
                    return False
        except pygame.error as e:
            print(f"Pygame event error: {e}")
            self.close()
            return False

        # Background, track surface, obstacles and translucent checkpoints.
        self.screen.fill(GRASS_COLOR)
        if 'OUTER_TRACK_POINTS' in globals() and 'INNER_TRACK_POINTS' in globals():
            pygame.draw.polygon(self.screen, TRACK_COLOR, OUTER_TRACK_POINTS)
            pygame.draw.polygon(self.screen, GRASS_COLOR, INNER_TRACK_POINTS)
        if hasattr(self, 'obstacles_rects'):
            for obs_rect in self.obstacles_rects:
                pygame.draw.rect(self.screen, OBSTACLE_COLOR, obs_rect)
        if 'CHECKPOINTS' in globals() and isinstance(CHECKPOINTS, (list, tuple)):
            for chkpt_rect in CHECKPOINTS:
                if isinstance(chkpt_rect, pygame.Rect):
                    overlay = pygame.Surface((chkpt_rect.width, chkpt_rect.height), pygame.SRCALPHA)
                    overlay.fill(CHECKPOINT_COLOR)
                    self.screen.blit(overlay, (chkpt_rect.x, chkpt_rect.y))

        # Car: drawn with its long side along local x, then rotated to the heading.
        if self.car_rect is not None:
            car_surf = pygame.Surface((CAR_HEIGHT, CAR_WIDTH), pygame.SRCALPHA)
            car_surf.fill(CAR_COLOR)
            # White patch marking the front of the car.
            pygame.draw.rect(car_surf, (255, 255, 255), (CAR_HEIGHT*0.7, CAR_WIDTH*0.2, CAR_HEIGHT*0.3, CAR_WIDTH*0.6))
            rot_car = pygame.transform.rotate(car_surf, -self.car_angle)
            n_rect = rot_car.get_rect(center=(self.car_x, self.car_y))
            self.screen.blit(rot_car, n_rect.topleft)

        # Sensor rays: reuse _cast_rays() so the drawing always matches the state.
        car_center_x, car_center_y = self.car_x, self.car_y
        angle_step = 180.0 / (NUM_SENSORS - 1) if NUM_SENSORS > 1 else 0
        base_relative_angle = -90
        if hasattr(self, 'all_collision_segments'):
            for i, min_dist in enumerate(self._cast_rays()):
                relative_angle = base_relative_angle + (i * angle_step)
                ray_angle_rad = math.radians(self.car_angle + relative_angle)
                hit_point = (
                    car_center_x + math.cos(ray_angle_rad) * min_dist,
                    car_center_y + math.sin(ray_angle_rad) * min_dist,
                )
                pygame.draw.line(self.screen, SENSOR_COLOR, (car_center_x, car_center_y), hit_point, 1)
                # Mark the end point only when the ray stopped short of its full range.
                if min_dist < SENSOR_RANGE * 0.999:
                    pygame.draw.circle(self.screen, (255, 255, 0), (int(hit_point[0]), int(hit_point[1])), 3)

        # Heads-up text.
        if FONT:
            num_chkpts_total = len(CHECKPOINTS) if 'CHECKPOINTS' in globals() else 'N/A'
            info_txt = f"Speed:{self.car_speed:.1f}|Steps:{self.steps_taken}|Ep Reward:{self.total_reward:.1f}"
            chkpt_txt = f"Checkpoints:{len(self.checkpoints_passed)}/{num_chkpts_total}|Last:{self.last_checkpoint_index}"
            s1 = FONT.render(info_txt, True, (255, 255, 255), (0, 0, 0))
            self.screen.blit(s1, (10, 10))
            s2 = FONT.render(chkpt_txt, True, (255, 255, 255), (0, 0, 0))
            self.screen.blit(s2, (10, 35))

        try:
            pygame.display.flip()
        except pygame.error as e:
            print(f"Pygame flip error: {e}")
            self.close()
            return False
        if self.clock:
            self.clock.tick(60)
        return True

    def close(self):
        """Close the Pygame display if this environment opened one."""
        if self.screen is not None:
            try:
                pygame.display.quit()
            except pygame.error as e:
                print(f"Error closing Pygame display: {e}")
            self.screen = None
            print("Pygame display quit.")

# --- Test block (optional) ---
# No environment smoke test is included in this cell. If you add one, note that it
# only opens a window when the environment is created with render_mode='human'.


In [None]:
# Jupyter Cell 4: DQN Agent (Updated Network - Dueling DQN)

# --- Q-Network Definition (Dueling Architecture) ---
class QNetwork(nn.Module):
    """Dueling Q-network: a shared trunk feeding separate value and advantage heads."""

    def __init__(self, state_size, action_size, hidden_size=HIDDEN_SIZE):
        """Create the layers.

        Args:
            state_size: number of input features.
            action_size: number of discrete actions (width of the advantage head).
            hidden_size: width of the shared layers; each stream uses half of it.
        """
        super().__init__()
        self.action_size = action_size
        stream_width = hidden_size // 2

        # Shared trunk
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        # State-value stream: outputs a single V(s)
        self.fc_value = nn.Linear(hidden_size, stream_width)
        self.out_value = nn.Linear(stream_width, 1)

        # Advantage stream: outputs A(s, a) for every action
        self.fc_advantage = nn.Linear(hidden_size, stream_width)
        self.out_advantage = nn.Linear(stream_width, action_size)

    def forward(self, state):
        """Return Q-values for every action; the last dimension indexes actions."""
        trunk = F.relu(self.fc2(F.relu(self.fc1(state))))

        value = self.out_value(F.relu(self.fc_value(trunk)))                # V(s)
        advantage = self.out_advantage(F.relu(self.fc_advantage(trunk)))    # A(s, a)

        # Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')).
        # Subtracting the mean centres the advantages at zero for each state.
        centred_advantage = advantage - advantage.mean(dim=-1, keepdim=True)
        return value + centred_advantage

# --- Replay Buffer (Unchanged) ---
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity=BUFFER_SIZE):
        # A deque with maxlen discards the oldest transition once it is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size=BATCH_SIZE):
        """Return batch_size distinct transitions chosen at random (as a list)."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

# --- DQN Agent (Class structure unchanged, uses new QNetwork) ---
class DQNAgent:
    """Double-DQN agent with an epsilon-greedy policy and a replay buffer.

    Holds a policy network (trained) and a target network (periodically synced
    via update_target_network()). Both are built from QNetwork.
    """

    def __init__(
        self,
        state_size=STATE_SIZE,
        action_size=ACTION_SIZE,
        hidden_size=HIDDEN_SIZE,
        lr=LEARNING_RATE,
        gamma=GAMMA,
        epsilon_start=EPSILON_START,
        epsilon_end=EPSILON_END,
        epsilon_decay=EPSILON_DECAY,
        buffer_size=BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        target_update_freq=TARGET_UPDATE_FREQ,
    ):
        """Create the networks, optimizer and replay memory.

        Args:
            state_size / action_size / hidden_size: network dimensions.
            lr: Adam learning rate.
            gamma: discount factor.
            epsilon_start / epsilon_end: initial and minimum exploration rate.
            epsilon_decay: multiplicative decay applied once per replay() update.
            buffer_size: replay buffer capacity.
            batch_size: number of transitions per update.
            target_update_freq: stored for the training loop; not used internally.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq

        self.policy_net = QNetwork(state_size, action_size, hidden_size).to(device)
        self.target_net = QNetwork(state_size, action_size, hidden_size).to(device)
        # The target network starts as an exact copy and is never trained directly.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.memory = ReplayBuffer(buffer_size)

    def act(self, state):
        """Pick an action epsilon-greedily for a single state vector.

        Returns:
            int action index: random with probability epsilon, otherwise the
            argmax of the policy network's Q-values.
        """
        # Strict '<' so that epsilon == 0 never explores (random.random() can return 0.0).
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()
        return torch.argmax(action_values).item()

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.push(state, action, reward, next_state, done)

    def replay(self):
        """Run one Double-DQN update on a sampled batch and decay epsilon.

        Returns:
            The batch MSE loss as a float, or None when the buffer holds fewer
            than batch_size transitions (no update and no epsilon decay then).
        """
        if len(self.memory) < self.batch_size:
            return None
        experiences = self.memory.sample(self.batch_size)
        batch = list(zip(*experiences))
        states = torch.FloatTensor(np.array(batch[0])).to(device)
        actions = torch.LongTensor(np.array(batch[1])).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(np.array(batch[2])).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(np.array(batch[3])).to(device)
        dones = torch.FloatTensor(np.array(batch[4])).unsqueeze(1).to(device)

        # Double DQN target: the policy net selects the next action, the target
        # net evaluates it. The target is a constant for this update, so it is
        # built without autograd; this avoids a needless backward pass through
        # both networks and stops gradients piling up on the target network.
        with torch.no_grad():
            next_actions_policy = self.policy_net(next_states).max(1)[1].unsqueeze(1)
            next_q_values_target = self.target_net(next_states).gather(1, next_actions_policy)
            # (1 - dones) removes the bootstrap term for terminal transitions.
            target_q_values = rewards + (self.gamma * next_q_values_target * (1 - dones))

        current_q_values = self.policy_net(states).gather(1, actions)
        loss = F.mse_loss(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        # Optional: torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        # Decay epsilon once per update, clamped so it never drops below the minimum.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        return loss.item()

    def update_target_network(self):
        """Copy the policy network's weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def load(self, filename):
        """Load policy weights from filename and mirror them into the target network.

        Failures are printed rather than raised.
        """
        try:
            self.policy_net.load_state_dict(torch.load(filename, map_location=device))
            self.target_net.load_state_dict(self.policy_net.state_dict())
            print(f"Model weights loaded from {filename} to {device}")
        except Exception as e:
            print(f"Error loading model weights from {filename}: {e}")

    def save(self, filename):
        """Write the policy network's state_dict to filename."""
        torch.save(self.policy_net.state_dict(), filename)

# --- Test the Agent (optional) ---
# Smoke test: build an agent with the default hyperparameters, ask it for one
# action on a random state, and store a single dummy transition.
# Note: this leaves `agent`, `dummy_state` and `action` defined at notebook level.
agent = DQNAgent() # Uses updated defaults
print("Dueling DQN Agent created.")
dummy_state = np.random.rand(STATE_SIZE).astype(np.float32)
action = agent.act(dummy_state)
print(f"Agent chose action: {action} (Epsilon: {agent.epsilon:.5f})") # Show more precision for slow decay
# The dummy transition uses the same state as both `state` and `next_state`.
agent.remember(dummy_state, action, 1.0, dummy_state, False)
print(f"Memory size: {len(agent.memory)}")

In [None]:
# Jupyter Cell 5: Training Loop (Updated - No Rendering)

# plot_rewards function remains the same as in the previous response
# Ensure it's defined here or in a previous cell
def plot_rewards(episode_rewards_list, losses_list, avg_rewards_list, filename="training_progress.png", save_dir=PLOT_SAVE_DIR):
    """Save a two-panel training figure: episode rewards (left) and batch loss (right).

    Args:
        episode_rewards_list: total reward per episode.
        losses_list: loss per training batch; None and NaN entries are ignored.
        avg_rewards_list: running average reward per episode.
        filename: name of the image file to write.
        save_dir: directory the image is written to.

    Returns:
        None. Prints the save path, or an error message if saving fails.
        Nothing is plotted when episode_rewards_list is empty.
    """
    if not episode_rewards_list:
        print("No data to plot.")
        return

    fig, (ax_reward, ax_loss) = plt.subplots(1, 2, figsize=(14, 6))
    try:
        # --- Rewards panel ---
        ax_reward.plot(episode_rewards_list, label='Episode Reward', alpha=0.7)
        ax_reward.plot(avg_rewards_list, label='Avg Reward (Last 100)', linewidth=2, color='orange')
        ax_reward.set_xlabel("Episode")
        ax_reward.set_ylabel("Reward")
        ax_reward.set_title("Episode Rewards")
        ax_reward.legend()
        ax_reward.grid(True)

        # Clip the y-range to the 5th-95th percentile (plus a 10% margin) so a
        # few extreme episodes do not flatten the rest of the curve.
        if len(episode_rewards_list) > 1:
            min_r, max_r = np.percentile(episode_rewards_list, [5, 95])
        else:
            min_r = min(episode_rewards_list) - 50
            max_r = max(episode_rewards_list) + 50
        y_bottom = min_r - abs(min_r * 0.1)
        y_top = max_r + abs(max_r * 0.1)
        # Leave autoscaling in place when the range is degenerate (equal or NaN limits).
        if y_top > y_bottom:
            ax_reward.set_ylim(bottom=y_bottom, top=y_top)

        # --- Loss panel ---
        valid_losses = [l for l in losses_list if l is not None and not math.isnan(l)]
        if valid_losses:
            num_losses = len(valid_losses)
            ax_loss.plot(valid_losses, label='Batch Loss', alpha=0.3, color='lightblue')
            # Moving-average window: 5% of the data, kept between 100 and 1000 batches.
            avg_window = min(max(num_losses // 20, 100), 1000)
            if num_losses >= avg_window:
                loss_avg = np.convolve(valid_losses, np.ones(avg_window) / avg_window, mode='valid')
                # 'valid' mode drops the first avg_window - 1 points; align x accordingly.
                loss_avg_x = np.arange(avg_window - 1, num_losses)
                ax_loss.plot(loss_avg_x, loss_avg, label=f'Avg Loss ({avg_window} batches)', linewidth=2, color='red')
            ax_loss.set_xlabel("Training Batch Step")
            ax_loss.set_ylabel("Loss (MSE)")
            ax_loss.set_title("Training Loss")
            ax_loss.set_yscale('log')
            ax_loss.legend()
            ax_loss.grid(True)
        else:
            ax_loss.text(0.5, 0.5, 'No loss data yet', ha='center', va='center')

        fig.suptitle(f"Training Progress ({len(episode_rewards_list)} Episodes)")
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])

        save_path = os.path.join(save_dir, filename)
        try:
            fig.savefig(save_path)
            print(f"Plot saved to {save_path}")
        except Exception as e:
            print(f"Error saving plot: {e}")
    finally:
        # Always release the figure, even if plotting raised part-way through.
        plt.close(fig)

# --- Main Training Function ---
def train():
    """Train a DQNAgent on a headless CarEnv for NUM_EPISODES episodes.

    Side effects: prints per-episode progress, saves the best / periodic /
    final model weights under MODEL_SAVE_DIR and progress plots via
    plot_rewards().

    Returns:
        (all_episode_rewards, all_losses, all_average_rewards): per-episode
        rewards, per-update losses, and the running 100-episode average reward.
    """
    # Training never renders, so the environment is created without a window.
    env = CarEnv(render_mode='none')
    agent = DQNAgent()

    all_episode_rewards = []
    all_average_rewards = []
    all_losses = []
    best_avg_reward = -float('inf')

    print(f"Starting training for {NUM_EPISODES} episodes...")
    training_start_time = time.time()

    for e in range(NUM_EPISODES):
        state = env.reset()
        episode_reward = 0
        episode_losses = []
        steps_this_episode = 0  # counted explicitly so it is defined even for zero-step episodes
        start_time_episode = time.time()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            steps_this_episode += 1

            # One gradient update per environment step; replay() also decays epsilon.
            loss = agent.replay()
            if loss is not None:
                all_losses.append(loss)
                episode_losses.append(loss)

            if done:
                break

        # --- End of episode bookkeeping ---
        all_episode_rewards.append(episode_reward)
        avg_reward = np.mean(all_episode_rewards[-100:])
        all_average_rewards.append(avg_reward)

        # Sync the target network every target_update_freq episodes (including episode 0).
        if e % agent.target_update_freq == 0:
            agent.update_target_network()

        episode_duration = time.time() - start_time_episode
        avg_ep_loss = np.mean(episode_losses) if episode_losses else 0
        print(f"Ep {e+1}/{NUM_EPISODES} | Reward: {episode_reward:.2f} | Avg Reward(100): {avg_reward:.2f} | Epsilon: {agent.epsilon:.5f} | Steps: {steps_this_episode} | Avg Loss: {avg_ep_loss:.4f} | Duration: {episode_duration:.1f}s")

        # Keep a copy of the weights whenever the running average improves.
        if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
            print(f"  New best average reward: {avg_reward:.2f}. Saving best model...")
            agent.save(os.path.join(MODEL_SAVE_DIR, "dqn_car_best.pth"))
        if (e + 1) % MODEL_SAVE_FREQ == 0:
            agent.save(os.path.join(MODEL_SAVE_DIR, f"dqn_car_episode_{e+1}.pth"))

        if (e + 1) % PLOT_UPDATE_FREQ == 0:
            plot_rewards(all_episode_rewards, all_losses, all_average_rewards, filename=f"training_progress_ep{e+1}.png")

    total_training_time = time.time() - training_start_time
    print(f"Training finished in {total_training_time / 60:.2f} minutes.")
    # env.close() is not needed here: no window is created in this mode.

    agent.save(os.path.join(MODEL_SAVE_DIR, "dqn_car_final.pth"))
    plot_rewards(all_episode_rewards, all_losses, all_average_rewards, filename="training_progress_final.png")
    return all_episode_rewards, all_losses, all_average_rewards

# --- Start Training ---
# Runs train() and keeps its three return values at notebook level as
# rewards_history, losses_history and avg_rewards_history.
start_time_total = time.time()
try:
    print(f"Models will be saved in: {MODEL_SAVE_DIR}")
    print(f"Plots will be saved in: {PLOT_SAVE_DIR}")
    rewards_history, losses_history, avg_rewards_history = train()
# A NameError is reported with a hint that an earlier cell was probably not run.
except NameError as ne: print(f"NameError: {ne}. Make sure all previous cells are executed.")
# Any other failure is printed with its full traceback instead of propagating;
# in that case the *_history names above are not assigned by this run.
except Exception as ex: print(f"An error occurred during training: {ex}"); import traceback; traceback.print_exc()
finally:
    # Wall-clock time is reported whether training succeeded or failed.
    end_time_total = time.time()
    print(f"Total script time: {(end_time_total - start_time_total)/60:.2f} minutes")

In [None]:
# Jupyter Cell 6: Evaluation and Visualization (Unchanged)
# Purpose: Load a saved model and watch the agent drive WITH visualization.

def evaluate_agent(model_path, num_episodes=5):
    """Load a saved model and run it in a rendered environment with exploration off.

    Args:
        model_path: Path of the checkpoint passed to ``agent.load``.
        num_episodes: Number of evaluation episodes to attempt.

    Returns:
        None. Per-episode results and a final summary are printed. The function
        returns early, without a summary, if pygame or the environment cannot
        be set up or if ``env.render()`` reports that the window was closed.
    """
    global PYGAME_INITIALIZED, FONT
    if not PYGAME_INITIALIZED:
        try:
            pygame.init(); pygame.font.init(); FONT = pygame.font.SysFont("Arial", 20)
            print("Pygame re-initialized for evaluation."); PYGAME_INITIALIZED = True
        except pygame.error as init_err:
            print(f"Pygame init failed for eval: {init_err}")
            return

    try:
        # IMPORTANT: Create env in 'human' mode for evaluation!
        env = CarEnv(render_mode='human')
    except Exception as env_err:
        print(f"Error creating CarEnv for evaluation: {env_err}")
        return

    all_episode_rewards = []
    all_episode_steps = []

    # Everything after the env exists runs under try/finally so the window is
    # closed exactly once on every exit path, including unexpected exceptions.
    try:
        # Agent parameters (STATE_SIZE, ACTION_SIZE, HIDDEN_SIZE) must match the
        # settings that were used to train the loaded model.
        agent = DQNAgent(epsilon_start=0.0, epsilon_end=0.0)  # No exploration
        agent.load(model_path)
        if hasattr(agent, 'policy_net'):
            agent.policy_net.eval()  # Set model to evaluation mode

        print(f"\n--- Evaluating model: {model_path} ---")
        print(f"Running {num_episodes} episodes with rendering...")

        # NOTE: the loop index and the exception variables below deliberately
        # use distinct names. Python unbinds an `except ... as name` target when
        # the clause ends, which would otherwise destroy the episode counter.
        for episode_idx in range(num_episodes):
            try:
                state = env.reset()
            except Exception as reset_err:
                print(f"Error resetting env in evaluation: {reset_err}")
                break  # Stop if env fails

            episode_reward = 0
            done = False
            steps = 0
            print(f"\n--- Starting Evaluation Episode {episode_idx+1} ---")

            while not done:
                if not env.render():  # Render and check for quit event
                    print("Pygame window closed by user during evaluation.")
                    return  # Stop evaluation; the finally clause closes the env

                action = agent.act(state)
                try:
                    next_state, reward, done, info = env.step(action)
                except Exception as step_err:
                    # End the episode on error: keep the state, apply a penalty.
                    print(f"Error during env.step in evaluation: {step_err}")
                    done = True
                    info = {'collision': 'error'}
                    next_state = state
                    reward = -500

                state = next_state
                episode_reward += reward
                steps += 1
                # time.sleep(0.01) # Optional slowdown

                if done:
                    print(f"Episode {episode_idx+1} finished: Steps={steps}, Reward={episode_reward:.2f}, Collision={info.get('collision')}")
                    all_episode_rewards.append(episode_reward)
                    all_episode_steps.append(steps)
                    time.sleep(1)  # Pause so the final frame stays visible
    finally:
        env.close()  # Close window after all episodes (or on early exit/error)

    print("\n--- Evaluation Summary ---")
    if all_episode_rewards:
        print(f"Episodes run: {len(all_episode_rewards)}")
        print(f"Avg Reward: {np.mean(all_episode_rewards):.2f} | Min: {np.min(all_episode_rewards):.2f} | Max: {np.max(all_episode_rewards):.2f}")
        print(f"Avg Steps: {np.mean(all_episode_steps):.1f}")
    else:
        print("No episodes completed.")

# --- Run Evaluation ---
# Checkpoint to watch, taken from MODEL_SAVE_DIR. Defaults to the best-average
# model; swap the filename for "dqn_car_final.pth" or a specific
# "dqn_car_episode_<N>.pth" to evaluate a different save.
model_to_evaluate = os.path.join(MODEL_SAVE_DIR, "dqn_car_best.pth")

# Bail out with a hint when the checkpoint is missing; otherwise run 5 episodes.
if not os.path.exists(model_to_evaluate):
    print(f"Model file not found: {model_to_evaluate}")
    print(f"Ensure training ran and saved models to '{MODEL_SAVE_DIR}'.")
else:
    evaluate_agent(model_to_evaluate, num_episodes=5)