<a href="https://colab.research.google.com/github/sujithh1110/reinforcement-learning/blob/main/lab06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install gymnasium torch




In [12]:
import gymnasium as gym
import torch, random, numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import deque, namedtuple

# --- Hyperparameters ---
ENV_ID = "CartPole-v1"
SEED = 0
EPISODES = 200
GAMMA = 0.99
LR = 1e-3
BATCH_SIZE = 64
BUFFER_SIZE = 10000
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.01, 500
TARGET_UPDATE = 10

# --- Setup ---
env = gym.make(ENV_ID)
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Transition = namedtuple('Transition', ('s','a','r','ns','d'))

# --- Replay Buffer ---
class ReplayBuffer:
    def __init__(self, cap): self.buf=deque(maxlen=cap)
    def push(self,*args): self.buf.append(Transition(*args))
    def sample(self,n):
        batch=random.sample(self.buf,n)
        s,a,r,ns,d=map(np.array,zip(*batch))
        return (torch.tensor(s,dtype=torch.float32,device=device),
                torch.tensor(a,dtype=torch.long,device=device).unsqueeze(1),
                torch.tensor(r,dtype=torch.float32,device=device).unsqueeze(1),
                torch.tensor(ns,dtype=torch.float32,device=device),
                torch.tensor(d,dtype=torch.float32,device=device).unsqueeze(1))
    def __len__(self): return len(self.buf)

# --- Q-Network ---
class QNet(nn.Module):
    def __init__(self,n_obs,n_act):
        super().__init__()
        self.fc1=nn.Linear(n_obs,128)
        self.fc2=nn.Linear(128,128)
        self.fc3=nn.Linear(128,n_act)
    def forward(self,x):
        x=F.relu(self.fc1(x)); x=F.relu(self.fc2(x))
        return self.fc3(x)

n_obs=env.observation_space.shape[0]; n_act=env.action_space.n
policy_net, target_net = QNet(n_obs,n_act).to(device), QNet(n_obs,n_act).to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer=optim.Adam(policy_net.parameters(),lr=LR)
buffer=ReplayBuffer(BUFFER_SIZE)

# --- Epsilon schedule ---
def epsilon_by_step(step):
    return EPS_END+(EPS_START-EPS_END)*np.exp(-1.*step/EPS_DECAY)

# --- Training ---
steps=0
for ep in range(EPISODES):
    s,_=env.reset(seed=SEED); ep_r=0
    done=False
    while not done:
        steps+=1
        eps=epsilon_by_step(steps)
        if random.random()<eps: a=env.action_space.sample()
        else:
            with torch.no_grad():
                a=policy_net(torch.tensor(s,dtype=torch.float32,device=device).unsqueeze(0)).argmax().item()
        ns,r,terminated,truncated,_=env.step(a)
        done=terminated or truncated
        buffer.push(s,a,r,ns,done); s=ns; ep_r+=r

        # learn
        if len(buffer)>=BATCH_SIZE:
            bs,ba,br,bns,bd=buffer.sample(BATCH_SIZE)
            qv=policy_net(bs).gather(1,ba)
            nv=target_net(bns).max(1)[0].detach().unsqueeze(1)
            target=br+GAMMA*nv*(1-bd)
            loss=F.smooth_l1_loss(qv,target)
            optimizer.zero_grad(); loss.backward(); optimizer.step()

        if steps%TARGET_UPDATE==0: target_net.load_state_dict(policy_net.state_dict())
    print(f"Episode {ep}: return={ep_r}")
env.close()


Episode 0: return=18.0
Episode 1: return=32.0
Episode 2: return=32.0
Episode 3: return=11.0
Episode 4: return=14.0
Episode 5: return=35.0
Episode 6: return=15.0
Episode 7: return=49.0
Episode 8: return=14.0
Episode 9: return=40.0
Episode 10: return=15.0
Episode 11: return=24.0
Episode 12: return=21.0
Episode 13: return=16.0
Episode 14: return=13.0
Episode 15: return=17.0
Episode 16: return=18.0
Episode 17: return=20.0
Episode 18: return=15.0
Episode 19: return=13.0
Episode 20: return=15.0
Episode 21: return=19.0
Episode 22: return=14.0
Episode 23: return=20.0
Episode 24: return=20.0
Episode 25: return=20.0
Episode 26: return=11.0
Episode 27: return=13.0
Episode 28: return=12.0
Episode 29: return=12.0
Episode 30: return=12.0
Episode 31: return=12.0
Episode 32: return=12.0
Episode 33: return=11.0
Episode 34: return=17.0
Episode 35: return=12.0
Episode 36: return=15.0
Episode 37: return=11.0
Episode 38: return=12.0
Episode 39: return=13.0
Episode 40: return=13.0
Episode 41: return=11.0
Ep