<a href="https://colab.research.google.com/github/sujithh1110/reinforcement-learning/blob/main/lab11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Behavioral Cloning for CartPole (PyTorch) — Run in Google Colab
# If you run this in a local environment that already has gymnasium and PyTorch installed, you can skip the install step.

# --- Install required packages (safe to run in Colab) ---
!pip install --quiet gymnasium==0.28.1 gymnasium[accept-rom-data] torch torchvision

# --- Imports ---
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import random
from collections import deque
import time

# --- Reproducibility ---
SEED = 42
# Seed each random number generator this script relies on (Python's, NumPy's
# and PyTorch's) with the same value so repeated runs draw the same numbers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Hyperparameters ---
# Environment and rollout limits.
ENV_NAME = "CartPole-v1"
NUM_EXPERT_EPISODES = 200      # expert episodes recorded as demonstrations
MAX_STEPS_PER_EPISODE = 500    # hard cap on steps in any single rollout
EVAL_EPISODES = 50             # episodes per policy when comparing returns

# Supervised-learning settings for the cloned policy.
BATCH_SIZE = 64
LR = 0.001
NUM_EPOCHS = 25
HIDDEN_SIZE = 64               # units in each hidden layer of the MLP
VALIDATION_SPLIT = 0.1         # fraction of demonstrations held out

# --- Expert policy ---
# Simple heuristic expert: push in direction of pole angle
def expert_policy(obs):
    """Return the heuristic expert's action for one observation.

    ``obs`` is indexed as ``[cart_pos, cart_vel, pole_angle, pole_vel]``; only
    the pole angle at index 2 is consulted. The result is ``1`` when that
    angle is strictly positive and ``0`` otherwise (including exactly zero),
    i.e. the expert pushes in the direction of the pole angle.
    """
    leaning_positive = obs[2] > 0
    if leaning_positive:
        return 1
    return 0

# --- Data collection from expert ---
# Build the environment and read the sizes the network will be built from:
# the length of the observation vector and the number of discrete actions.
env = gym.make(ENV_NAME)
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Parallel lists: observations[i] is the state in which the expert chose
# actions[i].
observations = []
actions = []

print("Collecting expert demonstrations...")
start_time = time.time()
try:
    for ep in range(NUM_EXPERT_EPISODES):
        # A different seed per episode gives varied, yet repeatable, starts.
        o, _ = env.reset(seed=SEED + ep)
        done = False
        steps = 0
        while not done and steps < MAX_STEPS_PER_EPISODE:
            a = expert_policy(o)
            # Record the state the expert acted in (before stepping); copy so
            # the stored sample cannot alias an array reused later.
            observations.append(o.copy())
            actions.append(a)
            o, reward, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            steps += 1
finally:
    # Release the environment even if a rollout raises; the rest of this
    # cell does not use `env` again (evaluation creates its own instance).
    env.close()
end_time = time.time()
print(f"Collected {len(observations)} state-action pairs in {NUM_EXPERT_EPISODES} episodes ({end_time-start_time:.1f}s).")

# Freeze the collected lists into arrays: float32 inputs for the network and
# int64 class labels for the cross-entropy loss.
observations = np.array(observations, dtype=np.float32)
actions = np.array(actions, dtype=np.int64)

# One shared permutation shuffles both arrays while keeping every observation
# paired with its action.
perm = np.random.permutation(len(observations))
observations, actions = observations[perm], actions[perm]

# Hold out the first `val_size` shuffled samples for validation; everything
# after them is used for training. With no held-out samples, train on all data.
val_size = int(len(observations) * VALIDATION_SPLIT)
if val_size != 0:
    val_obs, val_act = observations[:val_size], actions[:val_size]
    train_obs, train_act = observations[val_size:], actions[val_size:]
else:
    val_obs = val_act = None
    train_obs, train_act = observations, actions

# Training batches are reshuffled every epoch; validation order stays fixed.
train_ds = TensorDataset(torch.from_numpy(train_obs), torch.from_numpy(train_act))
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

val_loader = None
if val_obs is not None:
    val_ds = TensorDataset(torch.from_numpy(val_obs), torch.from_numpy(val_act))
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

# --- Model (simple MLP classifier) ---
class BCModel(nn.Module):
    """MLP classifier that maps an observation vector to one logit per action.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear``: two hidden
    layers of ``hidden_size`` units followed by an output layer with
    ``n_actions`` units. ``forward`` returns the raw outputs of that last
    layer; no softmax is applied inside the model.
    """

    def __init__(self, obs_dim, hidden_size, n_actions):
        super().__init__()
        # Hidden stack: each consecutive pair of widths becomes Linear + ReLU.
        widths = (obs_dim, hidden_size, hidden_size)
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output head has no activation.
        layers.append(nn.Linear(hidden_size, n_actions))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.net(x)

# Prefer a GPU when PyTorch can see one; otherwise train on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# The classifier is sized from the environment: obs_dim inputs, action_dim
# output logits. CrossEntropyLoss consumes those logits together with the
# integer action labels.
model = BCModel(obs_dim, HIDDEN_SIZE, action_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# --- Training loop ---
# Supervised learning on (observation, expert action) pairs. Each epoch makes
# one full pass over train_loader, then (if a validation split exists) one
# gradient-free pass over val_loader, and prints a one-line summary.
print("Training behavioral cloning model...")
for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    # Running sums over the epoch; divided by `total` afterwards.
    total_loss = 0.0
    total = 0
    correct = 0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        logits = model(xb)
        loss = criterion(logits, yb)
        # Clear gradients left from the previous batch before backpropagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so the epoch figure is a
        # per-sample average even when the final batch is smaller
        # (CrossEntropyLoss averages over the batch by default).
        total_loss += float(loss.item()) * xb.size(0)
        total += xb.size(0)
        # Accuracy uses the logits computed before this batch's optimizer step.
        preds = torch.argmax(logits, dim=1)
        correct += (preds == yb).sum().item()
    train_loss = total_loss / total
    train_acc = correct / total
    # Validation
    if val_loader is not None:
        model.eval()
        # Same running-sum bookkeeping as above, for the held-out samples.
        vtotal = 0
        vcorrect = 0
        vloss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device); yb = yb.to(device)
                logits = model(xb)
                loss = criterion(logits, yb)
                vloss += float(loss.item()) * xb.size(0)
                vtotal += xb.size(0)
                preds = torch.argmax(logits, dim=1)
                vcorrect += (preds == yb).sum().item()
        val_loss = vloss / vtotal
        val_acc = vcorrect / vtotal
        print(f"Epoch {epoch:02d} | Train loss {train_loss:.4f} acc {train_acc:.4f} | Val loss {val_loss:.4f} acc {val_acc:.4f}")
    else:
        # No held-out data (val_loader is None): report training metrics only.
        print(f"Epoch {epoch:02d} | Train loss {train_loss:.4f} acc {train_acc:.4f}")

# --- Evaluation of cloned policy ---
def run_policy(policy_fn, n_episodes=20, render=False):
    """Roll out ``policy_fn`` in a fresh ``ENV_NAME`` environment.

    Args:
        policy_fn: Callable taking the current observation and returning an
            action; the result is passed through ``int()`` before stepping.
        n_episodes: Number of episodes to run. Episode ``ep`` is reset with
            seed ``SEED + 1000 + ep``, so every policy evaluated with the same
            ``n_episodes`` sees the same sequence of reset seeds.
        render: When true, the environment is created with
            ``render_mode="human"`` so the rollout is displayed.
            NOTE(review): this needs a usable display; on a headless host
            (e.g. Colab) it is expected to fail -- record video instead.

    Returns:
        ``(returns, lengths)``: two NumPy arrays with one entry per episode,
        holding the summed reward and the number of steps taken. Episodes stop
        at termination, truncation, or after ``MAX_STEPS_PER_EPISODE`` steps,
        whichever comes first.
    """
    # A render mode must be chosen when the environment is constructed;
    # calling render() on an env made without one only emits a warning and
    # draws nothing. The default (render=False) call is unchanged.
    make_kwargs = {"render_mode": "human"} if render else {}
    env_eval = gym.make(ENV_NAME, **make_kwargs)
    returns = []
    lengths = []
    try:
        for ep in range(n_episodes):
            o, _ = env_eval.reset(seed=SEED + 1000 + ep)
            done = False
            total_reward = 0.0
            steps = 0
            while not done and steps < MAX_STEPS_PER_EPISODE:
                a = policy_fn(o)
                # In "human" mode the environment draws itself during step(),
                # so no explicit render() call is needed here.
                o, r, terminated, truncated, _ = env_eval.step(int(a))
                done = terminated or truncated
                total_reward += r
                steps += 1
            returns.append(total_reward)
            lengths.append(steps)
    finally:
        # Release the environment even if the policy or a step raises.
        env_eval.close()
    return np.array(returns), np.array(lengths)

# Expert evaluation
# Baseline: roll out the heuristic expert itself for EVAL_EPISODES episodes.
# run_policy resets with SEED + 1000 + ep, which does not overlap the
# SEED + ep seeds used while collecting demonstrations.
print("\nEvaluating expert policy...")
expert_returns, expert_lengths = run_policy(expert_policy, n_episodes=EVAL_EPISODES)
# Mean / standard deviation / maximum of the per-episode returns.
print(f"Expert: mean return = {expert_returns.mean():.2f}, std = {expert_returns.std():.2f}, max = {expert_returns.max():.2f}")

# Cloned policy using the trained network
def cloned_policy(obs):
    """Return the trained network's greedy action for one observation.

    ``obs`` must be a NumPy array (``astype`` is called on it). It is cast to
    float32, given a leading batch dimension of one, and moved to ``device``.
    The module-level ``model`` is put in eval mode and queried without
    gradient tracking; the index of the largest logit is returned as a plain
    Python ``int``.
    """
    model.eval()
    batch = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(device)
    with torch.no_grad():
        best = torch.argmax(model(batch), dim=1)
    return int(best.item())

# Roll out the learned policy with the same episode count as the expert.
# run_policy derives its reset seeds from the episode index alone, so both
# policies are evaluated with the same sequence of reset seeds.
print("Evaluating cloned policy...")
cloned_returns, cloned_lengths = run_policy(cloned_policy, n_episodes=EVAL_EPISODES)
print(f"Cloned: mean return = {cloned_returns.mean():.2f}, std = {cloned_returns.std():.2f}, max = {cloned_returns.max():.2f}")

# Basic comparison
# Side-by-side mean returns of the expert and its clone.
print("\nSummary:")
print(f"  Expert avg return over {EVAL_EPISODES} eps: {expert_returns.mean():.2f}")
print(f"  Cloned avg return over {EVAL_EPISODES} eps: {cloned_returns.mean():.2f}")

# --- Save model (optional) ---
# Only the state_dict (the parameter tensors) is written, to a path relative
# to the current working directory; reloading requires constructing a BCModel
# with the same obs_dim / HIDDEN_SIZE / action_dim first.
torch.save(model.state_dict(), "bc_cartpole_model.pth")
print("\nModel saved to bc_cartpole_model.pth")

# Show a few sample predictions (for debugging/visual check).
# `observations` is the shuffled demonstration array, so these are arbitrary
# recorded states. Slicing prints at most 8 rows and cannot raise IndexError
# when fewer than 8 state-action pairs were collected.
print("\nSample states -> expert_action, cloned_action")
for s in observations[:8]:
    e_a = expert_policy(s)
    c_a = cloned_policy(s)
    print(f"{np.round(s,3)} -> expert: {e_a}, cloned: {c_a}")

print("\nDone. If you want to visualize the cloned policy run:\n"
      "  - set render=True in run_policy(cloned_policy, n_episodes=1, render=True)\n"
      "  - or use gym's video recorder in Colab (requires extra setup).")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dopamine-rl 4.1.2 requires gymnasium>=1.0.0, but you have gymnasium 0.28.1 which is incompatible.[0m[31m
[0mCollecting expert demonstrations...
Collected 8569 state-action pairs in 200 episodes (0.1s).
Using device: cpu
Training behavioral cloning model...
Epoch 01 | Train loss 0.5408 acc 0.8097 | Val loss 0.2839 acc 0.9357
Epoch 02 | Train loss 0.1802 acc 0.9520 | Val loss 0.1301 acc 0.9556
Epoch 03 | Train loss 0.1113 acc 0.9624 | Val loss 0.1103 acc 0.9556
Epoch 04 | Train loss 0.0867 acc 0.9704 | Val loss 0.0859 acc 0.9685
Epoch 05 | Train loss 0.0719 acc 0.9746 | Val loss 0.0706 acc 0.9708
Epoch 06 | Train loss 0.0627 acc 0.9776 | Val loss 0.0636 acc 0.9708
Epoch 07 | Train lo