# Imitation Learning

Imitation learning is a paradigm where an agent learns to mimic an **expert's behavior** instead of learning purely from trial-and-error rewards (as in reinforcement learning).

The simplest common approach is called "**behavior cloning**" (BC). It treats the expert's demonstrations as training data: a policy is trained via supervised learning on the expert's input (state) and output (action) pairs. However, BC often fails because the policy is only trained on states the expert visited, so if it ever drifts off that trajectory, it encounters states in which it doesn't know what to do.

**[DAgger (Dataset Aggregation)](https://arxiv.org/pdf/1011.0686)** is an interactive imitation learning algorithm that deals with the issue above. The key idea is to keep utilizing the expert's knowledge during training and gradually expand the training data to include those "off-course" states. Instead of training the policy by cloning the expert's behavior, DAgger iteratively refines the policy:

1. After initial training on expert data (behavior cloning), execute the policy in the environment. (Policy rollout) 
2. Store all the states the policy visits (it may include "bad" states).
3. Query the expert for the correct action in each of those states. (Expert labeling)
4. Aggregate these new state-action pairs into the training dataset.
5. Update the policy on this expanded dataset (so it knows what to do in new states).
6. Repeat this process.

Although DAgger requires querying the expert online, it improves behavior cloning by training on a dataset that better resembles the observations the trained policy is likely to encounter. [(Reference)](https://imitation.readthedocs.io/en/latest/algorithms/dagger.html#:~:text=DAgger%20,requires%20querying%20the%20expert%20online)

In [1]:
# Windows-only workaround: tolerate duplicate copies of the Intel OpenMP runtime.
# Without it, the program can fail when it tries to initialize multiple versions
# of libiomp5md.dll (the Intel OpenMP runtime) in one process, which conflict.
# The variable is set here, before the numeric libraries are imported further down.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Add the project checkout to the module search path.
# NOTE(review): machine-specific absolute path — presumably this is what lets the
# project modules imported later (env, agent, learning_util) resolve; adjust locally.
import sys
sys.path.append("C:\\Users\\sc3377\\Documents\\balloon-outreach")

In [2]:
import xarray as xr
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import time

from env.balloon_env import BalloonERAEnvironment
from agent.mppi_agent import MPPIAgentWithCostFunction, MPPIAgent
from learning_util import plot_agent_trajectory, run_expert_episode, evaluate_policy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## Setup Environment and Define our policy

In [3]:
import xarray as xr

# --- Wind dataset and simulation start time --------------------------------
# NOTE(review): machine-specific absolute path — adjust to your local checkout.
ds = xr.open_dataset(
    "C:\\Users\\sc3377\\Documents\\balloon-outreach\\era5_data.nc",
    engine="netcdf4",
)
start_time = dt.datetime(2024, 7, 1, 0, 0)

# --- Launch point (Ithaca, per the original note) ---------------------------
initial_lat = 42.6
initial_lon = -76.5
initial_alt = 10.0

# --- Target the balloon should reach ----------------------------------------
target_lat = 47
target_lon = -78
target_alt = 12.0

# --- Time discretisation -----------------------------------------------------
time_step = 120  # seconds per environment step
# 1440 minutes (one day) divided by the step length expressed in minutes.
max_steps = int(1440/(time_step/60))

# --- Planner settings (consumed when the expert agent is constructed) --------
noise_std = 0.1
horizon = 10
num_samples = 10
num_iterations = 1

# --- Simulation environment (visualisation switched off) ---------------------
env = BalloonERAEnvironment(
    ds=ds,
    start_time=start_time,
    initial_lat=initial_lat,
    initial_lon=initial_lon,
    initial_alt=initial_alt,
    target_lat=target_lat,
    target_lon=target_lon,
    target_alt=target_alt,
    dt=time_step,
    viz=False,
)

class PolicyNet(nn.Module):
    """Small MLP policy that maps a state vector to tanh-bounded action values.

    Architecture: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm -> ReLU
    -> Linear -> tanh. The first hidden layer is twice as wide as the second.

    Args:
        input_dim: Size of the state vector fed to the network.
        hidden_dim: Width of the second hidden layer (the first uses 2x this).
        output_dim: Number of action values produced per state.
    """

    def __init__(self, input_dim=21, hidden_dim=64, output_dim=1):
        super().__init__()
        wide_dim = hidden_dim*2

        self.fc1 = nn.Linear(input_dim, wide_dim)
        self.ln1 = nn.LayerNorm(wide_dim)
        self.fc2 = nn.Linear(wide_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

        # Hidden layers feed ReLUs: Kaiming-uniform weights, zero biases.
        # (Initialised in fc1 -> fc2 order so random-number use is unchanged.)
        for relu_layer in (self.fc1, self.fc2):
            nn.init.kaiming_uniform_(relu_layer.weight, nonlinearity='relu')
            nn.init.zeros_(relu_layer.bias)

        # Output layer feeds tanh: Xavier-uniform weights with the tanh gain.
        tanh_gain = nn.init.calculate_gain('tanh')
        nn.init.xavier_uniform_(self.fc3.weight, gain=tanh_gain)
        nn.init.zeros_(self.fc3.bias)

    def forward(self, x):
        """Return actions in [-1, 1] for a batch ``x`` of shape (batch_size, input_dim)."""
        hidden = F.relu(self.ln1(self.fc1(x)))
        hidden = F.relu(self.ln2(self.fc2(hidden)))
        return F.tanh(self.fc3(hidden))

# Create the student policy with its default layer sizes, then the
# mean-squared-error loss and the Adam optimizer (learning rate 1e-3)
# that are used to fit it to the expert's actions.
policy = PolicyNet()
loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=policy.parameters(), lr=1e-3)

## Behavior Cloning to pretrain our policy

In [None]:
# Build the MPPI expert, roll it out for a few episodes, and keep every
# state/action array it returns as the initial behavior-cloning dataset.

expert = MPPIAgentWithCostFunction(
    target_lat=target_lat,
    target_lon=target_lon,
    target_alt=target_alt,
    num_samples=num_samples,
    noise_std=noise_std,
    num_iterations=num_iterations,
    horizon=horizon,
    objective='target',
)

# Wind-field noise is enabled with a fixed seed; to run without noise instead,
# swap in the commented-out call below.
# env.wind_field.disable_noise()
env.wind_field.enable_noise(noise_seed=100)

num_iter = 3
expert_total_reward_list = []
expert_states_list = []
expert_actions_list = []
for i in range(num_iter):
    expert_total_reward, expert_states_np, expert_actions_np = run_expert_episode(
        env,
        expert,
        max_steps=max_steps,
        policy_name=f'expert_noisy_wf(seed=100)_{i+1}',
    )
    expert_total_reward_list.append(expert_total_reward)
    expert_states_list.append(expert_states_np)
    expert_actions_list.append(expert_actions_np)

# Stack the per-episode arrays along the first axis into one dataset.
initial_expert_states_np = np.concatenate(expert_states_list, axis=0)
initial_expert_actions_np = np.concatenate(expert_actions_list, axis=0)

# Report the per-episode rewards and their mean (reused later as a baseline).
print(expert_total_reward_list)
expert_avg_total_reward = sum(expert_total_reward_list)/len(expert_total_reward_list)
print(f"Expert Trajectory reward in average: {expert_avg_total_reward:.2f}")

Step 0: lat: 42.61, lon: -76.50, alt: 10.00
Step 1: lat: 42.63, lon: -76.50, alt: 10.01
Step 2: lat: 42.64, lon: -76.51, alt: 10.03
Step 3: lat: 42.66, lon: -76.51, alt: 10.05
Step 4: lat: 42.68, lon: -76.51, alt: 10.08
Step 5: lat: 42.69, lon: -76.51, alt: 10.12
Step 6: lat: 42.71, lon: -76.51, alt: 10.17
Step 7: lat: 42.73, lon: -76.52, alt: 10.23
Step 8: lat: 42.74, lon: -76.52, alt: 10.31
Step 9: lat: 42.76, lon: -76.52, alt: 10.39
Step 10: lat: 42.78, lon: -76.52, alt: 10.48
Step 11: lat: 42.80, lon: -76.52, alt: 10.58
Step 12: lat: 42.81, lon: -76.53, alt: 10.69
Step 13: lat: 42.83, lon: -76.53, alt: 10.80
Step 14: lat: 42.85, lon: -76.53, alt: 10.92
Step 15: lat: 42.87, lon: -76.53, alt: 11.03
Step 16: lat: 42.89, lon: -76.53, alt: 11.15
Step 17: lat: 42.91, lon: -76.53, alt: 11.27
Step 18: lat: 42.92, lon: -76.54, alt: 11.38
Step 19: lat: 42.94, lon: -76.54, alt: 11.49
Step 20: lat: 42.96, lon: -76.54, alt: 11.60
Step 21: lat: 42.98, lon: -76.54, alt: 11.70
Step 22: lat: 43.00,

### Initial Dataset & DataLoader Construction

In [None]:
# Turn the expert demonstrations into float32 tensors.
# Expected shapes per the original notes: states (N, 21), actions (N, 1) — TODO confirm.
states_tensor = torch.from_numpy(initial_expert_states_np).to(torch.float32)
actions_tensor = torch.from_numpy(initial_expert_actions_np).to(torch.float32)

# Pair each state with its expert action and serve them in shuffled
# mini-batches of 64 for supervised training.
dataset = TensorDataset(states_tensor, actions_tensor)
loader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

### Train our policy

In [None]:
# Training Function Using Loader
def train_one_epoch(loader, policy, optimizer, loss_fn):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        loader: Iterable of ``(inputs, targets)`` tensor batches that also
            exposes ``loader.dataset`` (its length is the averaging divisor).
        policy: ``torch.nn.Module`` being trained; switched to training mode.
        optimizer: Optimizer holding ``policy``'s parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor that is the mean loss of the batch.

    Returns:
        Sample-weighted average of the batch losses over the whole dataset.
    """
    policy.train()
    # policy.train() has just put the module in training mode, so the original
    # "evaluation mode" branch could never run; only the live message is kept.
    print("→ policy is in training mode")

    total_loss = 0.0
    for x_batch, y_batch in loader:
        optimizer.zero_grad()
        preds = policy(x_batch)
        # Align e.g. (B,) targets with (B, 1) predictions. Otherwise losses
        # such as MSELoss broadcast the pair to (B, B) and train on the
        # wrong quantity. This is a no-op when the shapes already match.
        if y_batch.shape != preds.shape and y_batch.numel() == preds.numel():
            y_batch = y_batch.reshape(preds.shape)
        loss = loss_fn(preds, y_batch)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch is not over-counted.
        total_loss += loss.item() * x_batch.size(0)
    avg_loss = total_loss / len(loader.dataset)
    return avg_loss

In [None]:
# Behavior cloning: fit the policy to the expert dataset for a fixed number
# of epochs, recording the average loss of each epoch.
num_epochs = 10
losses = []
for epoch in range(num_epochs):
    avg_loss = train_one_epoch(loader, policy, optimizer, loss_fn)
    losses.append(avg_loss)
    # Fix: interpolate num_epochs (it was printed as the literal text "num_epochs").
    print(f"Epoch {epoch+1}/{num_epochs} — Avg Loss: {avg_loss:.6f} \n")

# plot the loss curve from initial training (behavior cloning)
plt.figure(figsize=(6,4))
plt.plot(range(1, len(losses)+1), losses, marker='o')
plt.xlabel('Epoch')
plt.ylabel('Average Training Loss')
plt.title('Behavior Cloning Loss Curve')
plt.grid(True)
plt.show()

## DAgger Iterations

In [None]:
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt

# === DAgger Iterations using DataLoader ===
# Each iteration: roll out the current (student) policy, label every visited
# state with the expert's action, add those pairs to the dataset, retrain on
# the aggregated dataset, then evaluate.

# Parameters
dagger_iterations       = 5
episodes_per_iteration  = 1
train_epochs_per_iter   = 10
batch_size              = 64

# Start from initial BC dataset
states_np  = initial_expert_states_np.copy()
# Keep actions 2-D (one row per state) so they concatenate with the new
# labels below and match the policy's (batch, output_dim) predictions.
actions_np = initial_expert_actions_np.copy().reshape(len(states_np), -1)

for it in range(dagger_iterations):
    new_states = []
    new_actions = []

    # 1. Collect new data by rolling out current policy
    # Inference only: eval mode here; train_one_epoch switches back to train mode.
    policy.eval()
    for ep in range(episodes_per_iteration):
        start = time.time()
        state = env.reset()
        for step in range(max_steps):
            # Student policy action (no autograd graph is needed for a rollout)
            state_tensor = torch.from_numpy(state).float().unsqueeze(0)
            with torch.no_grad():
                action_pred = policy(state_tensor).item()

            # Record state & expert correction
            new_states.append(state)
            expert_act = expert.select_action(state, env, step)
            new_actions.append(expert_act)

            # Step environment with student action, so the visited states
            # come from the student's own trajectory
            state, _, done, info = env.step(action_pred)
            if done:
                print(f"\nEpisode terminated: {info}")
                break
        end = time.time()
        print(f"Episode {ep+1}/{episodes_per_iteration} in DAgger Iteration {it+1}/{dagger_iterations} is done. \n Time: {end-start:.2f} seconds")


    # 2. Append new data to the NumPy arrays
    new_states_np  = np.array(new_states, dtype=np.float32)
    # Works for scalar or length-1 expert actions: one row per recorded state.
    new_actions_np = np.array(new_actions, dtype=np.float32).reshape(len(new_states), -1)
    states_np  = np.concatenate([states_np,  new_states_np], axis=0)
    actions_np = np.concatenate([actions_np, new_actions_np], axis=0)

    # 3. Rebuild dataset & loader
    states_tensor  = torch.from_numpy(states_np).float()
    actions_tensor = torch.from_numpy(actions_np).float()
    dataset = TensorDataset(states_tensor, actions_tensor)
    loader  = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # 4. Train on the aggregated dataset
    print(f"\n--- DAgger Iteration {it+1}/{dagger_iterations} Training ---")
    iter_losses = []
    for epoch in range(train_epochs_per_iter):
        avg_loss = train_one_epoch(loader, policy, optimizer, loss_fn)
        iter_losses.append(avg_loss)
        print(f" Iter {it+1} Epoch {epoch+1}/{train_epochs_per_iter} — Avg Loss: {avg_loss:.6f}")

    # 5. Evaluate policy
    # Re-enable wind noise with the same seed that was used for the expert runs.
    env.wind_field.enable_noise(noise_seed=100)
    avg_reward = evaluate_policy(env,
                                 policy,
                                 max_steps=max_steps,
                                 policy_name=f'DAgger_{it+1}_trained_policy_noisy_wf(seed=100)',
                                 expert_avg_total_reward=expert_avg_total_reward
                                 )
    print(f" After DAgger Iter {it+1}, Avg Reward = {avg_reward:.2f}")