# Phase 0: Setup & Sanity Check

This notebook implements the minimal end-to-end loop for the Language-As-Memory Continual RL project.

**Goals:**
1. Set up the environment (install dependencies).
2. Load Config & Components (Env, Planner, Agent).
3. Run a single episode loop: State -> LLM -> Subgoal -> PPO Agent.

**Note:** Ensure you are running on a GPU runtime (T4) if you want to test the real Phi-2 model. Otherwise, the system will default to `MockPlanner`.

In [None]:
# @title 1. Setup & Installation
# Uncomment and run if you are running this in Colab and haven't cloned the repo yet.

# !git clone https://github.com/YOUR_USERNAME/language-as-memory-continual-rl.git
# %cd language-as-memory-continual-rl

!pip install -r requirements.txt
!pip install gymnasium minigrid stable-baselines3 transformers accelerate bitsandbytes peft trl tensorboard pyyaml

In [None]:
# @title 2. Imports & Configuration
import sys
import os
import yaml
import gymnasium as gym
import torch

# Ensure src is in path
sys.path.append(os.path.abspath("."))

from src.utils.logger import ExperimentLogger
from src.utils.subgoal_parser import parse_subgoal
from src.llm.planner import get_planner
from src.envs.wrappers import SubgoalWrapper
from src.ppo.sb3_agent import create_agent
from src.utils.seeding import set_global_seeds

# Load the experiment configuration from the repository's default YAML file.
# The encoding is given explicitly so the file decodes the same way on every
# platform; without it, open() falls back to the locale's preferred encoding.
with open("src/configs/default_config.yaml", "r", encoding="utf-8") as config_file:
    # safe_load only builds plain Python objects (dicts, lists, scalars).
    config = yaml.safe_load(config_file)

# Seed using the value stored under experiment.seed in the config.
# NOTE(review): which RNGs set_global_seeds covers is defined in src.utils.seeding.
set_global_seeds(config['experiment']['seed'])

# Echo the key settings so the reader can confirm which config was picked up.
print(f"Config Loaded. Device: {config['experiment']['device']}")
print(f"Environment: {config['env']['id']}")

In [None]:
# @title 3. Initialize Components

# --- Environment -----------------------------------------------------------
# Build the Gymnasium environment named in the config, then wrap it so that a
# subgoal can be attached to it (see env.set_subgoal in the later cells).
base_env = gym.make(config['env']['id'], render_mode="rgb_array")
env = SubgoalWrapper(base_env)

# --- Planner ---------------------------------------------------------------
# Per the note at the top of this notebook, get_planner is expected to return
# the real Phi-2 planner on a GPU runtime and fall back to MockPlanner
# otherwise; the selection logic itself lives in src.llm.planner.
planner = get_planner(config)

# --- PPO agent -------------------------------------------------------------
# The agent is created against the wrapped environment.
agent = create_agent(env, config)

print("Components Initialized.")

In [None]:
# @title 3.5 Train Agent (Bootstrap)
# Short bootstrap run: the agent is trained on one fixed subgoal before the
# planner-driven episode in the next cell.

# Fixed training subgoal: the ("pick", "yellow", "key") tuple paired with ID 2.
bootstrap_subgoal = ("pick", "yellow", "key")
bootstrap_subgoal_id = 2
env.set_subgoal(bootstrap_subgoal, bootstrap_subgoal_id)

# The training budget comes from rl.total_timesteps in the config.
print(f"Training agent for {config['rl']['total_timesteps']} steps on 'Pick up key'...")
total_timesteps = config['rl']['total_timesteps']
agent.learn(total_timesteps=total_timesteps)
print("Training Complete.")

In [None]:
# @title 4. Execution Loop (Single Episode)

# Start a fresh episode (seeded from the config) and describe it in text.
obs, info = env.reset(seed=config['experiment']['seed'])
state_text = env.get_text_description()
print(f"[State]: {state_text}")

# Ask the planner for a subgoal based on the textual state.
print("Querying Planner...")
subgoal_text = planner.generate_subgoal(state_text)
print(f"[Planner Output]: {subgoal_text}")

# Turn the planner's text into a (tuple, id) pair and hand it to the environment.
subgoal_tuple, subgoal_id = parse_subgoal(subgoal_text)
print(f"[Parsed]: {subgoal_tuple} -> ID: {subgoal_id}")
env.set_subgoal(subgoal_tuple, subgoal_id)

# Step budget for this single episode, and the running sum of rewards.
max_steps = 100
total_reward = 0

print(f"Executing agent for up to {max_steps} steps...")

# Tracks whether the environment ended the episode before the budget ran out.
episode_finished = False

for step in range(max_steps):
    # Sample an action from the policy (stochastic, not deterministic).
    action, _ = agent.predict(obs, deterministic=False)
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    # Report completion whenever the step info carries the flag; this does
    # not stop the episode on its own.
    if info.get('subgoal_completed', False):
        print(f"Subgoal completed at step {step+1}!")

    episode_finished = terminated or truncated
    if episode_finished:
        print(f"Episode finished at step {step+1}. Total Reward: {total_reward}")
        break

# Reached only when the loop used its whole budget without the env ending.
if not episode_finished:
    print(f"Max steps reached. Total Reward: {total_reward}")