# Training for Generic RL agents in OpenAI Gym environments
#### Stachenfeld lab 
##### update July 2024

## Section 1: Imports

In [3]:
############## generic imports ##################

# Data manipulation and analysis
import numpy as np  # Importing numpy for numerical operations and working with arrays.

# Data visualization
import matplotlib.pyplot as plt  # Importing matplotlib for plotting and data visualization.
import matplotlib.gridspec as gridspec  # Importing gridspec for creating grid layouts.
import matplotlib.patches as mpatches  # Importing patches module for drawing shapes.
from mpl_toolkits.mplot3d import Axes3D  # Importing 3D plotting tools.
from matplotlib import cm  # Importing cm (colormaps) module.

# Gymnasium (the maintained successor to OpenAI Gym) for the environment
import gymnasium as gym

# Statistics and mathematical operations
import scipy  # Importing scipy for scientific computing.
from scipy import stats, integrate  # Importing stats and integrate modules from scipy.
from scipy.stats import mode, pearsonr, ttest_rel  # Importing mode, Pearson's correlation, and t-test for related samples.

# Other useful libraries
from collections import defaultdict  # Importing defaultdict for creating dictionaries with default values.
from copy import copy  # Importing copy function for creating shallow copies.
import operator  # Importing operator module for standard operators as functions.


###  environments 

1. **Initialize Environment**
    - Create environment using the provided environment name
    - All environments are built with OpenAI Gym (via the `gymnasium` package)
    - The import section should have the following structure:  

    ```python
    from task_file import Task
    task_env = Task(task_params)
    ```

In [4]:
############### import environment #####################
# Build the "LunarLander-v2" environment through gym.make, requesting the
# "human" render mode.
env = gym.make("LunarLander-v2", render_mode="human")
# Reset with a fixed seed (42) and keep the two values reset() returns:
# the initial observation and the accompanying info object.
observation, info = env.reset(seed=42)

# sanity check the environment 
def env_preview(env, max_steps=100):
    """Run a short random-action rollout as a sanity check, then close the env.

    Args:
        env: Environment exposing ``reset()``, ``render()``, ``step(action)``,
            ``close()`` and an ``action_space`` with a ``sample()`` method.
            ``step`` must follow the Gymnasium convention and return
            ``(observation, reward, terminated, truncated, info)``.
        max_steps: Maximum number of random actions to take (default 100).

    The environment is always closed before returning, even if a step raises.
    """
    try:
        env.reset()
        for _ in range(max_steps):
            env.render()
            # Gymnasium's step() yields five values; the episode is over when
            # it either terminates or is truncated (e.g. by a time limit).
            _, _, terminated, truncated, _ = env.step(env.action_space.sample())
            if terminated or truncated:
                break
    finally:
        env.close()

def show_action_and_env_space(env):
    """Print the env's action space, observation space and observation bounds.

    Each line is printed as ``<label> <value>``, in this order: action space,
    observation space, observation-space ``high``, observation-space ``low``.
    """
    # (label to print, dotted attribute path resolved on ``env``)
    labelled_paths = (
        ("env.action_space", "action_space"),
        ("env.observation_space", "observation_space"),
        ("env.observation_space.high", "observation_space.high"),
        ("env.observation_space.low", "observation_space.low"),
    )
    for label, path in labelled_paths:
        # Resolve each attribute right before printing it, one at a time.
        print(label, operator.attrgetter(path)(env))

In [None]:
############### import model ############################

# NOTE: `agent_file` / `Agent` are placeholders for the real agent module.
from agent_file import Agent  #  replace with the actual agent 
# Build the agent on `env`, the environment created in the environment cell;
# no `task_env` variable is defined anywhere in this notebook's code cells.
agent = Agent(env, epsilon=0.1, step_size=0.1, discount_factor=0.99)



2. **Initialize Results Containers**
    - Initialize lists to hold rewards per episode
    - Initialize lists to hold steps per episode

3. **Training Loop (for each episode)**
    - Reset environment and initialize state
    - Initialize total_reward and steps counters
    - Set done flag to False

    4. **Episode Loop (while not done and steps < max_steps_per_episode)**
        - Select action using the agent's policy
        - Perform action in the environment
        - Observe next_state, reward, and done flag
        - Update agent with current state, action, reward, and next state
        - Update state to next_state
        - Accumulate reward to total_reward
        - Increment steps counter

    5. **End of Episode**
        - Append total_reward to rewards_per_episode list
        - Append steps to steps_per_episode list
        - Print episode summary (optional)

6. **Return Results**
    - Return rewards_per_episode, steps_per_episode, and final Q-values of the agent


## Section 2: Define Training Loop

In [None]:
def train_model(task_environment, model, algorithm, num_episodes):
    """Generic training loop (placeholder, not implemented yet).

    Args:
        task_environment: Environment the model is trained on.
        model: Model/agent being trained.
        algorithm: Learning algorithm used to choose actions and compute the loss.
        num_episodes: Number of episodes to train for.

    Raises:
        NotImplementedError: Always; the generic loop has not been written.
    """
    # Planned structure:
    # loop over episodes
    #     observe one state of environment
    #     pass it to model
    #     model makes choice via algorithm
    #     algorithm determines loss
    #     model update
    raise NotImplementedError("train_model is a placeholder and has not been implemented yet")

In [None]:
### Example training loop, currently specific to SARSA; TODO: work out how to generalize it

def train_tabular_agent(agent, env_name, num_episodes=1000, max_steps_per_episode=100):
    """Train a tabular agent on a Gymnasium environment and record per-episode results.

    Args:
        agent: Object providing ``initialize_episode(env)``,
            ``select_action(state)``, ``update(state, action, reward, next_state)``
            and a ``q_values`` attribute.
        env_name: Environment id passed to ``gym.make``.
        num_episodes: Number of episodes to run.
        max_steps_per_episode: Upper bound on steps taken within one episode.

    Returns:
        A tuple ``(rewards_per_episode, steps_per_episode, agent.q_values)``,
        where the first two are lists with one entry per episode.
    """
    # Initialize the environment
    env = gym.make(env_name)

    # Initialize lists to hold results
    rewards_per_episode = []
    steps_per_episode = []

    try:
        for episode in range(num_episodes):
            # presumably resets `env` and returns the initial state;
            # confirm against the agent implementation
            state = agent.initialize_episode(env)
            total_reward = 0
            steps = 0
            done = False

            while not done and steps < max_steps_per_episode:
                action = agent.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                agent.update(state, action, reward, next_state)
                state = next_state
                total_reward += reward
                steps += 1
                # An episode ends on termination OR truncation (e.g. the env's
                # own time limit); stepping past either requires a reset.
                done = terminated or truncated

            rewards_per_episode.append(total_reward)
            steps_per_episode.append(steps)
            print(f"Episode {episode + 1}: Total Reward = {total_reward}, Steps = {steps}")
    finally:
        # Release the environment's resources even if training fails midway.
        env.close()

    return rewards_per_episode, steps_per_episode, agent.q_values


## Section 3: Visualization

In [None]:
# TODO: add a plot of training progress here (e.g. loss over time)