In [3]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import os
from irrigation_environment import IrrigationEnv

def animate_training_progression(all_grids, log_interval=1000, save_path="training_progression.gif", label="Training"):
    """Stitch per-episode frame sequences into one animation and save it.

    Args:
        all_grids: Iterable of episodes, each an iterable of image frames
            accepted by ``Axes.imshow``.
        log_interval: Multiplier used to label episode ``i`` (zero-based) as
            ``(i + 1) * log_interval``.
        save_path: Output file path handed to ``Animation.save``.
        label: Prefix placed in front of the episode number in the title.

    Returns:
        None. If no frames are present a message is printed and nothing is
        drawn. Errors raised while saving are printed, not propagated.
    """
    # Flatten the episodes into parallel lists of frames and title captions.
    frames = []
    captions = []
    for position, grids in enumerate(all_grids, start=1):
        caption = f"{label} Episode: {position * log_interval}"
        batch = list(grids)
        frames.extend(batch)
        captions.extend([caption] * len(batch))

    if not frames:
        print("No frames found to animate.")
        return

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.axis('off')
    image = ax.imshow(frames[0])
    heading = ax.set_title(captions[0])

    def update(frame_idx):
        # Swap in the frame and its matching caption for this tick.
        image.set_array(frames[frame_idx])
        heading.set_text(captions[frame_idx])
        return [image, heading]

    frame_count = len(frames)
    print(f"Generating animation with {frame_count} frames...")
    ani = animation.FuncAnimation(
        fig,
        update,
        frames=frame_count,
        interval=50,
        blit=False,
    )

    try:
        print(f"Saving to {save_path} (this might take a minute)...")
        ani.save(save_path, writer='pillow', fps=15)
        print(f"Success! Saved to {save_path}")
    except Exception as e:
        # Best effort: report the failure and carry on.
        print(f"Error saving animation: {e}")
    finally:
        # Always release the figure, whether or not the save worked.
        plt.close(fig)

def train_agent(
    *,
    total_episodes=50000,
    max_steps=600,
    learning_rate=0.15,
    discount_rate=0.99,
    max_epsilon=1.0,
    min_epsilon=0.01,
    decay_rate=0.0005,
    log_interval=500,
    log_animation=10000,
    log_filename="training_log.csv",
    env=None,
):
    """Train a tabular Q-learning agent with epsilon-greedy, masked actions.

    Args:
        total_episodes: Number of training episodes to run.
        max_steps: Maximum number of environment steps per episode.
        learning_rate: Step size of the Q-value update.
        discount_rate: Discount factor applied to the bootstrapped value.
        max_epsilon: Starting exploration probability.
        min_epsilon: Floor of the exponentially decayed exploration probability.
        decay_rate: Exponential decay rate of epsilon, per episode.
        log_interval: Print and write a CSV row every this many episodes.
        log_animation: Capture rendered frames every this many episodes.
        log_filename: Path of the CSV log; it is overwritten at start.
        env: Environment to train on. It must expose ``action_space.n``,
            ``observation_space.n``, ``reset()``, ``step()`` and ``render()``,
            and put an ``"action_mask"`` array in the ``info`` dict. A fresh
            ``IrrigationEnv`` is created when omitted.

    Returns:
        Tuple ``(qtable, grids, log_interval, log_animation)``. ``grids`` holds
        one list of rendered frames per animated episode, in order, so entry
        ``i`` corresponds to episode ``(i + 1) * log_animation``.
    """
    if env is None:
        env = IrrigationEnv()

    action_size = env.action_space.n
    state_size = env.observation_space.n
    qtable = np.zeros((state_size, action_size))
    epsilon = max_epsilon
    grids = []

    # Start a fresh log file containing only the header row.
    with open(log_filename, mode="w", newline="") as file:
        csv.writer(file).writerow(["Episode", "Total Reward", "Steps", "Epsilon"])

    print(f"Starting Training: Total - {total_episodes}")

    for episode in range(total_episodes):
        state, info = env.reset()
        total_rewards = 0
        steps_taken = 0
        is_logging_episode = (episode + 1) % log_interval == 0
        is_animating = (episode + 1) % log_animation == 0
        episode_grids = []

        for _ in range(max_steps):
            action_mask = np.asarray(info["action_mask"])

            if np.random.uniform(0, 1) > epsilon:
                # Exploitation: best allowed action, ties broken at random.
                masked_q_values = qtable[state, :].copy()
                masked_q_values[action_mask == 0] = -np.inf
                best_actions = np.flatnonzero(masked_q_values == np.max(masked_q_values))
                action = np.random.choice(best_actions)
            else:
                # Exploration: uniform over allowed actions, or any action
                # when the mask allows none.
                valid_actions = np.flatnonzero(action_mask == 1)
                if len(valid_actions) > 0:
                    action = np.random.choice(valid_actions)
                else:
                    action = env.action_space.sample()

            new_state, reward, terminated, truncated, info = env.step(action)
            steps_taken += 1

            # Bootstrap only from a non-terminal next state that has at least
            # one allowed action. Otherwise the max over an all-masked row is
            # -inf and would corrupt the Q-table.
            next_valid = np.asarray(info["action_mask"]) != 0
            if terminated or not next_valid.any():
                best_next_q = 0.0
            else:
                best_next_q = np.max(qtable[new_state, next_valid])

            target_q = reward + discount_rate * best_next_q
            qtable[state, action] += learning_rate * (target_q - qtable[state, action])

            total_rewards += reward
            state = new_state

            if is_animating:
                episode_grids.append(env.render())
            # A truncated episode must be reset before stepping again.
            if terminated or truncated:
                break

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

        # Store frames only for animated episodes so the list index maps
        # directly onto multiples of log_animation.
        if is_animating:
            grids.append(episode_grids)

        if is_logging_episode:
            print(f"Episode {episode + 1}: Reward: {total_rewards:.1f}, Epsilon: {epsilon:.4f}")
            with open(log_filename, mode="a", newline="") as file:
                csv.writer(file).writerow([episode + 1, total_rewards, steps_taken, epsilon])

    print("Training Finished.")
    return (qtable, grids, log_interval, log_animation)

def validate_agent(qtable, num_episodes=5, *, max_steps=400, env=None):
    """Run the greedy policy from ``qtable`` and collect rewards and frames.

    Args:
        qtable: 2-D array indexed as ``qtable[state, action]``.
        num_episodes: Number of evaluation episodes to run.
        max_steps: Maximum number of environment steps per episode.
        env: Environment to evaluate on. It must put an ``"action_mask"``
            array in the ``info`` dict. A fresh ``IrrigationEnv`` is created
            when omitted.

    Returns:
        Tuple ``(avg_reward, validation_grids)``: the mean episode reward and
        one list of rendered frames per episode.
    """
    print("\n--- Starting Validation ---")
    if env is None:
        env = IrrigationEnv()
    total_test_rewards = []
    validation_grids = []

    for episode in range(num_episodes):
        state, info = env.reset()
        episode_reward = 0
        episode_grids = []

        for _ in range(max_steps):
            # Greedy choice among allowed actions, ties broken at random.
            action_mask = np.asarray(info["action_mask"])
            masked_q_values = qtable[state, :].copy()
            masked_q_values[action_mask == 0] = -np.inf
            best_actions = np.flatnonzero(masked_q_values == np.max(masked_q_values))
            action = np.random.choice(best_actions)

            state, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward

            episode_grids.append(env.render())
            # A truncated episode must be reset before stepping again.
            if terminated or truncated:
                break

        total_test_rewards.append(episode_reward)
        print(f"Validation Episode {episode + 1}: Total Reward = {episode_reward:.1f}")
        validation_grids.append(episode_grids)

    avg_reward = np.mean(total_test_rewards)

    return (avg_reward, validation_grids)


# Train the agent, then render the frames captured during training.
qtable, train_grids, log_int, log_animation = train_agent()
animate_training_progression(
    train_grids,
    log_interval=log_animation,
    save_path="test-run.gif",
    label="Test",
)

# Run several validation rounds with the learned Q-table, saving one
# animation per round and accumulating the per-round average reward.
validation_iteration = 10
validation_total = 0
for i in range(validation_iteration):
    avg_reward, val_grids = validate_agent(qtable, num_episodes=5)
    animate_training_progression(
        val_grids,
        log_interval=1,
        save_path=f"validation_run-{i}.gif",
        label="Validation",
    )
    print(f"\nValidation #{i} Finished. Average Reward: {avg_reward:.1f}")
    validation_total = validation_total + avg_reward

# Mean of the per-round averages across all validation rounds.
validation_avg = validation_total / validation_iteration


Starting Training: Total - 50000
Episode 500: Reward: -1787.5, Epsilon: 0.7814
Episode 1000: Reward: 642.1, Epsilon: 0.6108
Episode 1500: Reward: -1340.5, Epsilon: 0.4779
Episode 2000: Reward: -1388.3, Epsilon: 0.3744
Episode 2500: Reward: -1872.7, Epsilon: 0.2938
Episode 3000: Reward: -1547.5, Epsilon: 0.2310
Episode 3500: Reward: -1338.7, Epsilon: 0.1821
Episode 4000: Reward: -1367.2, Epsilon: 0.1440
Episode 4500: Reward: 203.0, Epsilon: 0.1144
Episode 5000: Reward: 8.2, Epsilon: 0.0913
Episode 5500: Reward: -1164.8, Epsilon: 0.0733
Episode 6000: Reward: -551.3, Epsilon: 0.0593
Episode 6500: Reward: -962.6, Epsilon: 0.0484
Episode 7000: Reward: 816.4, Epsilon: 0.0399
Episode 7500: Reward: 871.8, Epsilon: 0.0333
Episode 8000: Reward: 370.5, Epsilon: 0.0281
Episode 8500: Reward: 907.1, Epsilon: 0.0241
Episode 9000: Reward: -1686.5, Epsilon: 0.0210
Episode 9500: Reward: 500.6, Epsilon: 0.0186
Episode 10000: Reward: 997.2, Epsilon: 0.0167
Episode 10500: Reward: -293.0, Epsilon: 0.0152
Ep