## Sarsa

In [None]:
import numpy as np
import random
from racetrack_env import RacetrackEnv
import matplotlib.pyplot as plt

class Agent:
    """Tabular on-policy SARSA agent with an epsilon-greedy policy.

    The environment is only used through four members: ``env.track``
    (nested sequence, used to size the pre-built tables),
    ``env.get_actions()`` (its length is the number of actions),
    ``env.reset()`` (returns the start state) and ``env.step(action)``
    (returns ``(next_state, reward, terminal)``).  Actions are passed to
    ``env.step`` as integer indices in ``range(action_space)``; states are
    used as dictionary keys and therefore must be hashable.
    """

    def __init__(self, env, alpha, gamma, epsilon, steps):
        """Store the environment and the learning hyper-parameters.

        Args:
            env: Environment object (see the class docstring).
            alpha: Step size of the SARSA update.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Exploration rate of the epsilon-greedy policy.
            steps: Number of episodes run by ``sarsa``.
        """
        # parameters
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.steps = steps

        # non-parameters
        self.action_space = len(env.get_actions())
        self.state_space = []

        # tables of the most recent sarsa() run; None until sarsa() is called
        self.policy = None
        self.Q = None

    def initialise(self):
        """Build the starting policy, Q-table and per-episode return table.

        Tables are pre-filled for every ``(x, y, x_vel, y_vel)`` with
        ``x`` in ``0..len(env.track)``, ``y`` in ``0..len(env.track[0])``
        and both velocities in ``-10..10``.

        Returns:
            tuple: ``(policy, Q, reward_dict)`` where ``policy[state]`` is a
            list of action probabilities (uniform), ``Q[state]`` is a list
            of action values (all zero), both indexed by action, and
            ``reward_dict`` maps each episode index to 0.
        """
        n_actions = self.action_space
        # distinct names for the bounds so the loop variables below do not
        # overwrite the values their own ranges are built from
        x_max, y_max = len(self.env.track), len(self.env.track[0])
        velocities = range(-10, 11)

        # state : list of probability of taking action == index
        policy = {}
        # state : list of Q values by action == index
        Q = {}

        # arbitrary initialisation of main variables
        for x in range(x_max + 1):
            for y in range(y_max + 1):
                for x_vel in velocities:
                    for y_vel in velocities:
                        state = (x, y, x_vel, y_vel)
                        # a fresh list per state: they are mutated in place
                        policy[state] = [1 / n_actions] * n_actions
                        Q[state] = [0] * n_actions

        # episode : sum of rewards of that episode
        reward_dict = {itr: 0 for itr in range(self.steps)}

        return policy, Q, reward_dict

    def _ensure_state(self, policy, Q, state):
        """Add default table rows for a state the tables do not contain yet."""
        if state not in Q:
            Q[state] = [0] * self.action_space
        if state not in policy:
            policy[state] = [1 / self.action_space] * self.action_space

    def _make_epsilon_greedy(self, probabilities, q_values):
        """Overwrite ``probabilities`` in place with the epsilon-greedy
        distribution for ``q_values`` (ties go to the lowest action index)."""
        optimal = q_values.index(max(q_values))
        explore = self.epsilon / self.action_space
        for a in range(self.action_space):
            if a == optimal:
                probabilities[a] = (1 - self.epsilon) + explore
            else:
                probabilities[a] = explore

    def sarsa(self):
        """Run ``self.steps`` episodes of SARSA on ``self.env``.

        The policy and Q-table in use are also stored on ``self.policy``
        and ``self.Q`` so they can be inspected after training.

        Returns:
            dict: episode index -> undiscounted sum of rewards of that
            episode.
        """
        # initialise policy and Q-table
        policy, Q, reward_dict = self.initialise()
        self.policy, self.Q = policy, Q

        # loop for each episode
        for itr in range(self.steps):

            # initialise S
            state = self.env.reset()
            self._ensure_state(policy, Q, state)

            # choose A from S using the current policy
            action = np.random.choice(self.action_space, p=policy[state])

            # running undiscounted return of this episode
            episode_return = 0

            # loop for each step of the episode
            while True:

                # take action A, observe R, S'
                next_state, reward, terminal = self.env.step(action)
                episode_return += reward

                if terminal:
                    # the value of a terminal state is zero by definition,
                    # so the target is the reward alone and no A' is needed
                    next_action = None
                    target = reward
                else:
                    # choose A' from S' using the current policy
                    self._ensure_state(policy, Q, next_state)
                    next_action = np.random.choice(
                        self.action_space, p=policy[next_state]
                    )
                    target = reward + self.gamma * Q[next_state][next_action]

                # update Q value according to SARSA
                Q[state][action] += self.alpha * (target - Q[state][action])

                # make the policy epsilon-greedy w.r.t. the updated Q[state]
                self._make_epsilon_greedy(policy[state], Q[state])

                if terminal:
                    break

                # move on to the next state-action pair
                state, action = next_state, next_action

            # undiscounted return per episode
            reward_dict[itr] = episode_return

        return reward_dict
    
def agent_avrg(agents, steps, alpha, gamma, epsilon):
    """Train several SARSA agents and average their per-episode returns.

    One ``RacetrackEnv`` is created and shared; for each agent the
    environment is reset, a new ``Agent`` is built with the given
    hyper-parameters and its ``sarsa()`` result is accumulated.

    Args:
        agents: Number of agents to train and average over.
        steps: Number of episodes each agent runs.
        alpha: Step size passed to every ``Agent``.
        gamma: Discount factor passed to every ``Agent``.
        epsilon: Exploration rate passed to every ``Agent``.

    Returns:
        dict: episode index -> undiscounted return of that episode,
        averaged over all agents.
    """
    # running per-episode totals across agents
    totals = dict.fromkeys(range(steps), 0)

    env = RacetrackEnv()
    for _ in range(agents):
        env.reset()
        episode_returns = Agent(env, alpha, gamma, epsilon, steps).sarsa()
        for episode in totals:
            totals[episode] += episode_returns[episode]

    # turn the totals into means
    return {episode: total / agents for episode, total in totals.items()}
          

    
    
    
# Train: per-episode return averaged over 20 SARSA agents.
x = agent_avrg(agents=20, steps=150, alpha=0.2, gamma=0.9, epsilon=0.15)

# Plot: averaged undiscounted return against episode index.
ep, reward_sarsa = list(x), list(x.values())
plt.plot(ep, reward_sarsa)
plt.title("20 Agents - Sarsa")
plt.ylabel("Undiscounted Return")
plt.xlabel("Episode");