## On-Policy Monte Carlo Control

In [None]:
import numpy as np
from racetrack_env import RacetrackEnv
import matplotlib.pyplot as plt


class Agent:
    """On-policy first-visit Monte Carlo control with an epsilon-soft policy.

    The agent keeps tabular action-value estimates and an epsilon-greedy
    policy, both stored as dicts keyed by state.  After ``mc()`` has run the
    learned tables are available as ``self.policy`` and ``self.Q``.

    Args:
        env: environment object.  The code relies on ``env.get_actions()``,
            ``env.track`` (indexable 2-D grid), ``env.reset()`` returning the
            start state, and ``env.step(action)`` returning
            ``(next_state, reward, terminal)``.
        gamma: discount factor applied to future rewards.
        epsilon: exploration rate of the epsilon-greedy policy.
        steps: number of episodes to train for.
    """

    def __init__(self, env, gamma, epsilon, steps):
        # parameters
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.steps = steps

        # non-parameters
        self.action_space = len(env.get_actions())
        self.state_space = []

        # Learned tables; populated by mc() so they survive after training.
        self.policy = None
        self.Q = None

    def initialise(self):
        """Build the initial tables used by ``mc()``.

        Returns:
            A tuple ``(policy, Q, returns, count, reward_dict)`` where

            * ``policy[state]`` is a list of action probabilities (uniform),
            * ``Q[state]`` is a list of action values (all zero),
            * ``returns[state, action]`` is the running sum of returns,
            * ``count[state, action]`` is the number of returns summed,
            * ``reward_dict[episode]`` is the undiscounted episode return.
        """
        policy = {}
        Q = {}
        returns = {}
        count = {}

        n_actions = self.action_space
        x_max, y_max = len(self.env.track), len(self.env.track[0])
        # NOTE(review): assumes env states are (x, y, x_vel, y_vel) tuples
        # with velocities in [-10, 10] -- confirm against RacetrackEnv.
        velocities = range(-10, 11)

        # arbitrary initialisation of main variables
        for x in range(x_max + 1):
            for y in range(y_max + 1):
                for x_vel in velocities:
                    for y_vel in velocities:
                        state = (x, y, x_vel, y_vel)

                        policy[state] = [1 / n_actions] * n_actions
                        Q[state] = [0] * n_actions

                        for action in range(n_actions):
                            returns[state, action] = 0
                            count[state, action] = 0

        # episode: sum reward of episode
        reward_dict = {itr: 0 for itr in range(self.steps)}

        return policy, Q, returns, count, reward_dict

    def mc(self):
        """Train for ``self.steps`` episodes with first-visit MC control.

        Returns:
            dict mapping episode index to that episode's undiscounted return.
        """
        policy, Q, returns, count, reward_dict = self.initialise()
        self.policy, self.Q = policy, Q

        # Loop for each episode
        for itr in range(self.steps):
            episode = self.run_episode(policy)

            # sum of rewards per episode == undiscounted return (for plotting)
            reward_dict[itr] = sum(r for (_, _, r) in episode)

            # Time step at which each (state, action) pair first occurs, so
            # the first-visit test below is a single dict lookup.
            first_visit = {}
            for t, (s, a, _) in enumerate(episode):
                first_visit.setdefault((s, a), t)

            # Sweep backwards so that G is the discounted return that
            # *follows* step t: G_t = r_t + gamma * G_{t+1}.
            G = 0
            for t in range(len(episode) - 1, -1, -1):
                s, a, r = episode[t]
                G = self.gamma * G + r

                # first-visit: only the earliest occurrence contributes
                if first_visit[(s, a)] != t:
                    continue

                returns[(s, a)] += G
                count[(s, a)] += 1
                Q[s][a] = returns[(s, a)] / count[(s, a)]
                self._make_epsilon_greedy(policy[s], Q[s])

        return reward_dict

    def _make_epsilon_greedy(self, probs, q_values):
        """Overwrite ``probs`` in place with the epsilon-greedy distribution for ``q_values``."""
        # Ties are broken in favour of the lowest action index.
        best = q_values.index(max(q_values))
        explore = self.epsilon / self.action_space
        for action in range(self.action_space):
            probs[action] = explore
        probs[best] = (1 - self.epsilon) + explore

    def run_episode(self, policy):
        """Roll out one episode following ``policy``.

        Args:
            policy: dict mapping state to a list of action probabilities.

        Returns:
            List of ``(state, action, reward)`` tuples, one per step, ending
            with the step on which the environment reported ``terminal``.
        """
        state = self.env.reset()
        episode = []
        while True:
            # sample an action from the (epsilon-greedy) policy
            action = np.random.choice(self.action_space, p=policy[state])
            next_state, reward, terminal = self.env.step(action)
            episode.append((state, action, reward))
            state = next_state
            if terminal:
                break
        return episode
        



        
        
def agent_avrg(agents, steps, gamma, epsilon):
    """Average the per-episode undiscounted return over several agents.

    A single ``RacetrackEnv`` is created and shared; for each of the
    ``agents`` runs it is reset, a fresh ``Agent`` is trained for ``steps``
    episodes, and its per-episode returns are accumulated.

    Args:
        agents: number of independent agents to train.
        steps: number of episodes each agent is trained for.
        gamma: discount factor passed to each ``Agent``.
        epsilon: exploration rate passed to each ``Agent``.

    Returns:
        dict mapping episode index to the mean undiscounted return.
    """
    # episode : running total of returns across agents
    totals = dict.fromkeys(range(steps), 0)

    env = RacetrackEnv()
    for _ in range(agents):
        env.reset()

        learner = Agent(env, gamma, epsilon, steps)
        per_episode = learner.mc()

        for episode_idx in totals:
            totals[episode_idx] += per_episode[episode_idx]

    return {
        episode_idx: total / agents
        for episode_idx, total in totals.items()
    }
                
    
    
    
    
    
# Train: learning curve averaged over 20 agents, 150 episodes each.
x = agent_avrg(agents=20, steps=150, gamma=0.9, epsilon=0.15)


# Plot the mean undiscounted return against the episode index.
ep = list(x)
reward_mc = list(x.values())
plt.plot(ep, reward_mc)
plt.title("20 Agents - On Policy - MC Control")
plt.ylabel("Undiscounted Return")
plt.xlabel("Episode");