## Tabular Dyna-Q

In [None]:
import numpy as np
import random
from racetrack_env import RacetrackEnv
import matplotlib.pyplot as plt

class Agent:
    """Tabular Dyna-Q agent.

    The environment object only needs the members this class touches:
    ``get_actions()``, ``track`` (indexable 2-D grid), ``reset()`` returning
    a state and ``step(action)`` returning ``(next_state, reward, terminal)``.
    States are used as dictionary keys, so they are presumably the
    ``(x, y, x_vel, y_vel)`` tuples built in :meth:`initialise` -- confirm
    against the environment implementation.

    Args:
        env: environment instance (see above).
        alpha: learning rate of the Q-learning update.
        gamma: discount factor.
        epsilon: probability of taking a uniformly random action.
        steps: number of episodes to run (each outer iteration of
            :meth:`dynaQ` resets the environment once).
        n: number of planning (model-based) updates per real step.
    """

    def __init__(self, env, alpha, gamma, epsilon, steps, n):
        # parameters
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.steps = steps
        self.n = n

        # non-parameters
        # number of discrete actions; actions are addressed by index
        self.action_space = len(env.get_actions())
        # kept for compatibility; not used by the methods of this class
        self.state_space = []

    def initialise(self):
        """Build the zero-initialised tables used by :meth:`dynaQ`.

        Every table is keyed by ``(x, y, x_vel, y_vel)`` for
        ``x`` in ``0..len(track)``, ``y`` in ``0..len(track[0])`` (both
        inclusive) and velocities in ``-10..10``.

        Returns:
            tuple: ``(policy, Q, reward_dict, model, visited)`` where

            * ``policy[state]`` is a uniform probability list over actions,
            * ``Q[state]`` is a list of action values (all 0),
            * ``reward_dict[episode]`` is the episode return (all 0),
            * ``model[state]`` has one slot per action, later filled with
              ``(reward, next_state)``,
            * ``visited[state]`` has one 0/1 flag per action.
        """
        policy = {}
        Q = {}
        model = {}
        visited = {}

        # episode index -> undiscounted return of that episode
        reward_dict = {itr: 0 for itr in range(self.steps)}

        # Distinct names for the grid dimensions: the loop variables below
        # must not overwrite the bounds they are iterating over.
        n_rows, n_cols = len(self.env.track), len(self.env.track[0])
        vel = range(-10, 11)
        n_actions = self.action_space

        # arbitrary initialisation of main variables
        for x in range(n_rows + 1):
            for y in range(n_cols + 1):
                for x_vel in vel:
                    for y_vel in vel:
                        state = (x, y, x_vel, y_vel)

                        policy[state] = [1 / n_actions] * n_actions
                        Q[state] = [0] * n_actions
                        model[state] = [0] * n_actions
                        visited[state] = [0] * n_actions

        return policy, Q, reward_dict, model, visited

    def _q_update(self, Q, state, action, reward, next_state):
        """Apply one Q-learning backup to ``Q[state][action]`` in place."""
        target = reward + self.gamma * max(Q[next_state])
        Q[state][action] = Q[state][action] + (self.alpha * (target - Q[state][action]))

    def dynaQ(self):
        """Run ``self.steps`` episodes of tabular Dyna-Q.

        Each real step performs one direct Q-learning update, records the
        transition in the model, then performs ``self.n`` planning updates
        on transitions replayed from the model.

        Returns:
            dict: episode index -> undiscounted return of that episode,
            including the reward received on the terminal step.
        """
        # initialise policy, Q-table, rewards, and model
        # (the policy table is created by initialise() but not consulted here)
        _policy, Q, reward_dict, model, visited = self.initialise()

        # One entry per real step taken, so states visited more often are
        # proportionally more likely to be drawn during planning.
        state_v = []

        # Loop over episodes
        for itr in range(self.steps):

            # initialise S
            state = self.env.reset()

            # running undiscounted return of this episode
            episode_return = 0

            # Loop for each step of episode
            while True:

                # greedy action from Q(s); ties resolve to the lowest index
                action = Q[state].index(max(Q[state]))

                # explore with probability epsilon
                if np.random.uniform() < self.epsilon:
                    action = random.randint(0, self.action_space - 1)

                # take action A, observe R, S'
                next_state, reward, terminal = self.env.step(action)
                episode_return += reward

                # direct reinforcement-learning update
                self._q_update(Q, state, action, reward, next_state)

                # update model with reward and next_state
                model[state][action] = (reward, next_state)

                # store visited states and actions taken from state
                state_v.append(state)
                visited[state][action] = 1

                # planning: N simulated updates from the learned model
                for _ in range(self.n):

                    # random previously observed state
                    s = random.choice(state_v)

                    # random action previously taken in that state
                    a = random.choice(
                        [indx for indx, flag in enumerate(visited[s]) if flag == 1]
                    )

                    # get reward and next state from model
                    r, n_state = model[s][a]

                    self._q_update(Q, s, a, r, n_state)

                # move to next state
                state = next_state

                if terminal:
                    break

            # Recorded after the loop so the terminal step's reward is
            # included (previously it was skipped by the break above).
            reward_dict[itr] = episode_return

        return reward_dict

    
    
            
            
def agent_avrg(agents, steps, alpha, gamma, epsilon, n):
    """Average the per-episode undiscounted return over several agents.

    A single ``RacetrackEnv`` is created and shared; for each of the
    ``agents`` runs it is reset, a fresh ``Agent`` is built on it and
    trained with ``dynaQ()``.

    Args:
        agents: number of independent agents to train and average over.
        steps: number of episodes per agent (passed to ``Agent`` as ``steps``).
        alpha, gamma, epsilon, n: forwarded unchanged to ``Agent``.

    Returns:
        dict: episode index -> return for that episode averaged over agents.
    """
    # running sum of returns per episode across all agents
    totals = {episode: 0 for episode in range(steps)}

    env = RacetrackEnv()

    for _ in range(agents):
        env.reset()

        learner = Agent(env, alpha, gamma, epsilon, steps, n)
        returns = learner.dynaQ()

        for episode in totals:
            totals[episode] += returns[episode]

    # turn the sums into means
    return {episode: total / agents for episode, total in totals.items()}



# Training: mean learning curve of 20 agents over 150 episodes each,
# with 50 planning updates per real step.
x = agent_avrg(agents=20, steps=150, alpha=0.2, gamma=0.9, epsilon=0.15, n=50)


# Plot the averaged undiscounted return against the episode index.
ep = list(x)
reward_dynaQ = [x[episode] for episode in ep]
plt.plot(ep, reward_dynaQ)
plt.title("20 Agents - Dyna-Q (n=50)")
plt.ylabel("Undiscounted Return")
plt.xlabel("Episode");