### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import pylab
import sys
import os

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [None]:
!pip install tensorflow

#### Defining Time Matrix

In [None]:
# Loading the time matrix provided
# The array is read from "TM.npy" (relative to the working directory) and cast to integers.
Time_matrix = np.load("TM.npy").astype("int")

In [None]:
# Peek at one slice of the matrix. The training loop reads Time_matrix with four
# indices, so fixing the first two presumably leaves a 2-D table.
Time_matrix[1][2]

#### Tracking the state-action pairs for checking convergence


In [None]:
# Dictionary to store the Q-values predicted by the model for each possible action.
# Layout: Q_dict[state][action] -> list of recorded Q-values (filled by add_to_dict).
Q_dict = collections.defaultdict(dict)



In [None]:
# Initialising states to be tracked
def initialise_tracking_states():
    """Register the fixed (state, action) pairs whose Q-values are tracked.

    Every pair gets a fresh empty list in the module-level ``States_track``
    mapping; an entry that already exists for a pair is replaced.
    """
    tracked_pairs = (
        ((3, 22, 0), (5, 1)),
        ((1, 14, 6), (3, 1)),
        ((3, 6, 2), (5, 1)),
        ((3, 6, 2), (3, 1)),
    )

    for tracked_state, tracked_action in tracked_pairs:
        States_track[tracked_state][tracked_action] = []

In [None]:
# Dictionary to track selected state-action pairs.
# Layout: States_track[state][action] -> list; the entries are created by
# initialise_tracking_states() and extended by save_tracking_states().
States_track = collections.defaultdict(dict)

In [None]:
# Defining a function which will add new Q-values to the Q-dictionary. 
def add_to_dict(state, action, q_value):
    """Append ``q_value`` to the history stored for ``(state, action)`` in ``Q_dict``.

    ``action`` is converted to a tuple so that it can be used as a dictionary
    key; the history list is created the first time a pair is seen.
    """
    action_key = tuple(action)
    history = Q_dict[state].setdefault(action_key, [])
    history.append(q_value)

In [None]:
#Save the Q-values from Q-Dictionary for tracking
def save_tracking_states():
    """Copy the recorded Q-value histories of the tracked pairs into ``States_track``.

    For every (state, action) pair registered in ``States_track`` that also has
    an entry in ``Q_dict``, the history list held by ``Q_dict`` is appended to
    the tracking list. The list object itself is appended (not a copy), so later
    additions to ``Q_dict`` remain visible through ``States_track``.
    """
    for tracked_state, per_action in States_track.items():
        if tracked_state not in Q_dict:
            continue
        recorded = Q_dict[tracked_state]
        for tracked_action, snapshots in per_action.items():
            if tracked_action in recorded:
                snapshots.append(recorded[tracked_action])

In [None]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    """Pickle ``obj`` into the file ``<name>.pkl`` with the highest pickle protocol."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, complete the following parts of the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma * max_a' Q(s', a')
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [None]:
class DQNAgent:
    """DQN agent made of an online Q-network, a target Q-network and a replay memory.

    The class relies on names defined elsewhere in the notebook: ``CabDriver``
    (the environment), ``add_to_dict`` (Q-value logging) and the Keras classes
    ``Sequential``, ``Dense`` and ``Adam``.

    Convention used throughout: the last network output (index
    ``action_size - 1``) is the "no ride" action ``[0, 0]``; the outputs before
    it follow the order of the environment's action space.
    """

    def __init__(self, state_size, action_size):
        """Store the sizes, set the hyperparameters and build both networks.

        Args:
            state_size: length of the encoded state vector fed to the network.
            action_size: number of network outputs (one Q-value per action).
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.001
        # NOTE: despite its name, `epsilon_max` holds the *current* exploration
        # rate: get_action() reads it and the training loop overwrites it.
        self.epsilon_max = 1
        self.epsilon_decay = 0.0003  # not read anywhere inside this class
        self.epsilon_min = 0

        self.batch_size = 32
        # create replay memory using deque (oldest samples are dropped first)
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()

    # approximate Q function using Neural Network
    def build_model(self):
        """Build and compile the Q-network (state vector in, one Q-value per action out).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential()

        # hidden layers
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # the output layer: output is of size num_actions.
        # It must be linear: Q-values are unbounded regression targets, and a ReLU
        # here would make every negative Q-value unrepresentable.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))

        # `learning_rate` is the current keyword; `lr` is a deprecated alias.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        # model summary
        model.summary()
        return model

    # save the model graph as json
    def save_model_graph(self):
        """Write the architecture of the online model to ``cabdriver_dqn_model.json``."""
        # serialize model to JSON
        model_json = self.model.to_json()
        with open("cabdriver_dqn_model.json", "w") as json_file:
            json_file.write(model_json)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        """Copy the weights and biases of the online network into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        With probability ``self.epsilon_max`` a random element of the actions
        offered by the environment is returned; otherwise the offered action
        with the highest predicted Q-value is returned and that Q-value is
        logged through ``add_to_dict``.

        Args:
            state: environment state as accepted by ``CabDriver.requests`` and
                ``CabDriver.state_encod_arch1``.

        Returns:
            One element of the action list returned by ``CabDriver.requests``.
        """
        ev = CabDriver()
        # assumes requests() returns (indices of the offered rides, offered actions)
        # where the action list has one extra trailing entry, the no-ride action
        # [0, 0] - TODO confirm against Env.py
        possible_action_index, possible_actions = ev.requests(state)

        if np.random.rand() <= self.epsilon_max:
            # explore: choose a random action from all possible actions
            return random.choice(possible_actions)

        # exploit: encode the state and ask the online network for its Q-values
        en_state = ev.state_encod_arch1(state).reshape(1, self.state_size)
        q_value = self.model.predict(en_state, verbose=0)

        # Network outputs that correspond, position by position, to
        # `possible_actions`. The no-ride action uses the last output, derived
        # from action_size instead of a hard-coded 20.
        no_ride_index = self.action_size - 1
        candidate_indices = list(possible_action_index) + [no_ride_index]
        possible_action_q_value = [q_value[0][index] for index in candidate_indices]

        best_action_index = int(np.argmax(possible_action_q_value))
        possible_action_value = possible_actions[best_action_index]

        # add q-value to the Q-dictionary for checking the convergence
        add_to_dict(state, possible_action_value, np.max(possible_action_q_value))
        return possible_action_value

    def append_sample(self, state, action, reward, next_state):
        """Save the sample <s, a, r, s'> to the replay memory."""
        self.memory.append((state, action, reward, next_state))

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Nothing happens until the memory holds more than ``batch_size`` samples.
        For every sampled transition the target of the taken action becomes
        ``reward + discount_factor * max(Q_target(next_state))``; the targets of
        all other actions stay at the online network's own predictions.

        Raises:
            KeyError: if a stored action is not part of the environment's
                action space (plus the no-ride action).
        """
        if len(self.memory) <= self.batch_size:
            return

        ev = CabDriver()

        # Sample batch from the memory
        mini_batch = random.sample(self.memory, self.batch_size)

        update_input = np.zeros((self.batch_size, self.state_size))   # encoded states s
        update_output = np.zeros((self.batch_size, self.state_size))  # encoded next states s'
        actions, rewards = [], []

        for i, (state, action, reward, next_state) in enumerate(mini_batch):
            update_input[i] = ev.state_encod_arch1(state).reshape(self.state_size)
            update_output[i] = ev.state_encod_arch1(next_state).reshape(self.state_size)
            actions.append(action)
            rewards.append(reward)

        # Cast kept from the original implementation; presumably the encoding is
        # one-hot so nothing is lost - TODO confirm against Env.py
        update_input = update_input.astype('int')
        update_output = update_output.astype('int')

        # One batched forward pass per network instead of one call per sample.
        # np.array(...) gives a writable float64 copy that is edited below.
        target = np.array(self.model.predict(update_input, verbose=0), dtype=float)
        target_qval = np.array(self.target_model.predict(update_output, verbose=0), dtype=float)

        # Map every action to its network output index: the environment's action
        # space in order, followed by the no-ride action. Keys are tuples so that
        # list and tuple spellings of the same action match; the first occurrence
        # wins, like list.index(). reset()[0] is the action space, as used by the
        # training cell of this notebook.
        action_lookup = {}
        for index, known_action in enumerate(list(ev.reset()[0]) + [[0, 0]]):
            action_lookup.setdefault(tuple(known_action), index)

        # update the target values of the actions that were actually taken
        taken = [action_lookup[tuple(action)] for action in actions]
        rows = np.arange(self.batch_size)
        target[rows, taken] = (
            np.asarray(rewards, dtype=float)
            + self.discount_factor * np.amax(target_qval, axis=1)
        )

        # Fit the online model on the updated targets
        self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)

    def save(self, name):
        """Save the online model under ``name``."""
        self.model.save(name)

In [None]:
# Number of training episodes run by the DQN loop
Episodes = 1500
# Per-episode budget that the accumulated `time_steps` is compared against
STEPS = 24 * 30
# The tracked Q-values are saved every `threshold` episodes
threshold = 30

In [None]:
# initialize tracking states: creates an empty list in States_track for each
# tracked (state, action) pair
initialise_tracking_states()

### DQN block

In [None]:
# Call the environment
env = CabDriver()
# reset() returns (action space, state space, initial state); one call is enough here
initial_env = env.reset()
action_space = initial_env[0]
state_space = initial_env[1]

# Call all the initialised variables of the environment
action_size = len(action_space)+1  # one extra output for the no-ride action (0, 0)
state_size = env.state_input[0].shape[0] + env.state_input[1].shape[0] + env.state_input[2].shape[0]

#Call the DQN agent
agent = DQNAgent(state_size=state_size , action_size=action_size)

scores, episodes, avg_rewards= [], [],[]

# Tunables of the loop
EPSILON_DECAY_RATE = 0.0009   # epsilon = eps_min + (1 - eps_min) * exp(-rate * episode)
SCORE_WINDOW = 30             # number of recent episodes that are averaged
TARGET_MEAN_SCORE = 2500      # training stops once the windowed mean exceeds this

# The directory only has to be created once, not on every episode
os.makedirs("saved_pickle_files", exist_ok=True)


def save_score_plot(path, x_values, y_values, line_format=None, xlabel=None, ylabel=None):
    """Draw y_values against x_values on a fresh figure, save it to `path`, close it."""
    fig, ax = plt.subplots()
    if line_format is None:
        ax.plot(x_values, y_values)
    else:
        ax.plot(x_values, y_values, line_format)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    fig.savefig(path)
    # Close the figure so that figures do not accumulate over the episodes
    plt.close(fig)


for episode in range(Episodes):

    curr_state = env.reset()[2]

    time_steps = 0
    score = 0

    while time_steps < STEPS:

        # 1. Pick epsilon-greedy action from possible actions for the current state
        curr_action = agent.get_action(curr_state)

        # 2. Evaluate the next state and the reward
        next_state = env.next_state_func(curr_state, curr_action, Time_matrix)
        reward = env.reward_func(curr_state, curr_action, Time_matrix)

        # 3. Append the experience to the memory
        agent.append_sample(curr_state, curr_action, reward, next_state)

        # 4. Train the model on every time step
        agent.train_model()

        # The no-ride action (0, 0) costs one time unit; a ride costs the
        # Time_matrix entry for (pickup, drop, curr_state[1], curr_state[2]).
        # NOTE(review): the travel time from the current location to the pickup
        # point is not added here - confirm against env.next_state_func.
        if curr_action[0] == 0 and curr_action[1] == 0:
            time_steps += 1
        else:
            time_steps += Time_matrix[curr_action[0]-1][curr_action[1]-1][curr_state[1]][curr_state[2]]

        # 5. Keep track of the rewards and move on
        score += reward
        curr_state = next_state

    # ---- end of episode (the loop above only ends once time_steps >= STEPS) ----

    # every episode update the target model to be same with model
    agent.update_target_model()

    scores.append(score)
    episodes.append(episode)
    save_score_plot("cabdriver_dqn.png", episodes, scores, line_format='b')
    print("episode:", episode, "  score:", score, "  memory length:",
          len(agent.memory), "  epsilon:", agent.epsilon_max)

    # mean score of the most recent episodes
    recent_mean = np.mean(scores[-SCORE_WINDOW:])
    avg_rewards.append(recent_mean)
    solved = recent_mean > TARGET_MEAN_SCORE

    #Decay the epsilon
    if agent.epsilon_max > agent.epsilon_min:
        agent.epsilon_max = (agent.epsilon_min + (1 - agent.epsilon_min) * np.exp(-EPSILON_DECAY_RATE*episode))

    # save the model weights periodically and when the target score is reached
    if solved or episode % 10 == 0:
        agent.model.save_weights("cabdriver_dqn.h5")

    # save rewards_per_episode (the in-memory list stays the source of truth,
    # so there is no need to load the pickle back)
    save_obj(scores, "saved_pickle_files/scores")

    if ((episode+1)%threshold)==0:
        save_tracking_states()
        save_obj(States_track,'States_tracked')

    #plot episode and scores
    save_score_plot('rewards.png', list(range(len(scores))), scores,
                    xlabel="episode number", ylabel="scores")

    # Stop with a plain `break` rather than sys.exit(): the results above are
    # already persisted and the remaining notebook cells can still run.
    if solved:
        break
    
        

In [None]:
# Time accumulated in the last episode that ran (variable left over from the training loop)
time_steps

In [None]:
# Highest total reward collected in a single episode
np.max(scores)

### Tracking Convergence

In [None]:
# Plot the recorded Q-value history of every tracked (state, action) pair,
# one panel per pair in the top row of a 2 x 4 grid.
tracked_pairs = [
    ((3, 6, 2), (3, 1)),
    ((3, 22, 0), (5, 1)),
    ((3, 6, 2), (5, 1)),
    ((1, 14, 6), (3, 1)),
]

plt.figure(0, figsize=(16,7))

for panel, (tracked_state, tracked_action) in enumerate(tracked_pairs, start=1):
    # .get() avoids a KeyError (and avoids creating empty entries in the
    # defaultdict) when a pair was never registered
    snapshots = States_track.get(tracked_state, {}).get(tracked_action, [])
    if len(snapshots) == 0:
        continue  # nothing recorded for this pair yet

    # Each snapshot is the Q-value list that save_tracking_states() appended;
    # the first one is plotted.
    q_history = np.asarray(snapshots[0])
    plt.subplot(2, 4, panel)
    plt.plot(np.arange(len(q_history)), q_history)
    plt.title("state={} action={}".format(tracked_state, tracked_action))
    plt.xlabel("recorded sample")
    plt.ylabel("Q-value")

# `plt.show` without parentheses is a no-op; call it once for the whole figure
plt.show()

In [None]:
# Show the tracked Q-value snapshots collected during training
States_track

In [None]:
# Average reward (mean score over the trailing 30 episodes) against the episode index
episode_axis = list(range(len(avg_rewards)))
plt.plot(episode_axis, avg_rewards)
plt.ylabel("avg_rewards")
plt.xlabel("episode number")
plt.show()

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
# Sample schedule: epsilon starts at 1 and decays exponentially towards 0
# over 10000 steps (min 0, max 1, rate 0.0009).
time = np.arange(0,10000)
epsilon = [0 + (1 - 0) * np.exp(-0.0009*step) for step in range(0,10000)]

In [None]:
# Visualise the sample schedule: epsilon (y-axis) against the step index (x-axis)
plt.plot(time, epsilon)
plt.show()