# Cab-Driver Agent

### <a> 1) Importing Libraries

In [2]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time
from tqdm import tqdm

# for building DQN model

from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.callbacks import TensorBoard
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
# from Env_v1 import CabDriver

### <a>2) Defining Time Matrix

In [4]:
# Load the travel-time lookup table supplied with the problem statement.
# Local alternative when the file sits next to the notebook:
# Time_matrix = np.load("TM.npy")
# The environment indexes it as Time_matrix[from_city][to_city][hour][day].
Time_matrix = np.load("../input/cab-driverdeep-rl/TM.npy")

In [5]:
# Preview the first two entries along the first axis of the time matrix.
Time_matrix[:2]

In [7]:
# Summarise the time matrix: its dimensions and the range of travel durations.
matrix_shape = Time_matrix.shape
longest_trip = Time_matrix.max()
shortest_trip = Time_matrix.min()
print(f"The shape of Time Matrix is: {matrix_shape}")
print(f"The max travel duration encountered in a journey: {longest_trip}")
print(f"The min travel duration observed: {shortest_trip}")


The maximum travel duration encountered during a journey is 11 hours (less than 24 hours). A single leg of a trip can therefore cross midnight at most once, so when computing the cab driver's next state the day of the week needs to advance by at most one day.

### Environment class

In [8]:
# Import routines

import numpy as np
import math
import random

import matplotlib.pyplot as plt
import pickle
from matplotlib import style
import time


# Defining hyperparameters
m = 5 # number of cities, ranges from 0 ..... m-1
t = 24 # number of hours, ranges from 0 .... t-1
d = 7  # number of days, ranges from 0 ... d-1
C = 5 # Per hour fuel and other costs
R = 9 # per hour revenue from a passenger



class CabDriver():
    """Cab-driver MDP environment.

    A state is ``(location, hour, day)`` and an action is ``(pickup, drop)``.
    The action ``(0, 0)`` means "refuse every request and wait one hour".
    """

    def __init__(self):
        """Build the action space and state space and draw a random start state."""
        # 21 actions: (0, 0) for "no ride" plus every (pickup, drop) pair with
        # pickup != drop.  (0, 0) ends up at index 0.
        self.action_space = [(p, q)
                             for p in range(m) for q in range(m) if p != q or p == 0]
        self.state_space = [(loc, hour, day)
                            for loc in range(m) for hour in range(t) for day in range(d)]

        self.state_init = random.choice(self.state_space)
        self.action_init = random.choice(self.action_space)
        # Mean number of ride requests per hour, indexed by city 0 .. m-1
        self.poisson_dist = [2, 12, 4, 7, 8]

        # Starting the first round
        self.reset()


    ## Encoding state (or state-action) for NN input

    def state_encod_arch1(self, state):
        """Encode a state as a one-hot vector of length m + t + d.

        Args:
            state: sequence ``(location, hour, day)``.

        Returns:
            A list of 0/1 values with exactly three ones: one in the location
            segment, one in the hour segment and one in the day segment.
        """
        state_encod = [0] * (m + t + d)
        state_encod[state[0]] = 1           # Location
        state_encod[m + state[1]] = 1       # Time of the day
        state_encod[m + t + state[2]] = 1   # Day of the week
        return state_encod


    # Use this function if you are using architecture-2
    def state_encod_arch2(self, state, action):
        """Encode a (state, action) pair as a vector of length m + t + d + m + m.

        The first m + t + d entries are the one-hot state; the next m entries
        mark the pickup city and the last m the drop city.  For the no-ride
        action ``(0, 0)`` both action segments stay all-zero.

        Args:
            state: sequence ``(location, hour, day)``.
            action: sequence ``(pickup, drop)``.

        Returns:
            A list of 0/1 values.
        """
        state_encod = [0] * (m + t + d + m + m)
        state_encod[state[0]] = 1           # Location
        state_encod[m + state[1]] = 1       # Time of the day
        state_encod[m + t + state[2]] = 1   # Day of the week
        # tuple() so that a no-ride action passed as a list is recognised too
        if tuple(action) != (0, 0):
            state_encod[m + t + d + action[0]] = 1       # From
            state_encod[m + t + d + m + action[1]] = 1   # To

        return state_encod


    ## Getting number of requests

    def requests(self, state):
        """Sample the ride requests available to the driver in ``state``.

        The number of requests is Poisson distributed with the mean of the
        driver's current city and capped at 15.  The no-ride action is always
        appended as the last option.

        Args:
            state: sequence ``(location, hour, day)``.

        Returns:
            ``(possible_actions_index, actions)``: indices into
            ``self.action_space`` and the matching ``(pickup, drop)`` tuples.
        """
        location = state[0]
        # poisson_dist is indexed by city 0 .. m-1; the previous ``location-1``
        # shifted every city onto its neighbour's rate (city 0 wrapped to the last).
        requests = int(np.random.poisson(self.poisson_dist[location]))

        # Capping the requests to 15
        requests = min(requests, 15)

        # Index 0 is (0,0), which is not a customer request, so sample from 1 .. 20
        possible_actions_index = random.sample(range(1, (m - 1) * m + 1), requests)
        actions = [self.action_space[i] for i in possible_actions_index]

        # Append the no-ride option, as a tuple like every other action
        actions.append((0, 0))
        possible_actions_index.append(self.action_space.index((0, 0)))

        return possible_actions_index, actions



    def reward_func(self, state, action, Time_matrix):
        """Return the reward for taking ``action`` in ``state``.

        Refusing all rides costs one hour of running cost (``-C``).  Otherwise
        the reward is the revenue for the ride minus the running cost over the
        time spent driving to the pickup plus the ride itself.

        Args:
            state: sequence ``(location, hour, day)``.
            action: sequence ``(pickup, drop)``.
            Time_matrix: lookup indexed as ``[from][to][hour][day]``.

        Returns:
            The numeric reward.
        """
        _, _, travel_time, ride_time = self.next_state_func(state, action, Time_matrix)

        if action[0] == 0 and action[1] == 0:
            return -C

        # The ride time is charged once; the previous formula counted it twice.
        return R * ride_time - C * (travel_time + ride_time)

    def updated_day_time(self, time, day):
        """Wrap an hour count that may exceed 23 into a valid (day, hour) pair.

        Args:
            time: hour of day, possibly >= 24 after adding a duration.
            day: day of week before the rollover.

        Returns:
            ``(new_day, new_hour)`` with ``0 <= new_hour < t`` and ``0 <= new_day < d``.
        """
        days_passed, new_hour = divmod(time, t)
        new_day = (day + days_passed) % d
        return new_day, new_hour


    def next_state_func(self, state, action, Time_matrix):
        """Compute the state reached by taking ``action`` in ``state``.

        Args:
            state: sequence ``(location, hour, day)``.
            action: sequence ``(pickup, drop)``; ``(0, 0)`` means no ride.
            Time_matrix: lookup indexed as ``[from][to][hour][day]``.

        Returns:
            ``(next_state, wasted_time, travel_time, ride_time)`` where
            ``next_state`` is the list ``[location, hour, day]`` and the three
            durations are the idle hour, the drive to the pickup point and the
            ride itself.
        """
        curr_loc, curr_hour, curr_day = state[0], state[1], state[2]
        pickup_loc, drop_loc = action[0], action[1]

        wasted_time = 0
        travel_time = 0
        ride_time = 0

        if pickup_loc == 0 and drop_loc == 0:
            # 1) Ride refused: stay put and wait one hour for the next requests
            updated_loc = curr_loc
            wasted_time = 1
            updated_day, updated_hour = self.updated_day_time(curr_hour + wasted_time, curr_day)
        else:
            # 2) Ride accepted: first drive from the current city to the pickup city
            travel_time = Time_matrix[curr_loc][pickup_loc][curr_hour][curr_day]
            pickup_day, pickup_hour = self.updated_day_time(int(curr_hour + travel_time), curr_day)

            # The ride duration is looked up at the time the passenger is picked up
            ride_time = Time_matrix[pickup_loc][drop_loc][pickup_hour][pickup_day]

            # The driver becomes free again only once the ride is over, so the
            # ride duration must advance the clock as well.
            updated_day, updated_hour = self.updated_day_time(int(pickup_hour + ride_time), pickup_day)
            updated_loc = drop_loc

        next_state = [updated_loc, updated_hour, updated_day]
        return next_state, wasted_time, travel_time, ride_time



    def reset(self):
        """Return ``(action_space, state_space, state_init)`` to start an episode."""
        return self.action_space, self.state_space, self.state_init


### <a>3)  Agent Class

In [None]:
# # Own Tensorboard class
# class ModifiedTensorBoard(TensorBoard):

#     # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
#     def __init__(self, **kwargs):
#         super().__init__(**kwargs)
#         self.step = 1
        
#         self.writer = tf.summary.create_file_writer(self.log_dir)

#     # Overriding this method to stop creating default log writer
#     def set_model(self, model):
#         pass

#     # Overrided, saves logs with our step number
#     # (otherwise every .fit() will start writing from 0th step)
#     def on_epoch_end(self, epoch, logs=None):
#         self.update_stats(**logs)

#     # Overrided
#     # We train for one batch only, no need to save anything at epoch end
#     def on_batch_end(self, batch, logs=None):
#         pass

#     # Overrided, so won't close writer
#     def on_train_end(self, _):
#         pass

#     # Custom method for saving own metrics
#     # Creates writer, writes custom metrics and closes writer
#     def update_stats(self, **stats):
#         self._write_logs(stats, self.step)

In [None]:
# Label for this network variant; within this notebook it is referenced only by
# the commented-out TensorBoard logging line in the agent.
MODEL_NAME = '1st Architecture'

In [9]:
# Agent class
class DQNAgent:
    """Deep Q-learning agent: the state goes in, one Q-value per action comes out."""

    def __init__(self, state_size, action_size, env):
        """Store the problem dimensions, set hyperparameters and build the network.

        Args:
            state_size: length of the encoded state vector fed to the network.
            action_size: number of actions, i.e. number of network outputs.
            env: environment providing ``state_encod_arch1``.
        """
        # Define size of state and action
        self.state_size  = state_size
        self.action_size = action_size
        self.env = env

        # Hyperparameters for the DQN
        self.DISCOUNT = 0.95
        self.LR  = 0.001  # Learning Rate

        # Exploration settings
        self.epsilon_max = 1
        self.epsilon = self.epsilon_max
        self.EPSILON_DECAY = 0.0009
        self.MIN_EPSILON = 0.00001

        # Replay memory: only the most recent REPLAY_MEMORY_SIZE transitions are kept
        self.REPLAY_MEMORY_SIZE = 2_000
        self.replay_memory = deque(maxlen=self.REPLAY_MEMORY_SIZE)
        self.MINIBATCH_SIZE = 32  # How many steps (samples) to use for training

        # Q-values recorded over time for the tracked state
        self.states_tracked = []

        # We are going to track state [1,1,1]
        self.track_state = np.array(env.state_encod_arch1([1, 1, 1])).reshape(1, self.state_size)

        # Single Q-network (no separate target network is used)
        self.model = self.build_model()

    def _encode_state(self, state):
        """Return ``state`` as an encoded vector, encoding it only if it is still raw.

        Callers may hand over either a raw ``(location, hour, day)`` state or a
        vector that is already ``state_size`` long; encoding the latter a second
        time would produce a meaningless vector.
        """
        if len(state) == self.state_size:
            return state
        return self.env.state_encod_arch1(state)

    # approximate Q function using Neural Network
    def build_model(self):
        '''
        Build the multilayer perceptron that approximates Q(s, a).

        The input is the encoded state and the output is one Q-value per action.
        Since Q(s, a) is not restricted to [0, 1], the output layer is linear.

        Loss Function:
        Loss = 1/2 * (R_t + γ * max_a Q_t(S_{t+1}, a) − Q_t(S_t, a))^2
               which is 'mean squared error'

        Returns:
            The compiled Keras model.
        '''
        model = Sequential()
        model.add(Dense(32, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))     # action_size = how many choices (21)
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.LR))
        model.summary()
        return model


    def get_action(self, state, poss_rides_index, action_space):
        '''
        Choose an action with an ε-greedy policy, restricted to the offered rides.

        A uniform random number in [0, 1) is drawn.  If it is at most ε the agent
        explores by picking one of the offered actions at random; otherwise it
        exploits by picking the offered action with the highest predicted Q-value.

        Args:
            state: raw ``(location, hour, day)`` state or its encoded vector.
            poss_rides_index: indices into ``action_space`` of the actions
                currently available (ride requests plus the no-ride action).
            action_space: list of all ``(pickup, drop)`` actions.

        Returns:
            ``(action_index, action)`` where ``action_index`` is taken from
            ``poss_rides_index`` and ``action == action_space[action_index]``.
        '''
        if np.random.rand() <= self.epsilon:
            # Explore: choose among the feasible ride requests only
            action_index = random.choice(poss_rides_index)
        else:
            # Exploit: reshape to (batch_size, input_shape) and score all actions
            state_vector = np.array(self._encode_state(state)).reshape(1, self.state_size)
            q_val = self.model.predict(state_vector, verbose=0)[0]
            # Only actions that were actually offered may be selected
            best_offer = int(np.argmax([q_val[i] for i in poss_rides_index]))
            action_index = poss_rides_index[best_offer]

        return action_index, action_space[action_index]


    def update_replay_memory(self, state, action, reward, next_state, done):
        """Append the transition <s, a, r, s', done> to the replay memory."""
        self.replay_memory.append((state, action, reward, next_state, done))


    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        '''
        Run one training step on a random minibatch from the replay memory.

        1. Sample <s, a, r, s', done> of batch size from the memory
        2. Predict Q(s, ·) and Q(s', ·) with the current network
        3. For the action taken, set the target to r (terminal) or
           r + γ * max_a Q(s', a) (non-terminal); other outputs keep their prediction
        4. Fit the network on the updated targets

        Does nothing until the memory holds more than MINIBATCH_SIZE transitions.
        '''
        if len(self.replay_memory) <= self.MINIBATCH_SIZE:
            return

        # Sample batch from the memory
        mini_batch = random.sample(self.replay_memory, self.MINIBATCH_SIZE)

        # update_input holds the encoded states, update_output the encoded next states
        update_input = np.zeros((self.MINIBATCH_SIZE, self.state_size))
        update_output = np.zeros((self.MINIBATCH_SIZE, self.state_size))
        actions, rewards, done = [], [], []

        for i, (state, action, reward, next_state, done_boolean) in enumerate(mini_batch):
            update_input[i] = self._encode_state(state)
            update_output[i] = self._encode_state(next_state)
            actions.append(action)
            rewards.append(reward)
            done.append(done_boolean)

        # Q-values for the sampled states and for their successors
        current_qval = self.model.predict(update_input, verbose=0)
        future_qval = self.model.predict(update_output, verbose=0)

        # Overwrite only the entry of the action that was actually taken
        for i in range(self.MINIBATCH_SIZE):
            target = rewards[i]
            if not done[i]:  # non-terminal state: bootstrap from the next state
                target = rewards[i] + self.DISCOUNT * np.max(future_qval[i])
            current_qval[i][actions[i]] = target

        self.model.fit(update_input, current_qval,
                       batch_size=self.MINIBATCH_SIZE, epochs=1, verbose=0)


    def save(self):
        """Write the network weights and the full model to disk."""
        self.model.save_weights("weights.h5")
        # NOTE(review): despite the .pkl suffix this goes through Keras' own
        # model.save, not pickle - confirm the file name is what readers expect.
        self.model.save("my_model.pkl")


    def save_qval_for_tracked_state(self):
        """Record the current Q-value of the tracked state for action index 5."""
        # Use the model to predict the q_value of the state we are tracking.
        q_value = self.model.predict(self.track_state, verbose=0)

        # Action index 5 corresponds to action (1,0) in the environment's action space
        self.states_tracked.append(q_value[0][5])


### <a> 4) DQN block

In [12]:

# Training horizon
Episodes = 3000
episode_length = 720  # car discharges after this period

# Wall-clock reference used to report the total training time afterwards
start_time = time.time()

# Per-episode bookkeeping: total reward, episode number and averaged reward
rewards_per_episode = []
episodes = []
avg_rewards_per_episode = []

# Environment instance used to size the network and to encode states
env = CabDriver()

# Network dimensions: encoded state length in, one output per action
state_size = len(env.state_encod_arch1(env.state_init))
action_size = len(env.action_space)

# The DQN agent and the shared list of all (pickup, drop) actions
agent = DQNAgent(state_size, action_size, env)
action_space = env.action_space

In [None]:


# for episode in tqdm(range(1,Episodes +1), ascii = True, unit = 'episode'):
for episode in range(Episodes + 1):
    # A fresh environment per episode gives every episode its own random start state
    env = CabDriver()

    # Restarting episode - reset the accumulated reward and the step counter
    reward_per_timestep = 0   # running total of the rewards collected this episode
    time_step = 0

    # Reset environment and get initial state
    action_space, state_space, state = env.reset()

    # Reset flag and start iterating until the episode ends
    done = False
    total_journey = 0

    while not done:
        time_step += 1

        # 1. Get the list of ride requests the driver receives in this state
        possible_actions_index, actions = env.requests(state)

        # 2. Select an action with the epsilon-greedy policy.  The raw state is
        #    passed on: the agent does the encoding, so encoding here as well
        #    would encode the state twice.
        action_index, action = agent.get_action(state, possible_actions_index, action_space)

        # 3. Evaluate the next state
        next_state, wasted_time, travel_time, ride_time = env.next_state_func(state, action,
                                                                             Time_matrix)

        # 4. Evaluate the reward for this time step
        reward = env.reward_func(state, action, Time_matrix)

        # 5. Accumulate the time elapsed in the episode; it ends when the car is discharged
        total_journey += wasted_time + travel_time + ride_time
        if total_journey >= episode_length:
            print("Episode Terminated")
            done = True

        # 6. Store the transition (raw states; the agent encodes them when training)
        agent.update_replay_memory(state, action_index, reward, next_state, done)

        # 7. Train the network every 20 steps
        if time_step % 20 == 0:
            agent.train_model()

        # 8. Track the reward and move on to the next state.  Without this
        #    assignment every step would be taken from the initial state.
        reward_per_timestep += reward
        state = next_state

    # 9. Record the total reward obtained in this episode
    rewards_per_episode.append(reward_per_timestep)
    episodes.append(episode)

    # 10. Epsilon decay (exponential in the episode number)
    if agent.epsilon > agent.MIN_EPSILON:
        agent.epsilon = agent.MIN_EPSILON + (agent.epsilon_max - agent.MIN_EPSILON) * np.exp(-agent.EPSILON_DECAY*episode)

    # 11. Progress report
    print("episode: {0}, reward: {1}, memory_length: {2}, epsilon: {3} ".format(episode,
                                                                         reward_per_timestep,
                                                                         len(agent.replay_memory),
                                                                         agent.epsilon))

    # 12. Save the Q-value of the tracked state-action pair every 10 episodes
    if episode % 10 == 0:
        agent.save_qval_for_tracked_state()

    # 13. Checkpoint the model every 1000 episodes
    if episode % 1000 == 0:
        print("Saving Model and weights {}".format(episode))
        agent.save()
        

In [None]:
# Report how long training took, in seconds since start_time was recorded.
finished_at = time.time()
elapsed_time = finished_at - start_time
print(f"Total time taken  {elapsed_time}")

In [None]:
# `save_qval_for_tracked_state` is a method of the agent, not a global name, so
# the bare reference raised NameError.  Show the Q-values it has recorded instead
# (calling the method here would add an extra data point to the convergence plot).
agent.states_tracked

### Tracking Convergence <br>

There are two ways to check the convergence of the DQN model:

 - Sample a few state-action pairs and plot their Q-values along episodes

 - Check whether the total rewards earned per episode are showing stability

In [None]:
# Convergence check 1: total reward collected in each episode, in episode order
episode_axis = np.arange(len(rewards_per_episode))
plt.plot(episode_axis, rewards_per_episode)
plt.title('Rewards per episode')
plt.ylabel("Total rewards")
plt.show()

In [None]:
# Average reward over consecutive windows of 50 episodes; the last window may
# be shorter.  An empty reward history yields an empty list instead of a
# division by zero, and the `episodes` list is no longer overwritten.
window_size = 50
avg_rewards = []
for window_start in range(0, len(rewards_per_episode), window_size):
    window = rewards_per_episode[window_start:window_start + window_size]
    avg_rewards.append(sum(window) / len(window))

print(avg_rewards)

In [None]:
## Convergence check 2: averaged rewards, one point per averaging window
window_axis = np.arange(len(avg_rewards))
plt.plot(window_axis, avg_rewards)
plt.title('Average Rewards vs Episode')
plt.ylabel("Average rewards")
plt.show()

In [None]:
# Q-value of the tracked state-action pair over time, on a logarithmic y-axis.
tracked_q_values = np.asarray(agent.states_tracked)
xaxis = np.arange(len(agent.states_tracked))
plt.figure(0, figsize=(16,7))
plt.title('Q_value for state [1,1,1]  action (1,0)')
plt.semilogy(xaxis, tracked_q_values)
plt.show()

In [None]:

# state_tracked_sample = [agent.states_tracked[i] for i in range(len(agent.states_tracked)) if agent.states_tracked[i] < 1000]

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
# Sample epsilon schedule: exponential decay from 1 towards 0 over 10000 steps.
# NOTE(review): `time` rebinds the name of the imported `time` module, so cells
# calling time.time() fail if they are re-run after this one.
time = np.arange(10000)
epsilon = [0 + (1 - 0) * np.exp(-0.0009*step) for step in range(10000)]

In [None]:
# Plot the sample epsilon schedule against the step index.
plt.plot(time, epsilon)
plt.show()

In [None]:
# Runnable example: save and reload the weights of a small stand-alone model.
# The notebook only imports names *from* keras, so bring the module itself into scope.
from tensorflow import keras

sequential_model = keras.Sequential(
    [
        keras.Input(shape=(784,), name="digits"),
        keras.layers.Dense(64, activation="relu", name="dense_1"),
        keras.layers.Dense(64, activation="relu", name="dense_2"),
        keras.layers.Dense(10, name="predictions"),
    ]
)
# Use a dedicated file: "weights.h5" is where the agent stores its trained
# weights, and writing the example model there would overwrite them.
sequential_model.save_weights("example.weights.h5")
sequential_model.load_weights("example.weights.h5")

In [None]:
# Calling `save('my_model.h5')` creates a h5 file `my_model.h5`.
# `model` and `keras` were undefined names here; save the agent's trained
# network and import the keras module explicitly.
from tensorflow import keras

agent.model.save("my_h5_model.h5")

# It can be used to reconstruct the model identically.
reconstructed_model = keras.models.load_model("my_h5_model.h5")


In [None]:
# Calling `save('my_model')` creates a SavedModel folder `my_model`.
# `model` and `keras` were undefined names here; save the agent's trained
# network and import the keras module explicitly.
# NOTE(review): newer Keras releases may require a file extension on the
# path - confirm against the installed version.
from tensorflow import keras

agent.model.save("my_model")

# It can be used to reconstruct the model identically.
reconstructed_model = keras.models.load_model("my_model")