### Cab-Driver Agent

In [1]:
# XLA service initialization - CUDA
import os
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [2]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import tensorflow as tf
import time

# for building DQN model
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

In [3]:
print(tf.__version__)

2.4.1


In [4]:
print("GPUs: ", len(tf.config.experimental.list_physical_devices('GPU')))

GPUs:  1


#### Defining Time Matrix

In [7]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy", allow_pickle=True)

#### Check what the max, min and mean time values are. This will help us in defining the 'next_step' function in the Environment.

In [8]:
print(type(Time_matrix))
print(Time_matrix.max())
print(Time_matrix.min())
print(Time_matrix.mean())
print(Time_matrix.var())

<class 'numpy.ndarray'>
11.0
0.0
3.0542857142857143
7.93705306122449


#### Since the max time is 11 hours between any 2 points, the next state of the cab driver may increase at most by 1 day.

#### Tracking the state-action pairs for checking convergence


In [9]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [10]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.01
        self.epsilon = 1
        self.epsilon_max = 1
        self.epsilon_decay = -0.0005 #for 15k
        #self.epsilon_decay = -0.00015 #for 20k
        self.epsilon_min = 0.00001
        
        self.batch_size = 32

        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # Initialize the value of the states tracked
        self.states_tracked = []
        
        # We are going to track state [0,0,0] and action (0,2) at index 2 in the action space.
        self.track_state = np.array(env.state_encod_arch1([0,0,0])).reshape(1, 36)

        # create main model and target model
        self.model = self.build_model()

    # approximate Q function using Neural Network
    def build_model(self):
        """
        Function that takes in the agent and constructs the network
        to train it
        @return model
        @params agent
        """
        input_shape = self.state_size
        model = Sequential()
        # Write your code here: Add layers to your neural nets       
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model

    def get_action(self, state, possible_actions_index, actions):
        """
        get action in a state according to an epsilon-greedy approach
        possible_actions_index, actions are the 'ride requests' that teh driver got.
        """        
        # get action from model using epsilon-greedy policy
        # Decay in ε after each episode       
        if np.random.rand() <= self.epsilon:
            # explore: choose a random action from the ride requests
            return random.choice(possible_actions_index)
        else:
            # choose the action with the highest q(s, a)
            # the first index corresponds to the batch size, so
            # reshape state to (1, state_size) so that the first index corresponds to the batch size
            state = np.array(env.state_encod_arch1(state)).reshape(1, 36)

            # Use the model to predict the Q_values.
            q_value = self.model.predict(state)

            # truncate the array to only those actions that are part of the ride  requests.
            q_vals_possible = [q_value[0][i] for i in possible_actions_index]

            return possible_actions_index[np.argmax(q_vals_possible)]

    def append_sample(self, state, action_index, reward, next_state, done):
        """appends the new agent run output to replay buffer"""
        self.memory.append((state, action_index, reward, next_state, done))
        
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """ 
        Function to train the model on eacg step run.
        Picks the random memory events according to batch size and 
        runs it through the network to train it.
        """
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            # initialise two matrices - update_input and update_output
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            actions, rewards, done = [], [], []

            # populate update_input and update_output and the lists rewards, actions, done
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = mini_batch[i]
                update_input[i] = env.state_encod_arch1(state)     
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)

            # predict the target q-values from states s
            target = self.model.predict(update_input)
            # target for q-network
            target_qval = self.model.predict(update_output)


            # update the target values
            for i in range(self.batch_size):
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                else: # non-terminal state
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])
            # model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)
            
    def save_tracking_states(self):
        # Use the model to predict the q_value of the state we are tacking.
        q_value = self.model.predict(self.track_state)
        
        # Grab the q_value of the action index that we are tracking.
        self.states_tracked.append(q_value[0][2])
        
    def save_test_states(self):
        # Use the model to predict the q_value of the state we are tacking.
        q_value = self.model.predict(self.track_state)
        
        # Grab the q_value of the action index that we are tracking.
        self.states_test.append(q_value[0][2])

    def save(self, name):
        self.model.save(name)


### DQN block

for episode in range(Episodes):

    # Write code here
    # Call the environment
    # Call all the initialised variables of the environment
    

    #Call the DQN agent
    
    
    while !terminal_state:
        
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        # 2. Evaluate your reward and next state
        # 3. Append the experience to the memory
        # 4. Train the model by calling function agent.train_model
        # 5. Keep a track of rewards, Q-values, loss
        

In [11]:
episode_time = 24*30 #30 days before which car has to be recharged
n_episodes = 15000
m = 5
t = 24
d = 7

# Invoke Env class
env = CabDriver()
action_space, state_space, state = env.reset()

# Set up state and action sizes.
state_size = m+t+d
action_size = len(action_space)

# Invoke agent class
agent = DQNAgent(action_size=action_size, state_size=state_size)

# to store rewards in each episode
rewards_per_episode, episodes = [], []
# Rewards for state [0,0,0] being tracked.
rewards_init_state = []

#### Run the episodes, build up replay buffer and train the model.
Note:
The moment total episode time exceeds 720 (30 days), we ignore the most recent ride and do NOT save that experience in the replay memory
The init state is randomly picked from the state space for each episode

In [None]:
start_time = time.time()
score_tracked = []

for episode in range(n_episodes):

    done = False
    score = 0
    track_reward = False

    # reset at the start of each episode
    env = CabDriver()
    action_space, state_space, state = env.reset()
    # Save the initial state so that reward can be tracked if initial state is [0,0,0]
    initial_state = env.state_init


    total_time = 0  # Total time driver rode in this episode
    while not done:
        # 1. Get a list of the ride requests driver got.
        possible_actions_indices, actions = env.requests(state)
        # 2. Pick epsilon-greedy action from possible actions for the current state.
        action = agent.get_action(state, possible_actions_indices, actions)

        # 3. Evaluate your reward and next state
        reward, next_state, step_time = env.step(state, env.action_space[action], Time_matrix)
        # 4. Total time driver rode in this episode
        total_time += step_time
        if (total_time > episode_time):
            # if ride does not complete in stipu;ated time skip
            # it and move to next episode.
            done = True
        else:
            # 5. Append the experience to the memory
            agent.append_sample(state, action, reward, next_state, done)
            # 6. Train the model by calling function agent.train_model
            agent.train_model()
            # 7. Keep a track of rewards, Q-values, loss
            score += reward
            state = next_state

    # store total reward obtained in this episode
    rewards_per_episode.append(score)
    episodes.append(episode)
    

    # epsilon decay
    agent.epsilon = (1 - 0.00001) * np.exp(agent.epsilon_decay * episode)

    # every 10 episodes:
    if ((episode + 1) % 10 == 0):
        print("episode {0}, reward {1}, memory_length {2}, epsilon {3} total_time {4}".format(episode,
                                                                         score,
                                                                         len(agent.memory),
                                                                         agent.epsilon, total_time))
    # Save the Q_value of the state, action pair we are tracking
    if ((episode + 1) % 5 == 0):
        agent.save_tracking_states()

    # Total rewards per episode
    score_tracked.append(score)

    if(episode % 1000 == 0):
        print("Saving Model {}".format(episode))
        agent.save(name="model_weights.h5")
    
elapsed_time = time.time() - start_time
print(elapsed_time)

Saving Model 0
episode 9, reward -175.0, memory_length 1369, epsilon 0.9955001547284723 total_time 734.0
episode 19, reward 191.0, memory_length 2000, epsilon 0.9905350769930761 total_time 727.0
episode 29, reward -216.0, memory_length 2000, epsilon 0.9855947626861951 total_time 722.0
episode 39, reward -243.0, memory_length 2000, epsilon 0.9806790882997144 total_time 721.0
episode 49, reward -198.0, memory_length 2000, epsilon 0.9757879309415182 total_time 726.0
episode 59, reward -292.0, memory_length 2000, epsilon 0.9709211683324178 total_time 724.0
episode 69, reward 99.0, memory_length 2000, epsilon 0.9660786788030947 total_time 729.0
episode 79, reward -165.0, memory_length 2000, epsilon 0.9612603412910584 total_time 724.0
episode 89, reward -337.0, memory_length 2000, epsilon 0.9564660353376199 total_time 725.0
episode 99, reward -63.0, memory_length 2000, epsilon 0.9516956410848808 total_time 732.0
episode 109, reward -206.0, memory_length 2000, epsilon 0.9469490392727365 total

episode 909, reward 216.0, memory_length 2000, epsilon 0.6347589235987051 total_time 725.0
episode 919, reward 314.0, memory_length 2000, epsilon 0.631593050259626 total_time 721.0
episode 929, reward 170.0, memory_length 2000, epsilon 0.6284429667796988 total_time 721.0
episode 939, reward 374.0, memory_length 2000, epsilon 0.6253085944066726 total_time 727.0
episode 949, reward 701.0, memory_length 2000, epsilon 0.6221898547810748 total_time 722.0
episode 959, reward 646.0, memory_length 2000, epsilon 0.6190866699342522 total_time 726.0
episode 969, reward 541.0, memory_length 2000, epsilon 0.6159989622864221 total_time 722.0
episode 979, reward 321.0, memory_length 2000, epsilon 0.6129266546447325 total_time 726.0
episode 989, reward 335.0, memory_length 2000, epsilon 0.6098696702013323 total_time 722.0
episode 999, reward 474.0, memory_length 2000, epsilon 0.6068279325314512 total_time 726.0
Saving Model 1000
episode 1009, reward 339.0, memory_length 2000, epsilon 0.603801365591488

episode 1799, reward 867.0, memory_length 2000, epsilon 0.4067689276701942 total_time 729.0
episode 1809, reward 941.0, memory_length 2000, epsilon 0.40474015917966877 total_time 726.0
episode 1819, reward 946.0, memory_length 2000, epsilon 0.4027215092142031 total_time 731.0
episode 1829, reward 1086.0, memory_length 2000, epsilon 0.4007129273074429 total_time 729.0
episode 1839, reward 1081.0, memory_length 2000, epsilon 0.39871436324473586 total_time 721.0
episode 1849, reward 911.0, memory_length 2000, epsilon 0.3967257670618763 total_time 721.0
episode 1859, reward 847.0, memory_length 2000, epsilon 0.3947470890438561 total_time 732.0
episode 1869, reward 419.0, memory_length 2000, epsilon 0.3927782797236218 total_time 722.0
episode 1879, reward 956.0, memory_length 2000, epsilon 0.3908192898808378 total_time 721.0
episode 1889, reward 707.0, memory_length 2000, epsilon 0.388870070540656 total_time 726.0
episode 1899, reward 971.0, memory_length 2000, epsilon 0.38693057297249134 t

episode 2689, reward 1196.0, memory_length 2000, epsilon 0.26066740358669477 total_time 721.0
episode 2699, reward 873.0, memory_length 2000, epsilon 0.25936731948751673 total_time 721.0
episode 2709, reward 1140.0, memory_length 2000, epsilon 0.2580737195848345 total_time 723.0
episode 2719, reward 1126.0, memory_length 2000, epsilon 0.25678657153858325 total_time 724.0
episode 2729, reward 956.0, memory_length 2000, epsilon 0.2555058431699948 total_time 722.0
episode 2739, reward 1381.0, memory_length 2000, epsilon 0.25423150246079323 total_time 727.0
episode 2749, reward 1055.0, memory_length 2000, epsilon 0.2529635175523944 total_time 721.0
episode 2759, reward 826.0, memory_length 2000, epsilon 0.25170185674510953 total_time 725.0
episode 2769, reward 938.0, memory_length 2000, epsilon 0.25044648849735274 total_time 727.0
episode 2779, reward 1406.0, memory_length 2000, epsilon 0.2491973814248526 total_time 724.0
episode 2789, reward 1449.0, memory_length 2000, epsilon 0.247954504

In [None]:
agent.save(name="model_weights.h5")

### Tracking Convergence

In [None]:
agent.states_tracked

In [None]:
state_tracked_sample = [agent.states_tracked[i] for i in range(len(agent.states_tracked)) if agent.states_tracked[i] < 1000]

In [None]:
plt.figure(0, figsize=(16,7))
plt.title('Q_value for state [0,0,0]  action (0,2)')
xaxis = np.asarray(range(0, len(agent.states_tracked)))
plt.semilogy(xaxis,np.asarray(agent.states_tracked))
plt.show()

In [None]:
score_tracked_sample = [score_tracked[i] for i in range(len(score_tracked)) if (i % 4 == 0)]

In [None]:
plt.figure(0, figsize=(16,7))
plt.title('Rewards per episode')
xaxis = np.asarray(range(0, len(score_tracked_sample)))
plt.plot(xaxis,np.asarray(score_tracked_sample))
plt.show()

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()