In [None]:
# import libraries
import warnings
import numpy as np

import time
import matplotlib.pyplot as plt

# Let's do a relative import
import sys
import os

# Make the parent of the src_JokkeRuokolainen package importable.
# abspath() normalises the path to ".../src_JokkeRuokolainen/*"; the first
# split drops the literal "*" segment, the second climbs to the package's parent.
SCRIPT_DIR = os.path.split(os.path.abspath('../src_JokkeRuokolainen/*/'))[0]
sys.path.append(os.path.split(SCRIPT_DIR)[0])

from src_JokkeRuokolainen.environment_JokkeRuokolainen import CabDriver
from src_JokkeRuokolainen.agent_JokkeRuokolainen import DuelingQAgent
# Raise the summarisation threshold so NumPy prints arrays in full
# instead of abbreviating them with "...".
np.set_printoptions(threshold=sys.maxsize)
# Suppress all warnings for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")


In [None]:
import tensorflow as tf
# Display the physical devices TensorFlow can see in this runtime.
tf.config.list_physical_devices()

### Define taxi pickup time matrix

In [None]:
# Load the provided travel-time matrix from disk and show its dimensions.
Time_matrix = np.load("../Data/Inputs/TM.npy")
print(Time_matrix.shape)
# Indexing example (returns time taken):
#   Time_matrix[start_loc][end_loc][hour_of_the_day][day_of_the_week]
print(Time_matrix[3, 4, 17, 5])


### Initialize environment

In [None]:
# Pair every action in the environment's action space with its index.
env = CabDriver()
cab_action_space = env.action_space
cab_action_indices = list(range(len(cab_action_space)))
# enumerate() yields the same (action_index, action) tuples as zipping the
# index list with the action space.
action_list = list(enumerate(cab_action_space))
print("Action List (action_indices, action):")
action_list


In [None]:
# Summary statistics over every entry of the time matrix: shortest, longest
# and mean trip duration from one point to another.

print("Minimum time taken:", Time_matrix.min())
print("Maximum time taken:", Time_matrix.max())
print("Average time taken:", Time_matrix.mean())


The maximum time taken is 11 hours. 

So a single trip between any two locations always takes less than one day.

### Initialize tracking of state-action pairs during training

In [None]:
import collections
# Q-value history, addressed as states_track[state][action] -> list of values.
states_track = collections.defaultdict(dict)


def initialize_tracking_states():
    """Register the fixed sample of (state, action) pairs to monitor.

    Every pair gets a fresh empty list in ``states_track``; calling this
    again resets the history of those pairs.
    """
    tracked_pairs = (
        ((3, 0, 2), (3, 1)),
        ((1, 6, 3), (2, 3)),
        ((2, 2, 2), (3, 2)),
        ((3, 10, 6), (3, 4)),
        ((0, 20, 3), (1, 4)),
        ((1, 23, 3), (1, 4)),
    )
    for tracked_state, tracked_action in tracked_pairs:
        states_track[tracked_state].update({tracked_action: []})


initialize_tracking_states()


In [None]:
# Check Random State
# Build a few fresh environments and print the initial state each one reports.

print("Random State Initialization:")
for i in range(5):  # Checking for 5 episodes
    # Rebinds the notebook-level `env`; the instance from the final iteration
    # is the one later cells see.
    env = CabDriver()
    random_state_init = env.state_init
    print(random_state_init)


In [None]:
# Understanding action_size
# Number of entries in the environment's action space.
action_size = len(env.action_space)
print("action_size:", action_size)


In [None]:
def save_tracking_states():
    """Append the model's current Q-value for every tracked (state, action) pair.

    Encodes all tracked states with the notebook-level ``agent``, runs one
    batched prediction, and pushes the Q-value of each tracked action onto its
    history list in ``states_track``. Action indices are looked up in the
    notebook-level ``env.action_space``.
    """
    tracked_states = list(states_track.keys())
    encoded_batch = np.array(
        [agent.convert_state_to_vector(tracked) for tracked in tracked_states]
    )
    encoded_batch = np.reshape(encoded_batch, [-1, agent.state_size])
    q_rows = agent.model.predict(encoded_batch, verbose=0)

    # Row k of the prediction corresponds to the k-th tracked state.
    for row, tracked in enumerate(tracked_states):
        for action in states_track[tracked]:
            # Tracked actions are tuples; the action space is searched with a list.
            column = env.action_space.index(list(action))
            states_track[tracked][action].append(q_rows[row][column])


In [None]:
import pickle
# Defining a function to save the Q-dictionary as a pickle file


def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name`` with a ``.pkl`` suffix appended.

    Args:
        obj: Any picklable object.
        name: Target path without the extension.
    """
    target_path = name + ".pkl"
    with open(target_path, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


### Reset environment and initialize DDQN Agent class

In [None]:
# Episode horizon in hours; the training loop ends an episode once the
# accumulated trip time exceeds this value.
episode_time = 24 * 30  # 24 hrs for 30 days per episode
Episodes = 200  # No. of Episodes
m = 5  # No. Locations
t = 24  # No. of hrs in a day
d = 7  # No. of days in a week
# Reset environment
action_space, state_space, state = env.reset()

# Set up state and action sizes.
# NOTE(review): m + t + d looks like a concatenated one-hot encoding of
# (location, hour, day) — confirm against agent.convert_state_to_vector.
state_size = m + t + d  # Network uses state as input
action_size = len(action_space)

# Invoke agent class
agent = DuelingQAgent(action_size=action_size, state_size=state_size)

# to store rewards in each episode
rewards_per_episode, episodes = [], []
# Rewards for state
rewards_init_state = []


### Train model

In [None]:
# Train the agent for `Episodes` episodes.
#
# Each episode resets the environment and steps until the accumulated trip
# time exceeds `episode_time`. Rewards and tracked Q-values are sampled every
# 25th episode; model checkpoints are written on the periodic schedule AND
# after the final episode, so the trained model is always persisted (with
# Episodes = 200 the periodic checks alone only ever fire at episode 0).
OUTPUT_DIR = "../Data/Outputs/chatGPT_dev"
# save_obj() opens its target with open(..., "wb"), which fails when the
# directory does not exist yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

start_time = time.time()
# tracking average reward per episode = total rewards in an episode / total steps in an episode
avg_reward = []
# tracking total rewards per episode
total_reward = []
terminal_state = False
env = CabDriver()
for episode in range(0, Episodes):
    is_last_episode = episode == Episodes - 1

    # tracking total rewards, step count
    tot_reward = 0
    step_count = 0
    total_time = 0  # Total time driver rode in this episode
    terminal_state = False

    # Reset at the start of each episode
    action_space, state_space, state = env.reset()
    # State initialization
    initial_state = state

    while not terminal_state:
        # 1. Pick epsilon-greedy action from possible actions for the current state.
        possible_actions_index, actions = env.requests(state)
        action = agent.get_action(state, possible_actions_index)
        # NOTE(review): step_count restarts at 0 every episode, so the decay
        # schedule is re-evaluated from its start in each episode — confirm
        # this is intended rather than decaying over the whole training run.
        if agent.epsilon > agent.epsilon_min:
            agent.epsilon = agent.exponential_decay(
                agent.epsilon_max, agent.epsilon_decay, step_count
            )
        # 2. Evaluate your reward and next state
        next_state, reward, step_time = env.step(
            state, env.action_space[action], Time_matrix
        )
        # 3. Total time driver rode in this episode
        total_time += step_time
        if total_time > episode_time:
            # The ride that crosses the time limit ends the episode; it is
            # neither stored in memory nor added to the episode's reward.
            terminal_state = True
        else:
            # 4. Append the experience to the memory (action is the action index).
            # NOTE(review): terminal_state is always False on this branch, so
            # no stored sample is ever marked as terminal — confirm intended.
            agent.append_sample(state, action, reward,
                                next_state, terminal_state)
            # 5. Train the model by calling function agent.train_model
            agent.train_model()
            # 6. Update current state
            state = next_state
            tot_reward += reward
            step_count += 1

    # 7. Keep a track of rewards every 25th episode (the episode has terminated here).
    if episode % 25 == 0:
        # max(..., 1) guards against an episode that ends before any step is counted.
        avg_reward.append(tot_reward / max(step_count, 1))
        total_reward.append(tot_reward)
        print(
            "episode:",
            episode,
            "  score:",
            tot_reward,
            "  memory length:",
            len(agent.memory),
            "  epsilon:",
            agent.epsilon,
        )

    # Store 'agent_model' every 200th episode and after the final episode
    if episode % 200 == 0 or is_last_episode:
        agent.save(OUTPUT_DIR + "/cab_driver.h5")

    # Every 25th episode: persist rewards and a snapshot of the tracked Q-values
    if episode % 25 == 0:
        save_obj(avg_reward, OUTPUT_DIR + "/Rewards")
        save_tracking_states()
        save_obj(states_track, OUTPUT_DIR + "/States_tracked")
    # Every 10000th episode: intermediate convergence plot
    if episode % 10000 == 0 and episode != 0:
        plt.plot(list(range(len(avg_reward))), avg_reward)
        plt.show()
    # Saving the 'DQN_model' and 'model_weights' every 1000th episode and
    # after the final episode.
    if episode % 1000 == 0 or is_last_episode:
        print("Saving Model {}".format(episode))
        # Saves DQN model in Keras H5 format
        agent.save(name=OUTPUT_DIR + "/DQN_model.h5")
        print("Saving Model {} Weights".format(episode))
        agent.save_weights_numpy(
            name=OUTPUT_DIR + "/model_weights.pkl"
        )  # Saves model_weights in pkl file
        # (model_weights pickle file has a list of numpy arrays)

elapsed_time = time.time() - start_time
print(f"Elapsed time in sec: {elapsed_time}")


### Plot convergence and tracked states

In [None]:
# Plotting average rewards
# One point per tracked episode; the training loop records every 25th episode.
plt.plot(list(range(len(avg_reward))), avg_reward)
plt.xlabel("Tracked episode (every 25th)")
plt.ylabel("Average Reward")
plt.show()


In [None]:
# Plotting total rewards
# One point per tracked episode; the training loop records every 25th episode.
plt.plot(list(range(len(total_reward))), total_reward)
plt.xlabel("Tracked episode (every 25th)")
plt.ylabel("Total reward")
plt.show()


In [None]:
# Plot the recorded Q-value history of every tracked (state, action) pair,
# one panel per pair.

# Flatten the nested tracking dict into one (state, action, history) entry per panel.
tracked_curves = [
    (state, action, q_values)
    for state, actions in states_track.items()
    for action, q_values in actions.items()
]

n_cols = 4
# Ceiling division sizes the grid to the number of curves; never fewer than
# two rows, so the layout is unchanged for up to 8 curves.
n_rows = max(2, -(-len(tracked_curves) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 7), squeeze=False)
# Extra vertical room: every panel carries its own x-label and a two-line title.
fig.subplots_adjust(hspace=0.6)
panels = axes.ravel()

for ax, (state, action, q_values) in zip(panels, tracked_curves):
    ax.plot(
        np.arange(len(q_values)),
        np.asarray(q_values),
        label=f"{state} {action}",
    )
    # Label every panel, not just the last one drawn.
    ax.set_xlabel("Steps")
    ax.set_ylabel("Q-value")
    ax.set_title(f"State: {state} \n Action: {action}")
    ax.grid(True)
    ax.legend()

# Hide grid slots that have no curve to show.
for ax in panels[len(tracked_curves):]:
    ax.set_visible(False)

fig.suptitle("Q-value over Steps for Different States and Actions")
plt.show()