In [1]:
import numpy as np
import itertools
from collections import defaultdict
from matplotlib import pyplot as plt
import pickle

from environment import TrafficEnv
from ml_controller import Controller

In [2]:
def make_epsilon_greedy_policy(Q, epsilon, num_actions):
    def policy_fn(observation):
        actions = np.ones(num_actions, dtype=float) * epsilon / num_actions
        best_action = np.argmax(Q[observation])
        actions[best_action] += (1.0 - epsilon)
        return actions
    return policy_fn

In [3]:
def q_learning(Q, env, num_episodes, group_size, disc_factor=0.95, alpha=0.3, epsilon=0.001):
    stats = {'episode_lengths': np.zeros(num_episodes)}
    
    policy = make_epsilon_greedy_policy(Q, epsilon, env.num_actions)
    avg_len = 0
    
    for i_ep  in range(num_episodes):
        # Display episode number every so often
        if (i_ep + 1) % group_size == 0:
            prev_avg = avg_len
            avg_len = sum(stats['episode_lengths'][i_ep - group_size - 1:i_ep - 1]) / group_size
            avg_diff = avg_len - prev_avg
            print("\rEpisode {}/{}, Latest Average {}, Average change {}".format(
                i_ep + 1, num_episodes, avg_len, avg_diff), end="")
        
        prev_state = tuple(env.reset())
        
        for t in itertools.count():
            action_probs = policy(prev_state)
            action = np.random.choice(np.arange(env.num_actions), p=action_probs)
            reward, done = env.step(action)
            next_state = tuple(env.state)
            
            stats['episode_lengths'][i_ep] = t
            
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + disc_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[prev_state][action]
            Q[prev_state][action] += alpha * td_delta
            
            # Cap at 50000
            if t > 50000:
                break
            
            if done:
                break
            
            prev_state = next_state
    return Q, stats

In [4]:
def dd():
    return np.zeros(6)

In [None]:
max_cars = 10
episode_length = 100
group_size = 10

In [6]:
env = TrafficEnv()
controller = Controller(env, max_cars)

In [7]:
env.car_density = [0.5, 0.5, 0.5, 0.5]

In [8]:
# Q = defaultdict(dd)
Q = pickle.load(open('policy.p', 'rb'))

In [None]:
for _ in range(100):
    Q, stats = q_learning(Q, controller, episode_length, group_size)
    pickle.dump(Q, open('policy.p', 'wb'))

Episode 100/100, Latest Average 4111.0, Average change -45890.00

In [None]:
ep_len = stats['episode_lengths']
avg_ep_len_arr = []
for ep_range in range(0, episode_length, group_size):
    avg_ep_len = sum(ep_len[ep_range:ep_range+group_size]) / group_size
    avg_ep_len_arr.append(avg_ep_len)

In [None]:
plt.plot(list(range(len(avg_ep_len_arr))), avg_ep_len_arr)

In [None]:
pickle.dump(Q, open('policy.p', 'wb'))

In [None]:
sim_env = TrafficEnv()
sim = Controller(sim_env, max_cars)

In [None]:
total_cars = 0
sim.reset()
for _ in range(500):
    best_action = np.argmax(Q[sim.state])
    print(f'The lane state is {sim.lane_state}')
    print(f'The state is {sim.state}')
    rewards, done  = sim.step(best_action)
    total_cars += sum(sim.lane_state)
    print(f'The action returns are {list(Q[sim.state])}')
    print(f'The action taken was {best_action}')
    print()
    if done:
        print('Model ended prematurely')
        break

print(f'The model survived for {_} seconds')
avg_cars = total_cars/_
print(f'Average cars during runtime: {avg_cars}')