In [61]:
import gym
import time
import highway_env
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import copy
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
env = gym.make("highway-v0")

In [34]:
state = env.reset()
done = False
score = 0
while not done:
#     time.sleep(0.001)
    env.render()
    action = env.action_space.sample()
    state, reward, done,_ = env.step(action)
    score += reward
    if done:
        break
        
print(score)

12.622222222222224


In [37]:
state = env.observation_space.high
state

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]], dtype=float32)

In [140]:
state = np.array(env.observation_space.sample())
state = torch.from_numpy(state).float().float().unsqueeze(0)
state = state.view(-1, 5*5)
state.shape

torch.Size([1, 25])

In [58]:
state_size = 5*5
action_size = env.action_space.n
action_size, state_size

(5, 25)

In [88]:
a = env.action_space.sample()
a

0

In [224]:
BUFFER_SIZE = int(1e5)
BATCH_SIZE = 64
GAMMA = 1.0
UPDATE_EVERY = 4
TAU = 1e-3
device = torch.device('cpu')

In [231]:
class Model(nn.Module):
    def __init__(self, state_size, action_size):
        super(Model, self).__init__()
        
        self.seed = torch.manual_seed(30)
        
        self.fc1 = nn.Linear(state_size, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, action_size)
        
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, state):
        
        state = F.relu(self.fc1(state))
        state = self.dropout(state)
        state = F.relu(self.fc2(state))
        state = self.dropout(state)
        
        action = F.softmax(self.fc3(state), dim=1)
        
        return action

In [232]:
class ReplayBuffer:
    def __init__(self, buffer_size):
        self.seed = random.seed(30)
        
        self.memory = deque(maxlen=BUFFER_SIZE)
        self.experiences = namedtuple("Experiences", field_names=["state","action","reward","new_state","done"])
        
    def add(self, state, action, reward, new_state, done):
        e = self.experiences(state, action, reward, new_state, done)
        self.memory.append(e)
        
    def sample(self, batch_size):
        experiences = random.sample(self.memory, k=batch_size)
        
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        new_states = torch.from_numpy(np.vstack([e.new_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        
        return states, actions, rewards, new_states, dones
        
    def __len__(self):
        return len(self.memory)

In [233]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.seed = random.seed(30)
        self.action_size = action_size
        self.local_nn = Model(state_size, action_size).to(device)
        self.target_nn = Model(state_size, action_size).to(device)
        
        self.optimizer = optim.Adam(self.local_nn.parameters(), lr=0.01)
        
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        
    def step(self, state, action, reward, new_state, done):
        state = torch.from_numpy(state).float().float().unsqueeze(0)
        state = state.view(-1, 5*5)
        
        new_state = torch.from_numpy(new_state).float().unsqueeze(0)
        new_state = new_state.view(-1, 5*5)
        
        self.memory.add(state, action, reward, new_state, done)
        
        self.t_step += self.t_step % UPDATE_EVERY
        
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
        
    def act(self, state, eps):
        state = torch.from_numpy(state).float().float().unsqueeze(0)
        state = state.view(-1, 5*5)
        
        self.local_nn.eval()
        with torch.no_grad():
            action_values = self.local_nn(state)
        self.local_nn.train()
        
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
        
    def learn(self, experiences, gamma):
        states, actions, rewards, new_states, dones = experiences
        
        q_target = self.target_nn(new_states).detach().max(1)[0].unsqueeze(1)
        q_target_value = rewards + gamma*q_target*(1-dones)
        
        q_expected = self.local_nn(states).gather(1, actions)
        
        loss = F.mse_loss(q_expected, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.soft_update(self.local_nn, self.target_nn, TAU)
        
    def soft_update(self,local_nn, target_nn, tau):
        for target_param, local_param in zip(target_nn.parameters(), local_nn.parameters()):
            target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data)

In [234]:
agent = DQNAgent(state_size, action_size)

In [237]:
def dqn():
    n_episodes = 1000
    scores_deque = deque(maxlen=100)
    scores = []
    eps = 1.0
    min_eps = 0.005
    decay = 0.995
    max_score = -np.Inf
    
    for i_episode in range(n_episodes):
        state = env.reset()
        score = 0 
        done = False
        while not done:
            env.render()
            action = agent.act(state, eps)
            print('\r action {}'.format(action), end="")
            new_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, new_state, done)
            state = new_state
            score += reward
            if done:
                break
                
        scores_deque.append(score)
        scores.append(score)
        
        eps = max(eps*decay, min_eps)
        
        print('\rEpisode: {}\t Score: {}'.format(i_episode, np.mean(scores_deque)), end="")
        
        if i_episode % 10 == 0 and i_episode>9:
            if np.mean(scores_deque) > max_score:
                max_score = np.mean(scores_deque)
                torch.save(agent.local_nn.state_dict(), 'checkPoint.pth')
            print('\rEpisode: {}\t Score: {}'.format(i_episode, np.mean(scores_deque)))
            
    return scores

In [238]:
scores = dqn()



Episode: 10	 Score: 22.391919191919197
Episode: 20	 Score: 20.077248677248683
Episode: 30	 Score: 19.487455197132622
Episode: 40	 Score: 21.401626016260174
Episode: 50	 Score: 20.282352941176477
Episode: 60	 Score: 20.333697632058293
Episode: 70	 Score: 20.415962441314563
Episode: 80	 Score: 20.249108367626892
Episode: 90	 Score: 20.911599511599527
Episode: 100	 Score: 21.17822222222223
Episode: 110	 Score: 21.152222222222225
Episode: 120	 Score: 21.645333333333344
Episode: 130	 Score: 22.544666666666675
Episode: 140	 Score: 22.295555555555566
Episode: 150	 Score: 23.423111111111126
Episode: 160	 Score: 24.105555555555567
Episode: 170	 Score: 23.929555555555567
Episode: 180	 Score: 24.913333333333345
Episode: 190	 Score: 24.403111111111125
Episode: 200	 Score: 24.336000000000014
Episode: 210	 Score: 24.747333333333348
Episode: 220	 Score: 24.253111111111122
Episode: 230	 Score: 24.189777777777785
Episode: 240	 Score: 23.878444444444447
Episode: 250	 Score: 23.905777777777798
 action 02

KeyboardInterrupt: 

In [195]:
agent.local_nn.load_state_dict(torch.load('checkPoint.pth'))
agent.local_nn.eval()
state = env.reset()
done = False
score = 0
while not done:
#     time.sleep(0.001)
    env.render()
    state = torch.from_numpy(state).float().float().unsqueeze(0)
    state = state.view(-1, 5*5)
    
    action_values = agent.local_nn(state)
    action = random.choice(np.arange(3))
    print(action, end="")
    state, reward, done,_ = env.step(action)
    score += reward
    if done:
        break
        
print(score)

112022100128.31111111111111
