> # Template for DQN

In [None]:
import gym 
import copy
import random 
from collections import deque
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
class DQNModel(torch.nn.Module):
    """Fully connected Q-network with three ReLU hidden layers.

    Maps an input whose last dimension is ``observation_space`` to
    ``action_space`` outputs. The final layer has no activation.
    """

    def __init__(self, observation_space, action_space, hidden1 = 16, hidden2 = 126, hidden3 = 32):
        super().__init__()
        # Consecutive pairs of widths define fc1..fc4, created in that order.
        widths = (observation_space, hidden1, hidden2, hidden3, action_space)
        for index, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{index}", nn.Linear(n_in, n_out))

    def forward(self, x):
        """Apply fc1..fc3 with ReLU, then fc4 without an activation."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.fc4(hidden)

In [None]:
def gen_epsilon_greedy_policy(estimator, n_action):
    """Build an epsilon-greedy action selector around ``estimator``.

    The returned ``policy_function(state, epsilon)`` picks a uniformly random
    action index in ``[0, n_action - 1]`` when a random draw falls below
    ``epsilon``; otherwise it returns the index of the largest value that
    ``estimator`` produces for ``state`` (a NumPy array).
    """
    def policy_function(state, epsilon):
        explore = random.random() < epsilon
        if not explore:
            # Greedy branch: evaluate the estimator without tracking gradients.
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float()
                action_values = estimator(state_tensor)
            return action_values.argmax().item()
        return random.randint(0, n_action - 1)

    return policy_function

def train(optimizer, loss_fn, x, y):
    """Run one gradient step that moves predictions ``x`` toward targets ``y``.

    Args:
        optimizer: Optimizer holding the parameters that produced ``x``.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        x: Prediction tensor attached to the autograd graph.
        y: Target values (tensor or array-like). They are converted to the
            dtype and device of ``x`` and treated as constants.

    Returns:
        The loss value computed before the parameter update, as a float.
    """
    # as_tensor replaces the deprecated Variable(torch.Tensor(...)) wrapper and
    # accepts lists and tensors of any numeric dtype; detach keeps gradients
    # from flowing into the targets.
    target = torch.as_tensor(y, dtype=x.dtype, device=x.device).detach()
    loss = loss_fn(x, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
 

In [None]:
def _reset_env(env):
    """Reset ``env`` and return only the observation."""
    result = env.reset()
    # Newer gym releases return (observation, info); older ones return the
    # observation alone.
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, terminated, done)``.

    ``terminated`` is True only when the episode ended without a time-limit
    truncation; ``done`` is True when the episode ended for any reason.
    """
    result = env.step(action)
    if len(result) == 5:
        # Newer gym API: terminated and truncated are reported separately.
        observation, reward, terminated, truncated, _ = result
        return observation, reward, terminated, terminated or truncated
    # Older gym API: a single done flag; the TimeLimit wrapper marks
    # truncation in the info dict.
    observation, reward, done, info = result
    truncated = bool(info.get("TimeLimit.truncated", False))
    return observation, reward, done and not truncated, done


def main():
    """Train a DQN on MountainCar-v0 with online one-step Q-learning updates.

    After every environment step the network is fitted toward a target that
    equals its current prediction for the visited state, except for the taken
    action, whose entry is replaced by the one-step return. Epsilon decays
    multiplicatively after each episode down to a floor of 0.01. The trained
    weights are written to ``./DQN_MountianCar.pth``.

    Returns:
        tuple: ``(STEPS_LIST, EPSILON_LIST)`` holding, per episode, the number
        of steps taken and the epsilon value used.
    """
    # Initialize the environment
    ENV_NAME = "MountainCar-v0"
    env = gym.make(ENV_NAME)

    # Define all the constants
    EPOCHS = 500
    GAMMA = 0.99
    EPSILON = 1.0
    EPSILON_DECAY = 0.98
    LEARNING_RATE = 0.001

    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n

    # Initialize the DL model and policy
    DQN = DQNModel(observation_space, action_space)
    policy = gen_epsilon_greedy_policy(DQN, action_space)
    # Initialize the optimizer and loss function
    optimizer = torch.optim.Adam(DQN.parameters(), LEARNING_RATE)
    loss_fn = torch.nn.MSELoss()

    # Lists that hold the per-episode intermediate values.
    STEPS_LIST = []
    EPSILON_LIST = []

    try:
        for EPOCH in range(EPOCHS):
            state = _reset_env(env)
            done = False
            STEPS = 0

            while not done:
                action = policy(state, EPSILON)
                state_next, reward, terminated, done = _step_env(env, action)

                q_values = DQN(torch.from_numpy(state).float())
                with torch.no_grad():
                    max_next_q = torch.max(DQN(torch.from_numpy(state_next).float()))

                # The target starts from the prediction for the CURRENT state
                # so that only the taken action contributes to the loss.
                target = q_values.detach().clone()
                if terminated:
                    # No future return after a true terminal state.
                    target[action] = reward
                else:
                    target[action] = reward + GAMMA * max_next_q
                train(optimizer, loss_fn, q_values, target)

                # Advance the agent; without this every decision and update
                # would use the reset observation.
                state = state_next
                STEPS += 1

            STEPS_LIST.append(STEPS)
            EPSILON_LIST.append(EPSILON)
            print(f"Epoch: {EPOCH}, exploration: {EPSILON}, score: {STEPS}")
            EPSILON = max(EPSILON * EPSILON_DECAY, 0.01)
    finally:
        # Release environment resources even if training is interrupted.
        env.close()

    # NOTE: the file name spelling is kept as-is so existing loaders still
    # find the checkpoint.
    torch.save(DQN.state_dict(), './DQN_MountianCar.pth')

    return STEPS_LIST, EPSILON_LIST

In [None]:
# Run training only when executed as the main module; keep both per-episode
# histories at module level.
if __name__ == "__main__":
    steps_list, epsilon_list = main()

In [None]:
# Step counts are whole numbers of steps while epsilon never exceeds 1.0, so
# each series gets its own y-axis; on a shared axis the epsilon curve would be
# squashed flat.
fig, steps_ax = plt.subplots()
steps_ax.plot(steps_list, color='tab:blue', label='Steps per episode')
steps_ax.set_xlabel('Episode')
steps_ax.set_ylabel('Steps per episode')

epsilon_ax = steps_ax.twinx()
epsilon_ax.plot(epsilon_list, color='tab:orange', label='Epsilon')
epsilon_ax.set_ylabel('Exploration rate (epsilon)')

steps_ax.set_title('Steps per episode and exploration rate over training')
# One combined legend covering the lines of both axes.
curves = steps_ax.get_lines() + epsilon_ax.get_lines()
steps_ax.legend(curves, [curve.get_label() for curve in curves])
plt.show()