In [None]:
# DQN Model for Reinforcement Learning

from collections import deque
from gym.spaces import Box
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from CrimeWorld import CrimeWorld

import random
import gym
import time
import csv
import numpy as np
import matplotlib.pyplot as plt

# Number of training episodes the main loop runs (one environment reset per episode).
EPISODES = 1000

In [None]:
class DQNAgent:
    """Deep Q-Network agent backed by a small convolutional Q-function.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily and fits the network one sampled transition at a time.

    Attributes:
        state_size: First dimension of the state shape (kept for backward
            compatibility with code that reads it).
        state_shape: ``(rows, cols)`` of a single state grid fed to the network.
        action_size: Number of discrete actions; also the width of the
            network's output layer.
        memory: ``deque`` of ``(state, action, reward, next_state, done)``
            tuples, capped at 2000 entries.
        gamma: Discount rate applied to the bootstrapped next-state value.
        epsilon: Current exploration probability.
        epsilon_min: Floor below which ``epsilon`` is no longer decayed.
        epsilon_decay: Multiplicative decay applied once per ``replay`` call.
        learning_rate: Learning rate handed to the Adam optimizer.
        model: The compiled Keras model.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its network.

        Args:
            state_size: Shape of one state grid, e.g. ``array.shape``. When
                only one dimension is given the grid is treated as square.
            action_size: Number of discrete actions available to the agent.
        """
        self.state_size = state_size[0]
        # Keep both spatial dimensions so non-square grids get a matching
        # input layer; a 1-element shape falls back to the square case.
        rows = state_size[0]
        cols = state_size[1] if len(state_size) > 1 else state_size[0]
        self.state_shape = (rows, cols)
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled ``Sequential`` model mapping a ``(rows, cols, 1)`` grid
            to one Q-value per action.
        """
        rows, cols = self.state_shape
        model = Sequential()
        model.add(Conv2D(4, input_shape=(rows, cols, 1), kernel_size=(4, 4),
                         strides=(2, 2), padding='valid', activation='relu'))
        model.add(Conv2D(8, kernel_size=(4, 4), strides=(2, 2),
                         padding='valid', activation='relu'))
        model.add(Flatten())
        model.add(Dense(32, activation='relu'))
        # Output head: one linear unit per action. Without it the network
        # emitted 32 ReLU activations, so argmax could select an action index
        # outside [0, action_size) and Q-values could never be negative.
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        The memory is a bounded deque, so the oldest transition is dropped
        once the capacity is reached.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Batched state passed straight to ``model.predict``.

        Returns:
            An ``int`` action index in ``[0, action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        # Plain int so both branches return the same type.
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train on a random minibatch drawn from the replay memory.

        Each sampled transition is fitted individually: the Q-value of the
        taken action is replaced by the reward (terminal transition) or by
        ``reward + gamma * max_a Q(next_state, a)`` and the network is fitted
        for one epoch towards that target. ``epsilon`` is decayed once per
        call while it is above ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: From ``random.sample`` when ``batch_size`` exceeds the
                number of stored transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Start from the current predictions so only the taken action's
            # output contributes to the loss.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [None]:
if __name__ == "__main__":
    # Create the crime world and place the police agent at its start cell.
    env = CrimeWorld()
    initial_policeX = 60
    initial_policeY = 60
    env.add_agent(initial_policeX, initial_policeY)

    # Shape of the environment state; used to size the DQN input and to
    # reshape observations before they are fed to the network.
    police_state_size = env.get_state().shape

    # Number of discrete actions the agent can choose from.
    action_size = 5
    done = False
    # Number of transitions sampled from replay memory per training call.
    batch_size = 32

    # Start a fresh results file containing only the header row.
    # newline='' is what the csv module requires of the file object; without
    # it, platforms that translate line endings emit blank rows.
    with open("results_episodes", mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["episode", "policeX", "policeY", "action", "reward", "new_B", "new_n"])

In [None]:
    # Initialize the DQN and build the model
    agent = DQNAgent(police_state_size, action_size)

    # Upper bound on environment steps taken in one episode.
    max_steps = 1000
    # Buffered result rows are appended to the CSV every `flush_every` episodes.
    flush_every = 20

    def flush_rows(rows):
        """Append the buffered result rows to the results CSV."""
        # newline='' is required by the csv module to avoid blank rows on
        # platforms that translate line endings.
        with open("results_episodes", mode='a', newline='') as csv_file:
            csv.writer(csv_file, delimiter=',').writerows(rows)

    # Rows collected since the last flush; one row per time step.
    save_episode = []
    for e in range(EPISODES):
        # reset the world to initial conditions and matrices
        state = env.reset()

        # Add batch and channel axes so the state matches the network input.
        state = np.reshape(state, (1, police_state_size[0], police_state_size[1], 1))
        for time_step in range(max_steps):
            # Pick an action epsilon-greedily from the current model.
            action = agent.act(state)

            # Apply the action; a terminal step is penalised with -10.
            reward, next_state, done = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, (1, police_state_size[0], police_state_size[1], 1))
            # NOTE(review): `done` does not end the episode here, so stepping
            # continues after a terminal transition — confirm this is intended
            # for CrimeWorld.

            # Store the transition for experience replay.
            agent.remember(state, action, reward, next_state, done)
            state = next_state

            # Record this time step. result_matrixsum is queried once and both
            # values reused (it was previously called twice with the same
            # arguments); presumably these are the new_B / new_n columns of
            # the CSV header — verify against CrimeWorld.
            row = [e, env.policeX, env.policeY, action, reward]
            matrix_sums = env.result_matrixsum(action, reward)
            row.extend([matrix_sums[0], matrix_sums[1]])
            save_episode.append(row)

            # Train once enough transitions are available to sample a batch.
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

            # for george: video code
            # if(e==99):
                # policeX_axis.append(env.policeX)
                # policeY_axis.append(env.policeY)
                # plt.imshow(env.B)
                # plt.plot(policeY_axis, policeX_axis, color='green')
                # plt.gca().invert_yaxis()
                # plt.savefig("images/" + str(time_step) + '.png')
                # display.clear_output(wait=True)
                # display.display(plt.gcf())

        # Periodically persist the buffered rows and start a new buffer.
        if (e + 1) % flush_every == 0:
            flush_rows(save_episode)
            save_episode = []

    # Write whatever is left when EPISODES is not a multiple of flush_every,
    # so the trailing episodes are not silently dropped.
    if save_episode:
        flush_rows(save_episode)
        save_episode = []