In [None]:
# --- Imports: standard library ---
import random
import time

# --- Imports: third-party (original relative order of the deep-learning
# frameworks is kept on purpose; some installs are import-order sensitive) ---
import torch
import tensorflow
import gym
import keras
import numpy as np
import pandas as pd
from gym import Env
from gym.spaces import Discrete, Box

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Reshape
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Report whether torch can see a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# get training envs
# Both puzzle columns are digit strings that are later split character by
# character, so pin them to `str` instead of relying on type inference
# (which could otherwise drop leading zeros or pick a numeric dtype).
all_puzzles = pd.read_csv("sudoku.csv", dtype={"quizzes": str, "solutions": str})
# `read_csv` already returns a DataFrame; keep the second name as an alias
# because the environment class reads `all_puzzles_df`.
all_puzzles_df = all_puzzles

In [None]:
class board_player_env(Env):
    """Sudoku board-walking environment for a DQN agent.

    The agent occupies one cell of a 9x9 sudoku grid. On every step it either
    moves one cell (actions 0-3: up, down, left, right) or tries to write a
    digit 1-9 into the cell it is standing on (actions 4-12).

    Per-step rewards returned by ``step``:
        +1   moving onto an empty (0) cell
        -1   moving onto an already filled cell
        +3   writing the correct digit into an empty cell
        -2   writing a wrong digit into an empty cell
        +10  bonus on the step that completes the puzzle
    Moves that would leave the board and writes onto filled cells change
    nothing and earn 0.

    Observations are the agent position ``[row, col]``.
    NOTE(review): the observation does not contain the board contents, so the
    agent cannot see which digits are already placed -- confirm this is the
    intended design.

    Puzzles come from the module-level ``all_puzzles_df`` frame, columns
    ``quizzes`` and ``solutions`` (81-character digit strings, 0 = empty).

    Parameters
    ----------
    game_no : int or None
        Row index of the puzzle to play. ``None`` (default) draws a random
        puzzle at construction and again on every ``reset``.
    verbose : bool
        When True (default) reward/punishment traces and the solved board are
        printed, as before. Pass False to silence them during long training.
    """

    # Exclusive upper bound for randomly drawn puzzle indices.
    MAX_GAME_INDEX = 100000

    # action index -> (label used in trace messages, row offset, column offset)
    _MOVES = {
        0: ('up', -1, 0),
        1: ('down', 1, 0),
        2: ('left', 0, -1),
        3: ('right', 0, 1),
    }

    def __init__(self, game_no=None, verbose=True):
        super(board_player_env, self).__init__()
        # Remember the caller's choice; the default is None rather than a
        # random call in the signature because defaults are evaluated only
        # once, when the function is defined.
        self._fixed_game_no = game_no
        self.verbose = verbose

        # creating the action space
        self.action_space_list = ['up', 'down', 'left', 'right', 1, 2, 3, 4, 5, 6, 7, 8, 9]
        self.action_space = Discrete(len(self.action_space_list))
        # number of valid actions
        self.n_actions = len(self.action_space_list)
        self.n_features = 2

        # observation space: the agent's (row, col), each in 0..8 -- this is
        # what reset() and step() actually return.
        self.observation_space = Box(low=0, high=8, shape=(2,), dtype=int)

        self._load_puzzle()

    def _log(self, *message):
        """Print a trace message unless the env was created with verbose=False."""
        if self.verbose:
            print(*message)

    def _load_puzzle(self):
        """Select the puzzle to play and reset every piece of per-episode state."""
        if self._fixed_game_no is None:
            # Never draw an index beyond the end of the loaded dataset.
            upper = min(self.MAX_GAME_INDEX, len(all_puzzles_df))
            self.game_no = int(np.random.randint(upper))
        else:
            self.game_no = self._fixed_game_no
        self._log("game being played now:", self.game_no)

        row = all_puzzles_df.iloc[self.game_no]

        # working board (0 marks an empty cell)
        self.puzzle = row['quizzes']
        self.puzzle_arr = np.array(list(self.puzzle), dtype=int).reshape((9, 9))

        # perfect solution used to judge every write
        self.solution = row['solutions']
        self.solution_arr = np.array(list(self.solution), dtype=int).reshape((9, 9))

        # the agent starts in the top-left cell
        self.agent = [0, 0]

        # cumulative reward of the current episode, and the termination flag
        self.rewards = 0
        self.done = False

    def render(self):
        """Print the current state of the working board."""
        print("\nsolved now: \n")
        print(self.puzzle_arr)

    def reset(self):
        """Start a new episode and return the initial observation ``[0, 0]``.

        The board, agent position, cumulative reward and ``done`` flag are all
        re-initialised. A new random puzzle is drawn unless a fixed ``game_no``
        was given to the constructor.
        """
        self._load_puzzle()
        return list(self.agent)

    def step(self, action):
        """Apply one action.

        Returns ``(observation, reward, done, info)`` where ``observation`` is
        the agent's ``[row, col]`` after the action, ``reward`` is the reward
        earned by THIS step only (the running total stays available as
        ``self.rewards``), ``done`` is True once the board equals the solution,
        and ``info`` is an empty dict.
        """
        action = int(action)
        row, col = self.agent
        reward = 0

        if action in self._MOVES:
            label, d_row, d_col = self._MOVES[action]
            new_row, new_col = row + d_row, col + d_col
            # A move that would leave the 9x9 grid is ignored entirely.
            if 0 <= new_row <= 8 and 0 <= new_col <= 8:
                self.agent = [new_row, new_col]
                if self.puzzle_arr[new_row, new_col] == 0:
                    self._log("\nreward for " + label)
                    reward += 1
                else:
                    self._log("\npunishment for " + label)
                    reward -= 1

        # actions 4..12 write digits 1..9 (e.g. action 5 means fill 2)
        elif action > 3:
            # Filled cells are never overwritten, so the given clues and the
            # correct entries already placed cannot be damaged.
            if self.puzzle_arr[row, col] == 0:
                digit = self.action_space_list[action]
                if digit == self.solution_arr[row, col]:
                    self.puzzle_arr[row, col] = digit
                    self._log("\nreward for action:", digit)
                    reward += 3
                else:
                    self._log("\npunishment for action:", digit)
                    reward -= 2

        if np.array_equal(self.solution_arr, self.puzzle_arr):
            reward += 10
            self.done = True
            if self.verbose:
                self.render()
            # The environment is deliberately NOT re-initialised here: doing
            # so would clear `done` before it is returned and the episode
            # would never end. The caller starts the next game via reset().

        self.rewards += reward
        return list(self.agent), reward, bool(self.done), {}

In [None]:
def build_model(states, actions):
    """Build the Q-network: two 24-unit ReLU layers and a linear output head.

    Parameters
    ----------
    states : accepted but not read; the input shape is fixed to ``(1, 2)``.
    actions : number of output units of the final Dense layer.

    Returns
    -------
    An uncompiled ``Sequential`` model whose output is flattened after the
    last Dense layer.
    """
    stack = [
        Dense(24, activation='relu', input_shape=(1, 2, )),
        Dense(24, activation='relu'),
        Dense(actions),
        Flatten(),
    ]
    network = Sequential()
    for layer in stack:
        network.add(layer)
    return network

In [None]:
def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl ``DQNAgent``.

    The agent uses a Boltzmann Q policy, a 50000-entry sequential replay
    memory with a window length of 1, 10 warm-up steps and
    ``target_model_update=1e-3``.

    Parameters
    ----------
    model : the Q-network to train.
    actions : number of discrete actions, passed as ``nb_actions``.

    Returns
    -------
    The (not yet compiled) ``DQNAgent``.
    """
    return DQNAgent(
        model=model,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-3,
    )

```python
# do not run in ipynb
# Random-agent rollout: for each of 10 environments, play `episodes` episodes
# by sampling actions uniformly from the action space until step() reports
# done, rendering the board after every step and printing the summed reward.
episodes = 100
for run in range(10):
    env = board_player_env(run)
    for episode in range(episodes):
        state = env.reset()
        score, done = 0, False
        while not done:
            action = env.action_space.sample()
            n_state, reward, done, _ = env.step(action)
            env.render()
            score = score + reward
        print("Episode: ", episode, "Score:", score)
```

In [None]:
# Create one environment and read the sizes that the model and agent
# builders take as arguments.
env = board_player_env()
states, actions = env.observation_space.shape, env.action_space.n

In [None]:
# Build the Q-network from the observation shape and the action count.
model = build_model(states, actions)

In [None]:
# Show the network's layer summary.
model.summary()

In [None]:
# Build the DQN agent, compile it with Adam (tracking MAE as a metric), and
# train it for 50000 steps on a freshly constructed environment.
dqn = build_agent(model, actions)
dqn.compile(
    Adam(learning_rate=1e-3),
    metrics=['mae'],
)
dqn.fit(
    board_player_env(),
    nb_steps=50000,
    visualize=False,
    verbose=1,
)

In [None]:
# Write the agent's weights to 'dqn_weights.h5f', replacing any existing file.
dqn.save_weights('dqn_weights.h5f', overwrite=True)

https://blog.paperspace.com/getting-started-with-openai-gym/

https://github.com/openai/gym/blob/master/gym/spaces/box.py

https://lilianweng.github.io/lil-log/2018/05/05/implementing-deep-reinforcement-learning-models.html