#### Thank you for viewing my first DQN Notebook.
#### I have a few notes; please see below.

・ This notebook shows a simple starter model built with Keras-RL.

・ I'm new to reinforcement learning (RL).

・ This model produces a simple output pattern (only goo [rock], only choki [scissors], only pah [paper], ...) because of its simple reward strategy.

・ I have not built a good reward strategy yet! (So I have not submitted it yet.)

・ I referred to [this Japanese article](https://qiita.com/tanuk1647/items/7b8c2f0d09330cbfacd2).

#### Thank you!

In [None]:
%%bash
# Install the kaggle-environments package (version 0.1.6 or newer).
pip install 'kaggle-environments>=0.1.6'

In [None]:
# Install keras-rl2 straight from GitHub. HTTPS is used because GitHub no
# longer serves the unauthenticated git:// transport.
!pip install git+https://github.com/wau/keras-rl2.git --upgrade --no-deps

In [None]:
%%writefile submission.py

import numpy as np
import pandas as pd
import random
import sys
from os import mkdir
from os.path import exists
from sys import exc_info
from enum import Enum, IntEnum, auto
from collections import Counter
from functools import reduce
import gym
from gym import spaces
from gym.utils import seeding
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import TrainIntervalLogger, TrainEpisodeLogger
import matplotlib.pyplot as plt

import tensorflow as tf


class Hand(IntEnum):
    """A rock-paper-scissors hand, numbered like the Kaggle action space.

    The integer values are what the agent submits to the Kaggle ``rps``
    environment, which encodes rock as 0, paper as 1 and scissors as 2.
    With that numbering, hand ``(n + 1) % 3`` beats hand ``n``.
    """

    GOO = 0    # rock
    PAH = 1    # paper    (beats rock)
    CHOKI = 2  # scissors (beats paper)
    
    
class Strategy(Enum):
    """Kinds of opponent the training environment can simulate."""

    RANDOM = 1  # opponent picks each hand uniformly at random
    HUMAN = 2   # opponent picks hands with a fixed, slightly skewed ratio
    
    
class RandomStrategy:
    """Opponent that draws every hand uniformly at random."""

    # Candidate hands the opponent chooses from.
    hands = [Hand.GOO, Hand.CHOKI, Hand.PAH]

    def __init__(self, np_random):
        # Random generator handed in by the environment; only ``choice`` is used.
        self.np_random = np_random

    def get_hand(self, user_hand):
        """Return the opponent's next hand; ``user_hand`` is ignored."""
        candidates = RandomStrategy.hands
        return self.np_random.choice(candidates)

    def reset(self):
        """Nothing to reset: this strategy keeps no per-episode state."""
        return None


class HumanStrategy:
    """Opponent that draws hands with fixed, non-uniform probabilities."""

    # Selection probability of each entry in ``hands`` (same order).
    ratio = [0.350, 0.317, 0.333]
    hands = [Hand.GOO, Hand.CHOKI, Hand.PAH]

    def __init__(self, np_random):
        # Random generator handed in by the environment; only ``choice`` is used.
        self.np_random = np_random

    def get_hand(self, user_hand):
        """Return the opponent's next hand; ``user_hand`` is ignored."""
        candidates = HumanStrategy.hands
        weights = HumanStrategy.ratio
        return self.np_random.choice(candidates, p=weights)

    def reset(self):
        """Nothing to reset: this strategy keeps no per-episode state."""
        return None
    
    
class RPS(gym.Env):
    """Gym environment in which every episode is one rock-paper-scissors round.

    The observation is the pair ``[user_hand, enemy_hand]`` of the latest
    round. The action is the hand the agent plays (0, 1 or 2, converted
    with ``Hand(action)``). Every hand played is also appended to
    ``user_hands`` / ``enemy_hands``; those lists are never cleared by
    ``reset()``.
    """

    action_space = spaces.Discrete(3)
    # step() hands out -10 (loss), 0 (draw) or +10 (win).
    reward_range = (-10, 10)
    observation_space = spaces.Box(low=0, high=100, shape=(2,), dtype='float32')

    # Maps each hand to the hand it defeats. Keyed by name so the outcome
    # logic does not depend on the integer values behind ``Hand``.
    _BEATS = {
        Hand.GOO: Hand.CHOKI,
        Hand.CHOKI: Hand.PAH,
        Hand.PAH: Hand.GOO,
    }

    def __init__(self, strategy=Strategy.HUMAN):
        """Create the environment.

        Args:
            strategy: ``Strategy`` member selecting the simulated opponent.

        Raises:
            ValueError: if ``strategy`` is not a supported ``Strategy``.
        """
        super().__init__()

        self.seed()

        if strategy == Strategy.RANDOM:
            self.strategy = RandomStrategy(self.np_random)
        elif strategy == Strategy.HUMAN:
            # Previously this case left ``self.strategy`` as None, so the
            # default-constructed environment crashed inside reset().
            self.strategy = HumanStrategy(self.np_random)
        else:
            raise ValueError('unknown strategy: {!r}'.format(strategy))

        self.user_hands = []
        self.enemy_hands = []
        self.reset()

    def reset(self):
        """Start a new round and return the initial observation."""
        self.user_hand = Hand.GOO
        self.enemy_hand = Hand.GOO
        self.strategy.reset()
        self.done = False
        return [self.user_hand, self.enemy_hand]

    def seed(self, seed=None):
        """(Re)create the random generator and return the seed in a list."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """Play one round.

        The opponent picks its hand first (it is handed the agent's
        previous hand), then ``action`` is applied.

        Returns:
            ``(observation, reward, done, info)`` where reward is 10 for a
            win, -10 for a loss and 0 for a draw; ``done`` is always True
            because an episode is a single round.
        """
        # Normalise to ``Hand`` so both history lists hold the same type.
        self.enemy_hand = Hand(int(self.strategy.get_hand(self.user_hand)))
        self.enemy_hands.append(self.enemy_hand)
        self.user_hand = Hand(action)
        self.user_hands.append(self.user_hand)

        if self.user_hand == self.enemy_hand:
            reward = 0
        elif RPS._BEATS[self.user_hand] == self.enemy_hand:
            reward = 10
        else:
            reward = -10

        self.done = True
        observation = [self.user_hand, self.enemy_hand]
        return observation, reward, self.done, {}

    
class TrainIntervalLogger2(TrainIntervalLogger):
    """Interval logger that also stores per-interval averages in ``records``.

    ``records`` maps 'interval', 'episode_reward' and every metric name to
    a list that receives one value per completed logging interval.
    """

    def __init__(self, interval=10000):
        super().__init__(interval=interval)
        self.records = {}

    def on_train_begin(self, logs):
        """Initialise one empty series per recorded quantity."""
        super().on_train_begin(logs)
        for key in ['interval', 'episode_reward', *self.metrics_names]:
            self.records[key] = []

    def on_step_begin(self, step, logs):
        """Snapshot the finished interval, then defer to the parent logger."""
        at_boundary = self.step % self.interval == 0
        if at_boundary and len(self.episode_rewards) > 0:
            self._snapshot_interval()
        # Called last, after the snapshot has read the accumulated values.
        super().on_step_begin(step, logs)

    def _snapshot_interval(self):
        """Append the averages of the interval that just ended to ``records``."""
        self.records['interval'].append(self.step // self.interval)
        self.records['episode_reward'].append(np.mean(self.episode_rewards))

        collected = np.array(self.metrics)
        assert collected.shape == (self.interval, len(self.metrics_names))
        if np.isnan(collected).all():
            # No usable metric value in this interval: nothing to average.
            return

        averages = np.nanmean(collected, axis=0)
        assert averages.shape == (len(self.metrics_names),)
        for metric_name, average in zip(self.metrics_names, averages):
            self.records[metric_name].append(average)
        

class DQNRPS:
    """Keras-RL DQN agent wired to the ``RPS`` environment.

    Builds the Q-network and agent on construction, optionally restores
    previously saved weights, and exposes ``train`` / ``test`` wrappers.
    """

    # Weight file name template: filled with the env name and the strategy.
    weightfile = 'dqn_{}_{}_weights.h5'

    def __init__(self, strategy=Strategy.HUMAN, recycle=True):
        """Create the environment, the model and the DQN agent.

        Args:
            strategy: ``Strategy`` of the simulated opponent.
            recycle: when True and the weight file exists, load it and mark
                the model as already trained.
        """
        print('creating model ...')
        self.train_interval_logger = None

        # Get the environment and extract the number of actions.
        self.env = RPS(strategy=strategy)
        self.env_name = 'rps'
        self.weightfile = DQNRPS.weightfile.format(self.env_name, str(strategy))
        self.nb_actions = self.env.action_space.n

        # Next, we build a very simple model: one hidden layer, one linear
        # output per action.
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Flatten(input_shape=(1,) + self.env.observation_space.shape))
        self.model.add(tf.keras.layers.Dense(128))
        self.model.add(tf.keras.layers.Activation('relu'))
        self.model.add(tf.keras.layers.Dense(self.nb_actions))
        self.model.add(tf.keras.layers.Activation('linear'))

        # Finally, we configure and compile our agent.
        # You can use every built-in Keras optimizer and even the metrics!
        memory = SequentialMemory(limit=500, window_length=1)
        policy = BoltzmannQPolicy(tau=1.)
        self.dqn = DQNAgent(model=self.model, nb_actions=self.nb_actions, memory=memory,
                            nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
        # ``learning_rate`` is the supported keyword; ``lr`` is a deprecated alias.
        self.dqn.compile(tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=[])

        self.__istrained = False
        print('model created')

        if recycle and exists(self.weightfile):
            try:
                print('loading pretraining weight ...')
                self.dqn.load_weights(self.weightfile)
            except Exception:
                print('Unexpected error:', exc_info()[0])
                raise
            self.__istrained = True
            print('loaded pretrained weight')

    # Training
    def train(self, nb_steps=3000, verbose=1, visualize=False, log_interval=300):
        """Fit the agent and save the resulting weights.

        Args:
            nb_steps: number of training steps.
            verbose: 1 attaches a ``TrainIntervalLogger2`` (kept in
                ``train_interval_logger``); values above 1 attach a
                ``TrainEpisodeLogger``. In both cases the agent's own
                logging is switched off.
            visualize: forwarded to the agent's ``fit``.
            log_interval: logging interval in steps.

        Returns:
            Whatever the agent's ``fit`` returns.

        Raises:
            RuntimeError: if the model has already been trained or loaded.
        """
        if self.__istrained:
            raise RuntimeError('this model is already trained')

        print('training ...')

        callbacks = []
        if verbose == 1:
            self.train_interval_logger = TrainIntervalLogger2(interval=log_interval)
            callbacks.append(self.train_interval_logger)
            verbose = 0
        elif verbose > 1:
            callbacks.append(TrainEpisodeLogger())
            verbose = 0

        hist = self.dqn.fit(self.env, nb_steps=nb_steps,
                            callbacks=callbacks, verbose=verbose,
                            visualize=visualize, log_interval=log_interval)
        self.__istrained = True

        try:
            # After training is done, we save the final weights.
            self.dqn.save_weights(self.weightfile, overwrite=True)
        except Exception:
            print('Unexpected error:', exc_info()[0])
            raise

        return hist

    def test(self, nb_episodes=10, visualize=False, verbose=1):
        """Run the agent for ``nb_episodes`` episodes and return the result of ``dqn.test``."""
        hist = self.dqn.test(self.env, nb_episodes=nb_episodes,
                             verbose=verbose, visualize=visualize)
        return hist
    

# Per-step action logs filled in by observe_and_predict; -1 marks "not played yet".
self_actions = np.full(1001, -1, dtype=int)
oppo_actions = np.full(1001, -1, dtype=int)

# Fresh agent that ignores any weight file already on disk.
d = DQNRPS(strategy=Strategy.RANDOM, recycle=False)

# Train for only a few steps so that this module finishes within 61 seconds.
# https://www.kaggle.com/c/rock-paper-scissors/overview/environment-rules
h = d.train(nb_steps=500, log_interval=100, verbose=1)
# Rebuild the agent so it reloads the weights train() just saved; this also
# creates a new environment whose user_hands list starts empty.
d = DQNRPS(strategy=Strategy.RANDOM, recycle=True)
h = d.test(nb_episodes=1000, verbose=1) 
# Hands the agent played during the test run, replayed later by
# observe_and_predict (presumably one hand per test episode -- verify).
hands = [int(i) for i in d.env.user_hands]


def observe_and_predict(observation, configuration):
    """Return this agent's action for the current step.

    Step 0 plays a uniformly random hand. Every later step replays the
    pre-computed ``hands`` list, wrapping around when the game lasts longer
    than the list, and falls back to a random hand if the list is empty.
    Played and observed actions are logged in the module-level
    ``self_actions`` / ``oppo_actions`` arrays when the step fits in them.

    Args:
        observation: object exposing ``step`` and, after step 0,
            ``lastOpponentAction``.
        configuration: unused.

    Returns:
        The action to play: 0, 1 or 2.
    """
    step = observation.step

    if step > 0:
        # The opponent's move from the previous step only becomes visible now.
        if step - 1 < len(oppo_actions):
            oppo_actions[step - 1] = observation.lastOpponentAction

    if step == 0 or not hands:
        self_act = np.random.randint(3)
    else:
        # Wrap around instead of raising IndexError past the end of ``hands``.
        self_act = int(hands[step % len(hands)])

    if step < len(self_actions):
        self_actions[step] = self_act
    return self_act

In [None]:
%%writefile random_agent.py
import numpy as np
def random_agent(observation, configuration):
    """Baseline agent: ignore both arguments and play a uniformly random hand."""
    action = np.random.randint(0, 3)
    return action

In [None]:
from kaggle_environments import evaluate, make
# Create the "rps" environment, configured with episodeSteps set to 1000.
venv = make("rps", configuration={"episodeSteps": 1000})

In [None]:
# Reset the environment, run submission.py against random_agent.py,
# then render the result in the notebook.
venv.reset()
venv.run(["submission.py", "random_agent.py"])
venv.render(mode="ipython", width=800, height=800)