### https://stackoverflow.com/questions/52885901/how-to-save-python-script-as-py-file-on-jupyter-notebook/52886052

In [1]:
from collections import deque
import numpy as np
import argparse
import os
import time

from termcolor import colored

In [2]:
####################
#         #           #  Goal  #
####################
#         #           # Hole   #
####################

### make a Q world!

In [3]:
class Qworld():
    def __init__(self):
        # row&col
        self.col=4 # col : number of actions
        self.row=6 # 2x3 grid game
        # tables
        self.q_table=np.zeros([self.row,self.col]) # what we wil get when we do certain 'action' at certain 'state'
        self.init_transition_table()
        self.init_reward_table()
        # parameteres
        self.gamma=0.9 # discount rate
        self.epsilon=0.9 # 90% explore, 10% exploit
        self.epsilon_decay=0.9 # decaying exploration rate
        self.epsilon_min=0.1
        self.reset()
        self.is_explore=True
    
    def reset(self):
        self.state=0
        return self.state
    
    def isin_win(self):
        return self.state ==2
    
    # what we get in the current state (x)
    # what we get in the current state+action (o)
    def init_reward_table(self): 
        self.reward_table[1,2] = 100. # in state1(=(0,1) of the grid), you get 100 when you do action2(=go right)
        self.reward_table[4,2] = -100.
    
    def init_transition_table(self): # where the agent would be, when followed by the Q-table
        self.transition_table = np.zeros([self.row, self.col], dtype=int)
        self.transition_table[0,0]=0
        self.transition_table[0,1]=3
        self.transition_table[0,2]=1
        self.transition_table[0,3]=0
        
        self.transition_table[1,0]=0
        self.transition_table[1,1]=4
        self.transition_table[1,2]=2
        self.transition_table[1,3]=1
        
        # trainsition_table[2,n] : goal ( no state change )
        self.transition_table[2,0]=2
        self.transition_table[2,1]=2
        self.transition_table[2,2]=2
        self.transition_table[2,3]=2
        
        self.transition_table[3,0]=3
        self.transition_table[3,1]=3
        self.transition_table[3,2]=4
        self.transition_table[3,3]=0
        
        self.transition_table[4,0]=3
        self.transition_table[4,1]=4
        self.transition_table[4,2]=5
        self.transition_table[4,3]=1
        
        # trainsition_table[5,n] : hole ( no state change )
        self.transition_table[5,0]=5
        self.transition_table[5,1]=5
        self.transition_table[5,2]=5
        self.transition_table[5,3]=5
    
    def step(self,action)    :
        next_state = self.transition_table
        done = (next_state==2 or next_state==5)
        reward = self.reward_table[self.state, action]
        self.state = next_state
        return next_state, reward, done
    
    def act(self):
        # explore
        if np.random.rand() <= self.epsilon: # return 0~1
            self.is_explore = True
            return np.random.choice(4,1)[0]
        
        # exploit
        self.is_explore = False
        return np.argmax(self.q_table[self.state]) # take the action with biggest return on the certain row(=certain state)
    
    def update_q_table(self,state,action,reward,next_state):
        q_value = self.gamma*np.amax(self.q_table[next_state])
        q_value = reward+q_value
        self.q_table[state,action]=q_value
        
    def print_q_table(self):
        print("Q-Table(Epsilon : %0.2f)" % self.epsilon)
        print(self.q_table)
    
    def update_epsilon(self):
        if self.epsilon > self.epsilon_min: # no further decaying after the min value
            self.epsilon *= self.epsilon_decay 

In [7]:
# Showing Episode
def print_episode(episode, delay=1):
    os.system('clear')
    for _ in range(13):
        print('=', end='')
    print("")
    print("Episode ", episode)
    for _ in range(13):
        print('=', end='')
    print("")
    time.sleep(delay)

In [5]:
# Showing Status
def print_status(q_world, done, step, delay=1):
    os.system('clear')
    q_world.print_world(action, step)
    q_world.print_q_table()
    if done:
        print("-------EPISODE DONE--------")
        delay *= 2
    time.sleep(delay)

In [6]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    help_ = "Trains and show final Q Table"
    parser.add_argument("-t",
                        "--train",
                        help=help_,
                        action='store_true')
    args = parser.parse_args()

    if args.train:
        maxwins = 2000
        delay = 0
    else:
        maxwins = 10
        delay = 1

    wins = 0
    episode_count = 10 * maxwins
    # scores (max number of steps before goal) - good indicator of learning
    scores = deque(maxlen=maxwins)
    q_world = QWorld()
    step = 1

    # state, action, reward, next state iteration
    for episode in range(episode_count):
        state = q_world.reset()
        done = False
        print_episode(episode, delay=delay)
        while not done:
            action = q_world.act()
            next_state, reward, done = q_world.step(action)
            q_world.update_q_table(state, action, reward, next_state)
            print_status(q_world, done, step, delay=delay)
            state = next_state
            # if episode is done, perform housekeeping
            if done:
                if q_world.is_in_win_state():
                    wins += 1
                    scores.append(step)
                    if wins > maxwins:
                        print(scores)
                        exit(0)
                # Exploration-Exploitation is updated every episode
                q_world.update_epsilon()
                step = 1
            else:
                step += 1

    print(scores)
    q_world.print_q_table()

usage: ipykernel_launcher.py [-h] [-t]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\samsung\AppData\Roaming\jupyter\runtime\kernel-53dbbf53-dbe9-4300-a00a-e8c996464f8d.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
