two_treasure.py

from MDP import QLearning
import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible

N_STATES = 7                         # length of the 1-dimensional world
INIT_STATE = N_STATES // 2           # the shared token starts in the middle cell
ACTIONS = ['left', 'right', 'stay']  # available actions
MAX_EPISODES = 13                    # maximum number of episodes
FRESH_TIME = 0.4                     # delay (seconds) between rendered moves
A_TREASURE = 0                       # player A's treasure sits at the left end
B_TREASURE = N_STATES - 1            # player B's treasure sits at the right end

def next_player(player):
    # The two players simply alternate turns.
    return 'A' if player == 'B' else 'B'


def place_of_state(state):
    # A state is encoded as '<cell><player to move>', e.g. '3A'; the first
    # character is the token's cell index.
    return int(state[0])

def get_env_feedback(S, A, player):
    # How the agent interacts with the environment: apply the chosen action,
    # hand the turn to the other player, and compute the reward.
    if A == 'left' and int(S[0]) > A_TREASURE:
        S_ = str(int(S[0]) - 1)
    elif A == 'right' and int(S[0]) < B_TREASURE:
        S_ = str(int(S[0]) + 1)
    else:  # 'stay', or a move blocked at the world's edge
        S_ = S[0]
    S_ += next_player(player)
    if place_of_state(S_) == A_TREASURE or place_of_state(S_) == B_TREASURE:
        # The episode ends: +1 if the mover reached their own treasure,
        # -1 if they pushed the token onto the opponent's treasure.
        if (player == 'A' and place_of_state(S_) == A_TREASURE) or \
           (player == 'B' and place_of_state(S_) == B_TREASURE):
            R = 1
        else:
            R = -1
        S_ = 'terminal'
    else:
        R = 0
    return S_, R
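
# Illustrative sanity check of the environment dynamics (an added helper, not
# part of the original script, and never called by the training loop): states
# are encoded as '<cell><player to move>', and the mover is rewarded +1 for
# reaching their own treasure, -1 for reaching the opponent's.
def _demo_env_feedback():
    # A stands in cell 1 and steps left onto its own treasure: win.
    assert get_env_feedback('1A', 'left', 'A') == ('terminal', 1)
    # A stands in cell 5 and steps right onto B's treasure: loss.
    assert get_env_feedback('5A', 'right', 'A') == ('terminal', -1)
    # An ordinary move: reward 0 and the turn passes to B.
    assert get_env_feedback('3A', 'right', 'A') == ('4B', 0)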

def update_env(S, episode, step_counter, player, win_lose=''):
    # Render the environment as 'A-----B' with the current player's token in
    # between, and print a summary line once an episode ends.
    env_list = ['A'] + ['-'] * (N_STATES - 2) + ['B']
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s ' % (episode + 1, step_counter) + win_lose
        print('\r{}'.format(interaction))
    else:
        env_list[place_of_state(S)] = player.lower()
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)

def rl(start_player='A'):
    # Main RL loop: both players share a single Q-table and alternate turns.
    table = QLearning(ACTIONS)
    for episode in range(MAX_EPISODES):
        player = start_player
        step_counter = 0
        S = str(INIT_STATE) + player
        is_terminated = False
        update_env(S, episode, step_counter, player)
        while not is_terminated:
            if S != S[0] + player:  # sanity check: the state must name the player to move
                print("state/player mismatch!", end='\n\n')
            A = table.choose_action(S)
            S_, R = get_env_feedback(S, A, player)  # take action & get next state and reward
            table.learn(S, A, R, S_)
            S = S_  # move to next state
            win_lose = player + ' '
            win_lose += 'win' if R == 1 else 'lose'  # only displayed when the episode ends
            update_env(S, episode, step_counter + 1, player, win_lose)
            if S_ == 'terminal':
                is_terminated = True
            step_counter += 1
            player = next_player(player)
        print('\n', table.q_table.sort_index(axis=0), end='\n\n')
        time.sleep(2)
    return table
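
# The QLearning class imported from MDP is not shown in this file. The sketch
# below is a hypothetical reconstruction of the interface the loop above relies
# on (choose_action, learn, and a pandas q_table attribute); the class name,
# hyperparameters and defaults are assumptions, not the actual MDP module.
class QLearningSketch:
    def __init__(self, actions, learning_rate=0.1, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions          # action labels, e.g. ['left', 'right', 'stay']
        self.lr = learning_rate         # step size alpha
        self.gamma = reward_decay       # discount factor
        self.epsilon = e_greedy         # probability of acting greedily
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # Lazily add a zero-initialised row for states seen for the first time.
        if state not in self.q_table.index:
            new_row = pd.Series([0.0] * len(self.actions), index=self.q_table.columns, name=state)
            self.q_table = pd.concat([self.q_table, new_row.to_frame().T])

    def choose_action(self, state):
        # Epsilon-greedy: mostly pick the best-known action, sometimes explore.
        self.check_state_exist(state)
        if np.random.uniform() < self.epsilon:
            state_action = self.q_table.loc[state, :]
            best = state_action[state_action == state_action.max()].index
            action = np.random.choice(best)  # break ties at random
        else:
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        # One-step Q-learning update: Q(s,a) += lr * (target - Q(s,a)).
        self.check_state_exist(s)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            self.check_state_exist(s_)
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)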

if __name__ == "__main__":
    agent = rl()  # rl() returns the learner; its q_table attribute holds the learned values
    print('\r\nQ-table:\n')
    print(agent.q_table)