In [1]:
from Environment import Environment
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

In [2]:
c1, c2, c3, arm, last_l, last_r = list(range(0, 6))
empty, red, blue, green = list(range(0, 4))
actions = ["pick1", "pick2", "pick3", "putL", "putR"]


def updateQ(Q, s, a, r, sn, gamma, alpha=0.05):
    old_value = Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], actions.index(a)]
    optimal_fut_value = np.max(Q[sn[c1], sn[c2], sn[c3], sn[arm], sn[last_l], sn[last_r], :])
    update = (1.0 - alpha) * old_value + alpha * (r + gamma * optimal_fut_value)
    Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], actions.index(a)] = update

def train(iterations=10000, traj_len=100, alpha=1.0, alpha_decay=0.999,  \
          alpha_min=0.01, gamma=0.95, exp_decay=0.999, exp_min=0.5):
    env = Environment()
    gamma = 0.95
    Q = np.ones((4, 4, 4, 4, 4, 4, 5))
    exp = 1.0
    for it in tqdm(range(iterations)):
        env.reset()
        s = env.get_current_state()
        for step in range(traj_len):
            if np.random.uniform(0, 1) <= max(exp, exp_min):
                a = actions[np.random.randint(0, 5)]
            else:
                a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
            r = env.get_reward(s, a)
            sn = env.execute_action(a)
            updateQ(Q, s, a, r, sn, gamma, max(alpha, alpha_min))
            s = sn
            alpha = alpha * alpha_decay
            exp = exp * exp_decay
    return Q


In [3]:
Q = train()
print(Q[1, 2, 3, :, 2, 1, :])

100%|██████████| 10000/10000 [00:23<00:00, 425.40it/s]

[[  58.67682122    3.8034111     4.99338698    8.60612388    5.55190119]
 [  18.69879499   13.28995953   16.41941059  145.09727608    6.32362966]
 [   1.20569088    1.23621512    1.29765906    1.3715994    15.09871113]
 [   1.6564378     1.37679154    1.74856131   30.34283655   -6.96930533]]





In [4]:
def getTrajectory(Q, maxLength=200):
    env = Environment()
    traj_s = []
    traj_a = []
    traj_r = []
    s = env.get_current_state()
    for i in range(maxLength):
        a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
        traj_s.append(s)
        traj_a.append(a)
        traj_r.append(env.get_reward(s, a))
        s = env.execute_action(a)
    return traj_s, traj_a, traj_r

def getTrajectories(Q, n=100, maxLength=200):
    trajs = []
    trajs.append([getTrajectory(Q, maxLength)])

In [5]:
# print(Q[1, 2, 3, :, 0, 0, :])
# print(Q[1, 2, 3, :, 2, 1, :])

trajs = getTrajectories(Q)


In [6]:
# rsum = 0
# counter = 0
# alog = []
# for i in range(0, 10000):
#     counter += 1
#     s = np.copy(s)
#     a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
#     alog.append(actions.index(a))
#     r = env.get_reward(s, a)
#     rsum += r
#     print((str(s) + ' | ' + a + ' | ' + str(r) + ' | {:.2f}'.format(rsum/counter)), end='\r')
#     s = env.execute_action(a)
#     # time.sleep(0.8)
    
# plt.hist(alog, bins=5, normed=1)
# plt.show()

In [7]:
# initial state distribution
# D = 

SyntaxError: invalid syntax (<ipython-input-7-bf52aa9678f9>, line 2)