In [1]:
from Environment import Environment
import numpy as np
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from collections import Counter

import torch
from torch.autograd import Variable

In [4]:
c1, c2, c3, arm, last_l, last_r = list(range(0, 6))
empty, red, blue, green = list(range(0, 4))
actions = ["pickL", "pickM", "pickR", "putL", "putR"]


def updateQ(Q, s, a, r, sn, gamma, alpha=0.05):
    old_value = Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], actions.index(a)]
    optimal_fut_value = np.max(Q[sn[c1], sn[c2], sn[c3], sn[arm], sn[last_l], sn[last_r], :])
    update = (1.0 - alpha) * old_value + alpha * (r + gamma * optimal_fut_value)
    Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], actions.index(a)] = update

def train(iterations=10000, traj_len=100, alpha=1.0, alpha_decay=0.999,  \
          alpha_min=0.05, gamma=0.95, exp_decay=0.999, exp_min=0.5, stats=False):
    env = Environment()
    gamma = 0.95
    Q = np.ones((4, 4, 4, 4, 4, 4, 5))
    exp = 1.0
    for it in tqdm(range(iterations)):
        env.reset()
        s = env.get_current_state()
        for step in range(traj_len):
            if np.random.uniform(0, 1) <= max(exp, exp_min):
                a = actions[np.random.randint(0, 5)]
            else:
                a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
            r = env.get_reward(s, a)
            sn = env.execute_action(a)
            updateQ(Q, s, a, r, sn, gamma, max(alpha, alpha_min))
            s = sn
            alpha = alpha * alpha_decay
            exp = exp * exp_decay
    return Q


In [5]:
Q = train(iterations=100000)
print(Q[1, 2, 3, :, 2, 1, :])

100%|██████████| 100000/100000 [03:33<00:00, 468.36it/s]

[[ 622.34556198  619.31449211  665.17898279  625.78784163  622.00692976]
 [   1.            1.            1.            1.            1.        ]
 [   1.            1.            1.            1.            1.        ]
 [   1.            1.            1.            1.            1.        ]]





In [4]:
def getTrajectory(Q, maxLength=200):
    env = Environment()
    traj_s = []
    traj_a = []
    traj_r = []
    s = env.get_current_state()
    for i in range(maxLength):
        a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
        traj_s.append(s)
        traj_a.append(a)
        traj_r.append(env.get_reward(s, a))
        s = env.execute_action(a)
    return traj_s, traj_a, traj_r

def getTrajectories(Q, M=100, maxLength=50):
    trajs = []
    for i in range(M):
        trajs.append([getTrajectory(Q, maxLength)])
    return np.array(trajs).reshape(M,3,maxLength)

In [5]:
# print(Q[1, 2, 3, :, 0, 0, :])
# print(Q[1, 2, 3, :, 2, 1, :])

In [6]:
# rsum = 0
# counter = 0
# alog = []
# for i in range(0, 10000):
#     counter += 1
#     s = np.copy(s)
#     a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
#     alog.append(actions.index(a))
#     r = env.get_reward(s, a)
#     rsum += r
#     print((str(s) + ' | ' + a + ' | ' + str(r) + ' | {:.2f}'.format(rsum/counter)), end='\r')
#     s = env.execute_action(a)
#     # time.sleep(0.8)
    
# plt.hist(alog, bins=5, normed=1)
# plt.show()

In [7]:
env = Environment()
# initial state distribution
D = env.get_initial_state()

In [8]:
def state_action_to_fs(s, a):
    f = []
    for sv in range(len(s)):
        sv_one_hot = [0] * 4
        sv_one_hot[s[sv]] = 1
        f += sv_one_hot
    a_one_hot = [0] * 5
    a_one_hot[actions.index(a)] = 1
    f += a_one_hot
    return np.array(f)

def traj_to_fss(traj):
    res = []
    for i in range(traj.shape[1]):
        s = traj[0,i]
        a = traj[1,i]
        res.append(state_action_to_fs(s, a))
    return np.array(res)

def trajs_to_fss(trajs):
    res = []
    for i in range(len(trajs)):
        res.append(traj_to_fss(trajs[i]))
    return np.array(res)
    
trajs = getTrajectories(Q, M=500)
print(state_action_to_fs(trajs[0,0,0], trajs[0,1,0]))
print('')
print(traj_to_fss(trajs[0,:,:])[0:2])

[0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0]

[[0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0]
 [0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0]]


In [9]:
def f_zeta(traj):
    res = np.zeros(29)
    for i in range(traj.shape[1]):
        s = traj[0,i]
        a = traj[1,i]
        res += state_action_to_fs(s, a)
    res /= traj.shape[1]
    return res

def f_zetas(trajs):
    res = []
    for i in range(trajs.shape[0]):
        traj = trajs[i,:,:]
        res.append(f_zeta(traj))
    res = np.array(res)
    return res

print(f_zeta(trajs[0,:,:]))
print()
print(f_zetas(trajs))[0:2]

[ 0.    0.    1.    0.    0.    0.48  0.24  0.28  0.    0.    1.    0.    0.5
  0.22  0.14  0.14  0.    0.72  0.16  0.12  0.    0.    0.48  0.52  0.
  0.48  0.02  0.28  0.22]
()
[[ 0.    0.    1.    0.    0.    0.48  0.24  0.28  0.    0.    1.    0.
   0.5   0.22  0.14  0.14  0.    0.72  0.16  0.12  0.    0.    0.48  0.52
   0.    0.48  0.02  0.28  0.22]
 [ 0.    0.    1.    0.    0.    0.44  0.14  0.42  0.    0.04  0.96  0.
   0.5   0.2   0.1   0.2   0.    0.76  0.08  0.16  0.    0.    0.12  0.88
   0.    0.46  0.04  0.28  0.22]]


In [10]:
def expexted_emp_feat_count(trajs):
    res = np.zeros(29)
    for i in range(trajs.shape[0]):
        traj = trajs[i,:,:]
        res += f_zeta(traj)
    res /= trajs.shape[0]
    return res

print(expexted_emp_feat_count(trajs))

[ 0.       0.03056  0.94992  0.01952  0.       0.40064  0.30908  0.29028
  0.       0.05888  0.89432  0.0468   0.5      0.16312  0.16924  0.16764
  0.       0.67656  0.09104  0.2324   0.       0.0024   0.4764   0.5212
  0.01128  0.43144  0.05728  0.25596  0.24404]


In [11]:
class Rpred(torch.nn.Module):
    def __init__(self):
        super(Rpred, self).__init__()
        self.lin1 = torch.nn.Linear(29, 200)
        self.lin2 = torch.nn.Linear(200, 200)
        self.lin3 = torch.nn.Linear(200, 1)
        self.tanh = torch.nn.Tanh()
        
    def forward(self, x):
        out = self.lin1(x).clamp(min=0)
        out = self.lin2(out).clamp(min=0)
        out = self.tanh(self.lin3(out))
        return out
    
rpred = Rpred()
optimizer = torch.optim.Adagrad(rpred.parameters())

In [12]:
fss_trajs = trajs_to_fss(trajs)
# print(fss_trajs.shape)
fss_trajs = Variable(torch.from_numpy(fss_trajs)).float()

for i in range(0, 250):
    rpreds = rpred(fss_trajs).view(fss_trajs.data.shape[0],fss_trajs.data.shape[1])
    # print(rpreds.data[0,:5])

    R_of_zeta = torch.sum(rpreds, dim=1)

    objective = (1.0 / fss_trajs.data.shape[0]) * torch.sum(R_of_zeta) - torch.log(torch.sum(torch.exp(R_of_zeta)))
    loss = -objective
    if i % 49 == 0:
        print(loss.data[0])

    optimizer.zero_grad()
    loss.backward(retain_graph=True)
    optimizer.step()

6.23168182373
6.21478939056
6.21471595764
6.21466732025
6.21465778351
6.21464061737


In [13]:
print(fss_trajs.data.shape)
res = rpred(fss_trajs)
print(((res-torch.min(res))/(torch.max(res)-torch.min(res))).data[0, 0:10,0])
print(trajs[0,0,0:10])
print(trajs[0,1,0:10])
print(trajs[0,2,0:10])


torch.Size([500, 50, 29])

 0.1496
 0.7039
 0.2371
 0.5003
 0.2850
 0.4999
 0.1589
 0.4418
 0.2356
 0.6477
[torch.FloatTensor of size 10]

[array([2, 2, 2, 0, 3, 2]) array([2, 3, 2, 2, 3, 2])
 array([2, 3, 2, 0, 2, 2]) array([2, 2, 2, 3, 2, 2])
 array([2, 2, 2, 0, 2, 3]) array([2, 1, 2, 2, 2, 3])
 array([2, 1, 2, 0, 2, 2]) array([2, 2, 2, 1, 2, 2])
 array([2, 2, 2, 0, 1, 2]) array([2, 3, 2, 2, 1, 2])]
['pick2' 'putL' 'pick2' 'putR' 'pick2' 'putR' 'pick2' 'putL' 'pick2' 'putR']
[0 30 0 60 0 40 0 110 0 40]


In [14]:
print(res.data.numpy().reshape(500, 50).shape)
print(trajs.shape)
print(trajs[0,2,0:10])
trajs[:,2,:] = ((res-torch.min(res))/(torch.max(res)-torch.min(res))).data.numpy().reshape(500, 50)
print(trajs[0,2,0:10])

(500, 50)
(500, 3, 50)
[0 30 0 60 0 40 0 110 0 40]
[0.14955051243305206 0.7038653492927551 0.23709285259246826
 0.5003016591072083 0.28498154878616333 0.49986353516578674
 0.1589118391275406 0.4418087899684906 0.23557858169078827
 0.6476856470108032]


In [22]:
def trainFromRpred(rpred, iterations=10000, traj_len=100, alpha=1.0, alpha_decay=0.999,  \
          alpha_min=0.01, gamma=0.95, exp_decay=0.999, exp_min=0.5):
    env = Environment()
    gamma = 0.95
    Q = np.ones((4, 4, 4, 4, 4, 4, 5))
    exp = 1.0
    for it in tqdm(range(iterations)):
        env.reset()
        s = env.get_current_state()
        for step in range(traj_len):
            if np.random.uniform(0, 1) <= max(exp, exp_min):
                a = actions[np.random.randint(0, 5)]
            else:
                a = actions[np.argmax(Q[s[c1], s[c2], s[c3], s[arm], s[last_l], s[last_r], :])]
            f = state_action_to_fs(s, a)
            r = rpred(Variable(torch.from_numpy(f)).float())
            sn = env.execute_action(a)
            updateQ(Q, s, a, r, sn, gamma, max(alpha, alpha_min))
            s = sn
            alpha = alpha * alpha_decay
            exp = exp * exp_decay
    return Q

In [23]:
print(Q[1, 2, 3, :, 2, 1, :])
Qirl = trainFromRpred(rpred, iterations=100000)
print(Qirl[1, 2, 3, :, 2, 1, :])

  0%|          | 0/100000 [00:00<?, ?it/s]

[[ 587.59561124  559.69719939  589.26943229  567.06267407  565.82049886]
 [ 610.60765574  613.05403136  610.41672609  642.65827027  567.0862429 ]
 [ 547.2628839   548.92877154  539.04725131  538.25218967  597.47123372]
 [ 475.13891219  491.39377715  476.11314133  544.31853382  369.36045139]]





ValueError: setting an array element with a sequence.

In [17]:
trajs = getTrajectories(Q, M=500)
trajsirl = getTrajectories(Q, M=500)
print(np.mean(trajs[:,2,:]))
print(np.mean(trajsirl[:,2,:]))

32.0772
31.7352


In [24]:
# def freq_sPrime_after(trajs, s, a, sPrime):
#     if a.startswith("pick"):
#         return 1.0/3
#     else:
#         return 1.0
#     # TODO: use commented code and populate in-memory transition matrix
# #     successors = []
# #     for traj in trajs:
# #         for i in range(traj.shape[1] - 1):
# #             if np.array_equal(traj[0,i], s) and traj[1,i] == a:
# #                 successors.append(traj[0,i+1])
# #     successors = np.array(successors)
# #     count_sPrime = len(np.where((successors == sPrime).all(axis=1))[0])
# #     freq = float(count_sPrime) / len(successors)
# #     return freq
    
# r = []
# for i in range(trajs.shape[2]-1):
#     r.append(np.log(freq_sPrime_after(trajs, trajs[0, 0, i], trajs[0, 1, i], trajs[0, 0, i+1])))
# r = np.array(r)
# np.sum(r)

In [43]:
s = [1, 2, 3, 3, 2, 1]
exp_val = Q[s[0], s[1], s[2], s[3], s[4], s[5], actions.index("putR")]
possible_vals = Q[s[0], s[1], s[2], s[3], s[4], s[5], :]
best_val = np.max(possible_vals)
quality = exp_val / best_val
print(possible_vals)
print(exp_val)
print(best_val)
print(quality)

[ 475.13891219  491.39377715  476.11314133  544.31853382  369.36045139]
369.360451392
544.318533819
0.678574085657
