In [3]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import pickle
import pandas as pd

## Humanoid v2

### Expert performance:
returns [10399.465466297353, 10337.698440078071, 10338.03251510115, 10363.347077257335, 10520.142508634532, 10437.992393534745, 10375.590303129682, 10427.51001606666, 10265.334918376444, 10383.854722941793, 10383.34055343508, 10392.749227673174, 10386.273548840532, 10377.634438183406, 10445.100629123428, 10468.288190410423, 10431.12064455722, 10522.290767465352, 10412.97541826066, 10371.666146306461] <br>
mean return 10402.020396283677 <br>
std of return 58.8302440319096

In [3]:
# Load the recorded Humanoid expert rollouts. The context manager closes the
# file handle as soon as unpickling finishes (a bare open() leaks it).
# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# it on expert data that comes from a trusted source.
with open("expert_data/Humanoid-v2.pkl", "rb") as pickle_in:
    humanoid_expert = pickle.load(pickle_in)

In [4]:
# Show the top-level keys of the loaded expert data.
humanoid_expert.keys()

dict_keys(['observations', 'actions'])

In [11]:
# Pull the observation array out of the expert data.
observations = humanoid_expert['observations']

In [12]:
# Pull the matching action array out of the expert data.
actions = humanoid_expert['actions']

### Define model

In [13]:
# Sanity-check the array shapes before building the dataset.
observations.shape, actions.shape

((20000, 376), (20000, 1, 17))

In [16]:
# Reshape the actions to one 17-dim row per sample (drops the singleton axis).
actions = actions.reshape(-1, 17)

In [93]:
# Wrap the expert (observation, action) pairs in a dataset of float32 tensors.
actions_tensor = torch.from_numpy(actions).float()
observations_tensor = torch.from_numpy(observations).float()
dataset = torch.utils.data.TensorDataset(observations_tensor, actions_tensor)

# Every sample is used for training; the sampler draws all indices in a
# random order without replacement.
NUM_TRAIN = len(observations)
loader_train = torch.utils.data.DataLoader(
    dataset,
    batch_size=128,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(range(NUM_TRAIN)),
)

In [121]:
def training_loop(loader_train, model, optimizer, epochs=1, print_every=1):
    """Fit ``model`` to ``(x, y)`` batches by minimising the MSE loss.

    Args:
        loader_train: Iterable yielding ``(x, y)`` batches; it is iterated
            once per epoch.
        model: Module called as ``model(x)`` to produce predictions.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of passes over ``loader_train``.
        print_every: Report the loss every ``print_every`` epochs.

    The reported value is the loss of the most recent batch, not an epoch
    average. The final epoch is always reported, and only once. Nothing is
    printed when no batch was ever processed (``epochs == 0`` or an empty
    loader); previously that case raised on the undefined ``e`` / ``loss``.
    """
    criterion = nn.MSELoss()
    last_loss = None      # loss of the most recent batch, if any
    last_reported = None  # index of the last epoch that was printed

    for e in range(epochs):
        # Switching to training mode once per epoch is enough; nothing in the
        # batch loop changes the mode.
        model.train()
        for x, y in loader_train:
            scores = model(x)
            loss = criterion(scores, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            last_loss = loss.item()

        if last_loss is not None and e % print_every == 0:
            print('Epoch %d, loss = %.4f' % (e, last_loss))
            last_reported = e

    # Always report the final epoch, but avoid repeating a line that the
    # periodic report above has already printed.
    if last_loss is not None and last_reported != epochs - 1:
        print('Epoch %d, loss = %.4f' % (epochs - 1, last_loss))

In [101]:
def init_weights(m):
    """Initialise a linear layer in place; intended for ``model.apply``.

    Weights receive Xavier-uniform initialisation and the bias, when the
    layer has one, is set to the constant 0.01. Modules whose type is not
    exactly ``nn.Linear`` are left untouched (same matching rule as before).

    Args:
        m: Any module visited by ``Module.apply``.
    """
    if type(m) is not nn.Linear:
        return
    torch.nn.init.xavier_uniform_(m.weight)
    # Layers created with bias=False have m.bias set to None; dereferencing it
    # unconditionally raised AttributeError.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0.01)

# Fully connected policy with three linear layers: 376 inputs -> 17 outputs.
# Module.apply returns the module itself, so the initialisation is chained
# directly onto the constructor.
model = nn.Sequential(
    nn.Linear(376, 30),
    nn.ReLU(),
    nn.Linear(30, 15),
    nn.ReLU(),
    nn.Linear(15, 17),
).apply(init_weights)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [102]:
# Train the Humanoid policy for 30 epochs, reporting the loss every 5 epochs.
training_loop(loader_train, model, optimizer, epochs=30, print_every=5)

Epoch 0, loss = 1.2147
Epoch 5, loss = 1.1541
Epoch 10, loss = 0.8533
Epoch 15, loss = 0.8811
Epoch 20, loss = 0.7851
Epoch 25, loss = 0.8618
Epoch 29, loss = 0.6095


In [114]:
# Spot-check the policy output for a single expert observation.
model(observations_tensor[1])

tensor([-2.9196e-01, -8.6649e-02,  2.1288e-02,  5.7818e-01, -1.5000e-01,
        -1.0519e-01,  4.7215e-01, -3.6061e-01,  8.6249e-02,  9.1089e-01,
         1.1404e+00, -1.7401e-01, -1.5177e-01,  9.9504e-04,  5.5627e-01,
         4.1024e-01, -7.2940e-02], grad_fn=<AddBackward0>)

In [117]:
# Persist the whole model object. The context manager guarantees the file is
# flushed and closed once torch.save returns (the inline open() leaked it).
with open("policies/humanoid_3l_fc.md", 'wb') as f:
    torch.save(model, f)

In [112]:
# Also store the policy as a plain pickle, written with the newest protocol.
with open("policies/humanoid_3l_fc.pkl", 'wb') as fh:
    pickle.dump(model, fh, protocol=pickle.HIGHEST_PROTOCOL)

### Model performance:
returns [226.90052892567294, 212.3321194289658, 217.80195366576146, 204.62086051618942, 182.3108776594606, 236.57988388099307, 191.89164222695177, 215.00037708242786, 181.21875479360577, 236.3635765411728, 217.53778560875043, 221.21379086735246, 205.28428884513102, 334.3184507917499, 207.5064707678762, 219.45422690228023, 256.09821927536933, 204.7539900068401, 206.02273922400704, 207.93419224419713] <br>
mean return 219.25723646273778 <br>
std of return 31.578925930927426 <br>

## Reacher v2

### Expert performance:
returns [-4.198410075394117, -1.3188410506994994, -5.7354880391280965, -3.06706735463171, -3.620617467815022, -4.786380708719675, -1.3852393193543773, -3.3516123491724876, -1.2547292814536883, -6.505562306648722, -5.98983699591411, -3.526049717732696, -2.4442175691811974, -4.521031261323542, -6.025657346033555, -3.1225293997079513, -4.643144853035947, -6.123864040766139, -2.9946819433519587, -3.53563183633491] <br>
mean return -3.90752964581997 <br>
std of return 1.5917489917071845 <br>

In [118]:
# Load the recorded Reacher expert rollouts. The context manager closes the
# file handle as soon as unpickling finishes (a bare open() leaks it).
# NOTE(review): pickle.load can execute arbitrary code from the file; only use
# it on expert data that comes from a trusted source.
with open("expert_data/Reacher-v2.pkl", "rb") as pickle_in:
    reacher_expert = pickle.load(pickle_in)

observations = reacher_expert['observations']
actions = reacher_expert['actions']

# Last expression of the cell: display both shapes as a sanity check.
observations.shape, actions.shape

In [123]:
# Reshape the actions to one 2-dim row per sample.
actions = actions.reshape(-1, 2)

In [154]:
# Wrap the Reacher (observation, action) pairs in a dataset of float32 tensors.
actions_tensor = torch.from_numpy(actions).float()
observations_tensor = torch.from_numpy(observations).float()
dataset = torch.utils.data.TensorDataset(observations_tensor, actions_tensor)

# Every sample is used for training; the sampler draws all indices in a
# random order without replacement.
NUM_TRAIN = len(observations)
loader_train = torch.utils.data.DataLoader(
    dataset,
    batch_size=128,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(range(NUM_TRAIN)),
)

In [155]:
# Fully connected policy with three linear layers: 11 inputs -> 2 outputs.
# Module.apply returns the module itself, so the initialisation is chained
# directly onto the constructor.
model = nn.Sequential(
    nn.Linear(11, 30),
    nn.ReLU(),
    nn.Linear(30, 15),
    nn.ReLU(),
    nn.Linear(15, 2),
).apply(init_weights)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [156]:
# Train the Reacher policy for 45 epochs, reporting the loss every 5 epochs.
training_loop(loader_train, model, optimizer, epochs=45, print_every=5)

Epoch 0, loss = 0.1113
Epoch 5, loss = 0.0166
Epoch 10, loss = 0.0108
Epoch 15, loss = 0.0078
Epoch 20, loss = 0.0039
Epoch 25, loss = 0.0060
Epoch 30, loss = 0.0069
Epoch 35, loss = 0.0088
Epoch 40, loss = 0.0072
Epoch 44, loss = 0.0041


In [157]:
# Spot-check the policy output for a single expert observation.
model(observations_tensor[1])

tensor([-0.1235, -0.2020], grad_fn=<AddBackward0>)

In [158]:
# Expert action stored at the same index, for comparison with the prediction.
actions_tensor[1]

tensor([-0.2989, -0.3185])

In [159]:
# Persist the whole model object. The context manager guarantees the file is
# flushed and closed once torch.save returns (the inline open() leaked it).
with open("policies/reacher_3l_fc.md", 'wb') as f:
    torch.save(model, f)

### Model performance:
returns [-14.307916560565811, -9.077888321317364, -13.208763647245032, -9.222286514236384, -13.518732555193447, -15.503348259528812, -7.9961926282811016, -11.179946908777184, -13.191927016184627, -12.024706431236597, -6.32562905638001, -7.256840410833916, -6.73461669557026, -8.210166523930585, -5.142029045800414, -13.92841154171759, -11.264429763848174, -9.91746506852077, -13.744988381990433, -4.8021436262973936] <br>
mean return -10.327921447872793 <br> 
std of return 3.217880999896169 <br>

### More demonstrations

### Stats for 100 demonstrations:
mean return -3.700508867917537 <br>
std of return 1.7148592869182697

In [160]:
# Reload the Reacher expert rollouts; the context manager closes the file
# handle as soon as unpickling finishes (a bare open() leaks it).
# NOTE(review): same path as the earlier Reacher load - presumably the file
# was regenerated with more rollouts in between; confirm before re-running.
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open("expert_data/Reacher-v2.pkl", "rb") as pickle_in:
    reacher_expert = pickle.load(pickle_in)

In [161]:
# Pull the observation array out of the reloaded expert data.
observations = reacher_expert['observations']

In [162]:
# Pull the matching action array out of the reloaded expert data.
actions = reacher_expert['actions']

In [163]:
# Sanity-check the array shapes before building the dataset.
observations.shape, actions.shape

((5000, 11), (5000, 1, 2))

In [164]:
# Reshape the actions to one 2-dim row per sample (drops the singleton axis).
actions = actions.reshape(-1, 2)

### Train on all data

In [165]:
# Wrap the Reacher (observation, action) pairs in a dataset of float32 tensors.
actions_tensor = torch.from_numpy(actions).float()
observations_tensor = torch.from_numpy(observations).float()
dataset = torch.utils.data.TensorDataset(observations_tensor, actions_tensor)

# Train on all data: the sampler draws every index in a random order without
# replacement.
NUM_TRAIN = len(observations)
loader_train = torch.utils.data.DataLoader(
    dataset,
    batch_size=128,
    sampler=torch.utils.data.sampler.SubsetRandomSampler(range(NUM_TRAIN)),
)

In [166]:
# TODO (original note: "Change ReLU on arctang expReLU"): presumably try
# arctan / exponential-ReLU activations in place of ReLU - confirm intent.

# Fully connected policy with three linear layers: 11 inputs -> 2 outputs.
# Module.apply returns the module itself, so the initialisation is chained
# directly onto the constructor.
model = nn.Sequential(
    nn.Linear(11, 30),
    nn.ReLU(),
    nn.Linear(30, 15),
    nn.ReLU(),
    nn.Linear(15, 2),
).apply(init_weights)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [167]:
# Train the Reacher policy for 45 epochs, reporting the loss every 5 epochs.
training_loop(loader_train, model, optimizer, epochs=45, print_every=5)

Epoch 0, loss = 0.0081
Epoch 5, loss = 0.0060
Epoch 10, loss = 0.0010
Epoch 15, loss = 0.0006
Epoch 20, loss = 0.0010
Epoch 25, loss = 0.0007
Epoch 30, loss = 0.0002
Epoch 35, loss = 0.0008
Epoch 40, loss = 0.0001
Epoch 44, loss = 0.0008


In [168]:
# Spot-check the policy output for a single expert observation.
model(observations_tensor[1])

tensor([0.3352, 0.2521], grad_fn=<AddBackward0>)

In [169]:
# Expert action stored at the same index, for comparison with the prediction.
actions_tensor[1]

tensor([0.3568, 0.2442])

In [170]:
# Persist the whole model object. The context manager guarantees the file is
# flushed and closed once torch.save returns (the inline open() leaked it).
with open("policies/reacher_3l_fc_100demos.md", 'wb') as f:
    torch.save(model, f)

### Model performance (100 demonstrations):
mean return -6.852096058204267 <br>
std of return 3.469663395745245

In [174]:
# Evaluate the saved policy through the external run_model.py script
# (100 rollouts requested; the script itself is not part of this notebook).
!python run_model.py policies/reacher_3l_fc_100demos.md Reacher-v2 --num_rollouts 100

loading and building expert policy
loaded and built
  result = entry_point.load(False)
running build_ext
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
iter 0
iter 1
iter 2
iter 3
iter 4
iter 5
iter 6
iter 7
iter 8
iter 9
iter 10
iter 11
iter 12
iter 13
iter 14
iter 15
iter 16
iter 17
iter 18
iter 19
iter 20
iter 21
iter 22
iter 23
iter 24
iter 25
iter 26
iter 27
iter 28
iter 29
iter 30
iter 31
iter 32
iter 33
iter 34
iter 35
iter 36
iter 37
iter 38
iter 39
iter 40
iter 41
iter 42
iter 43
iter 44
iter 45
iter 46
iter 47
iter 48
iter 49
iter 50
iter 51
iter 52
iter 53
iter 54
iter 55
iter 56
iter 57
iter 58
iter 59
iter 60
iter 61
iter 62
iter 63
iter 64
iter 65
iter 66
iter 67
iter 68
iter 69
iter 70
iter 71
iter 72
iter 73
iter 74
iter 75
iter 76
iter 77
iter 78
iter 79
iter 80
iter 81
iter 82
iter 83
iter 84
iter 85
i