In [1]:
import argparse
import gym
import numpy as np
import os
import pickle
import torch

from td3 import TD3Agent
from main import train, test

In [4]:
# --- Episode budget ---
num_memory_fill_episodes = 10
num_train_episodes = 3000
num_test_episodes = 100

# --- Agent / optimisation settings (forwarded to TD3Agent and train) ---
memory_capacity = 10000
batchsize = 64
update_freq = 2
actor_lr = critic_lr = 1e-3

# --- Discounting, target update and noise settings ---
discount = 0.99
tau = 0.005
policy_noise_std, policy_noise_clip = 0.2, 0.5
exploration_noise = 0.1

# --- Run configuration ---
env_name = 'LunarLanderContinuous-v2'
results_folder = None  # when left as None, a path is derived from the settings above
train_mode = True
cuda_device = 'cuda:0'

# --- Seeds ---
train_seed = 12321
test_seed = [456, 12, 985234, 123, 3202]

In [9]:
# Use the configured CUDA device when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device(cuda_device)
else:
    device = torch.device("cpu")

In [10]:
# When no output directory was configured, derive one whose name encodes the
# environment and the hyper-parameters of this run.
if results_folder is None:
    results_folder = 'results/{}_disc{}_actorlr{}_criticlr{}_tau{}_noisestd{}_noiseclip{}_expl{}_d{}'.format(
        env_name,
        discount,
        actor_lr,
        critic_lr,
        tau,
        policy_noise_std,
        policy_noise_clip,
        exploration_noise,
        update_freq,
    )

# makedirs(..., exist_ok=True) replaces the check-then-mkdir pattern: it is safe
# to re-run, has no window between the existence check and the creation, and
# also creates missing parent directories of a custom (nested) results_folder.
os.makedirs('results', exist_ok=True)
os.makedirs(results_folder, exist_ok=True)

In [11]:
if train_mode:
    # Seed every source of randomness used here with the training seed.
    os.environ['PYTHONHASHSEED'] = str(train_seed)
    np.random.seed(train_seed)
    torch.manual_seed(train_seed)

    env = gym.make(env_name)
    try:
        env.seed(train_seed)
        env.action_space.np_random.seed(train_seed)

        td3_agent = TD3Agent(
            state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            max_action=env.action_space.high[0],  # clamp only works with numbers, not with arrays
            device=device,
            memory_capacity=memory_capacity,
            discount=discount,
            update_freq=update_freq,
            tau=tau,
            policy_noise_std=policy_noise_std,
            policy_noise_clip=policy_noise_clip,
            actor_lr=actor_lr,
            critic_lr=critic_lr,
            train_mode=train_mode,
        )

        train(
            env=env,
            td3_agent=td3_agent,
            epochs_train=num_train_episodes,
            epochs_fill_memory=num_memory_fill_episodes,
            batchsize=batchsize,
            exploration_noise=exploration_noise,
            results_folder=results_folder,
        )
    finally:
        # Close the environment even when training is interrupted (for example
        # by a KeyboardInterrupt) or raises, so it is never left open.
        env.close()

Memory filled:  1190
Ep: 0 | Ep reward: -180.31215696136587 | Moving avg: -180.31215696136587
Ep: 1 | Ep reward: -291.6474859133323 | Moving avg: -235.97982143734907


KeyboardInterrupt: 

In [14]:
def run_test(seed):
    """Evaluate the saved 'best' model for one random seed.

    Seeds Python hashing, NumPy, PyTorch and the environment with ``seed``,
    builds a TD3Agent for ``env_name``, loads the model named 'best' from
    ``results_folder`` and calls ``test`` for ``num_test_episodes`` episodes.

    Args:
        seed: Integer seed applied to all random number generators and passed
            on to ``test``.
    """
    print('=== TEST SEED: {} ==='.format(seed))
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    env = gym.make(env_name)
    try:
        env.seed(seed)

        td3_agent = TD3Agent(
            state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            max_action=env.action_space.high[0],  # clamp only works with numbers, not with arrays
            device=device,
            # Must be the boolean False: the string 'False' is non-empty and
            # therefore truthy, so it would be treated as "training enabled" by
            # any truthiness check (presumably how TD3Agent reads it - verify).
            train_mode=False,
        )
        td3_agent.load(path=results_folder, model_name='best')

        test(
            env=env,
            td3_agent=td3_agent,
            epochs_test=num_test_episodes,
            seed=seed,
            results_folder=results_folder,
        )
    finally:
        # Close the environment even if loading or testing fails.
        env.close()

In [15]:
# Evaluate with the first configured test seed only; the remaining entries of
# test_seed are not used by this cell.
run_test(test_seed[0])

=== TEST SEED: 456 ===
Ep: 0 | Ep reward: 106
Ep: 1 | Ep reward: 93
Ep: 2 | Ep reward: 151
Ep: 3 | Ep reward: 179
Ep: 4 | Ep reward: 110
Ep: 5 | Ep reward: 141
Ep: 6 | Ep reward: 82
Ep: 7 | Ep reward: 174
Ep: 8 | Ep reward: 247
Ep: 9 | Ep reward: 124
Ep: 10 | Ep reward: 140
Ep: 11 | Ep reward: 92
Ep: 12 | Ep reward: 83
Ep: 13 | Ep reward: 94
Ep: 14 | Ep reward: 140
Ep: 15 | Ep reward: 152
Ep: 16 | Ep reward: 157
Ep: 17 | Ep reward: 172
Ep: 18 | Ep reward: 125
Ep: 19 | Ep reward: 121
Ep: 20 | Ep reward: 99
Ep: 21 | Ep reward: 138
Ep: 22 | Ep reward: 99
Ep: 23 | Ep reward: 110
Ep: 24 | Ep reward: 93
Ep: 25 | Ep reward: 125
Ep: 26 | Ep reward: 134
Ep: 27 | Ep reward: 113
Ep: 28 | Ep reward: 187
Ep: 29 | Ep reward: 162
Ep: 30 | Ep reward: 223
Ep: 31 | Ep reward: 157
Ep: 32 | Ep reward: 130
Ep: 33 | Ep reward: 227
Ep: 34 | Ep reward: 136
Ep: 35 | Ep reward: 83
Ep: 36 | Ep reward: 549
Ep: 37 | Ep reward: 140
Ep: 38 | Ep reward: 113
Ep: 39 | Ep reward: 162
Ep: 40 | Ep reward: 108
Ep: 41 | Ep 