In [10]:
from gameagent import Agent
import gym
from ounoise import OUNoise
import numpy as np
import matplotlib.pyplot as plt
import argparse

# Exploration-noise settings; they are handed positionally to OUNoise below.
# NOTE(review): `noise` is only referenced from a commented-out line in
# run_episode, which currently draws np.random.normal() instead.
action_size = 1
exploration_mu, exploration_theta, exploration_sigma = 0, 0.15, 0.25
noise = OUNoise(
    action_size,
    exploration_mu,
    exploration_theta,
    exploration_sigma,
)

def _compute_discounted_R(R, discount_rate=.999):
    """Return the standardized discounted return for every step of an episode.

    The return at step ``t`` is ``R[t] + discount_rate * return[t + 1]``,
    accumulated from the last step backwards.  The returns are then shifted
    to zero mean and, when they are not all identical, scaled to unit
    standard deviation.

    Args:
        R: sequence of per-step rewards (anything ``np.zeros_like`` accepts).
        discount_rate: multiplicative discount applied per step.

    Returns:
        A float32 array shaped like ``R``.  An empty ``R`` yields an empty
        array; an episode whose returns are all equal (e.g. a single step)
        yields zeros instead of NaN/inf.
    """
    discounted_r = np.zeros_like(R, dtype=np.float32)
    if discounted_r.size == 0:
        # Nothing to discount; mean()/std() of an empty array would only
        # emit warnings and produce NaN.
        return discounted_r

    running_add = 0
    for t in reversed(range(len(R))):
        running_add = running_add * discount_rate + R[t]
        discounted_r[t] = running_add

    discounted_r -= discounted_r.mean()
    spread = discounted_r.std()
    if spread > 0:
        # Only rescale when there is variation; dividing by a zero standard
        # deviation would fill the result with NaN/inf.
        discounted_r /= spread

    return discounted_r

def compute_discounted_R(record,discounted_rate = 0.999):
    """Replace the reward slot of every transition with its normalized return.

    Args:
        record: list of mutable transitions whose element at index 2 is the
            reward; the list entries are modified in place.
        discounted_rate: discount factor forwarded to
            ``_compute_discounted_R`` (previously accepted but ignored).

    Returns:
        The same ``record`` object, with index 2 of each entry overwritten.
    """
    if not record:
        # No transitions, so there is nothing to rewrite.
        return record

    rewards = [step[2] for step in record]
    normalized = _compute_discounted_R(rewards, discounted_rate)
    for step, value in zip(record, normalized):
        step[2] = value
    return record

def run_process(iteration, double_mode = False, train = True, render = False,
                train_batch_size = 128,verbose = False,reward_normalization = False,save_point = 10):
    """Run ``iteration`` rounds of episodes via ``run_episode``.

    Args:
        iteration: number of rounds to run.
        double_mode: when true, every round runs a second episode with
            ``train`` forced to ``False`` right after the first one.
        train: passed to the first episode of each round; also gates the
            checkpoint step.
        render, train_batch_size, verbose, reward_normalization: forwarded
            unchanged to ``run_episode``.
        save_point: checkpoint every ``save_point`` rounds.  A falsy value
            (0 or None) disables the checkpoint step instead of raising
            ``ZeroDivisionError``.
    """
    for iterate in range(iteration):
        print('iterate : ',iterate)
        run_episode(train, render, train_batch_size, verbose, reward_normalization)
        if double_mode:
            # Companion episode with training switched off.
            run_episode(False, render, train_batch_size, verbose, reward_normalization)

        # Logical `and` (not bitwise `&`) so truthy non-bool flags behave as
        # expected and the modulo is skipped when save_point is falsy.
        if train and save_point and (iterate + 1) % save_point == 0:
            # NOTE(review): the weight-saving calls are disabled, so this
            # branch currently only prints the message below.
            #agent.main_critic.model.save_weights("./well_trained_main_critic_"+str(iterate+1)+".h5")
            #agent.target_critic.model.save_weights("./well_trained_target_critic_"+str(iterate+1)+".h5")
            #agent.main_actor.model.save_weights("./well_trained_main_actor_"+str(iterate+1)+".h5")
            #agent.target_actor.model.save_weights("./well_trained_target_actor_"+str(iterate+1)+".h5")
            print('saved')

def run_episode(train = True, render = False, train_batch_size = 128,verbose = False,reward_normalization = False):
    """Play one episode using the module-level ``env`` and ``agent``.

    Observations are flattened to one row and min-max scaled with the
    observation space's ``low``/``high`` bounds before being handed to the
    agent or stored.  When ``train`` is true the finished episode is pushed
    into ``agent.memory`` and ``agent.train()`` is called on every step for
    which the memory holds more than ``train_batch_size * 10`` entries.

    Args:
        train: add exploration noise, store transitions and train the agent.
        render: call ``env.render()`` before every step.
        train_batch_size: sets the memory-size threshold for training.
        verbose: print every transition.
        reward_normalization: run ``compute_discounted_R`` over the episode
            before it is stored.

    Side effects:
        Appends the episode total to ``episode_reward_lst`` (training) or
        ``test_episode_reward_lst`` (otherwise) and prints it.
    """
    max_steps = 199  # episode is cut off once this many steps are recorded

    obs_low = env.observation_space.low
    obs_span = env.observation_space.high - obs_low

    def scale(observation):
        # One row, min-max scaled with the observation-space bounds.
        return (observation.reshape(1, -1) - obs_low) / obs_span

    record = []
    done = False
    frame = env.reset()
    ep_reward = 0
    while not done:
        if render:
            env.render()
        state = scale(frame)

        action = agent.get_action(state)
        if train:
            # Alternative kept from the original: noise.sample() instead of
            # np.random.normal().
            action = np.clip(action * 2 + np.random.normal(), -2, 2)
        else:
            # NOTE(review): evaluation clips the unscaled action to [-1, 1]
            # while training scales by 2 and clips to [-2, 2]; confirm this
            # asymmetry is intended.
            action = np.clip(action, -1, 1)
        next_frame, reward, done, _ = env.step(action)

        # Rebind rather than `+=`: when the reward is a NumPy array, `+=`
        # mutates the running total in place, so every stored transition
        # would alias one object and end up holding the final total.
        ep_reward = ep_reward + reward

        # The next state gets the same scaling as `state`, so the agent sees
        # one consistent representation.
        # NOTE(review): slot 2 stores the cumulative reward so far, not the
        # per-step reward; confirm that is what the agent expects.
        record.append([state, action, ep_reward, scale(next_frame), done])
        frame = next_frame
        if len(record) >= max_steps:
            done = True
        if verbose:
            print('state : ', state, ', action :', action, ', reward : ', reward,', done : ',done,
                  ', ep_reward : ',ep_reward)
        if done and train:
            if reward_normalization:
                record = compute_discounted_R(record)
            for step in record:
                agent.memory.add(step[0], step[1], step[2], step[3], step[4])
        if train and len(agent.memory) > train_batch_size * 10:
            print('trained_start')
            agent.train()
            print('trained_well')
    print("ep_reward:", ep_reward)
    if train:
        episode_reward_lst.append(ep_reward)
    else:
        test_episode_reward_lst.append(ep_reward)



In [11]:
# --- Environment -----------------------------------------------------------
environment = 'Pendulum-v0'

# --- Run-loop switches (forwarded to run_process) --------------------------
train, double_mode = True, True
render = verbose = False
reward_normalization = False

# --- Sizes and schedule ----------------------------------------------------
batch_size, epochs, save_point = 128, 100, 100

# --- Per-episode totals; run_episode appends to one of these each episode --
episode_reward_lst, test_episode_reward_lst = [], []

In [12]:
# Build the environment named in the config cell, then an agent sized from
# the first dimension of its observation and action spaces.
env = gym.make(environment)
agent = Agent(
    env.observation_space.shape[0],
    env.action_space.shape[0],
    train_batch_size=batch_size,
)

In [7]:
# Training run driven entirely by the values set in the config cell.
run_process(
    epochs,
    double_mode=double_mode,
    train=train,
    render=render,
    train_batch_size=batch_size,
    verbose=verbose,
    reward_normalization=reward_normalization,
    save_point=save_point,
)

iterate :  0
ep_reward: [-951.3926]
ep_reward: [-1245.5569]
iterate :  1
ep_reward: [-1612.268]
ep_reward: [-861.0428]
iterate :  2
ep_reward: [-1467.5261]
ep_reward: [-1509.5052]
iterate :  3
ep_reward: [-1666.8035]
ep_reward: [-1447.6425]
iterate :  4
ep_reward: [-1675.472]
ep_reward: [-1328.2253]
iterate :  5
ep_reward: [-883.21375]
ep_reward: [-1067.9294]
iterate :  6
trained_start
trained_well
ep_reward: [-1486.0402]
ep_reward: [-1189.0046]
iterate :  7
ep_reward: [-1065.4954]
ep_reward: [-1330.1123]
iterate :  8
ep_reward: [-1165.6113]
ep_reward: [-1159.4467]
iterate :  9
ep_reward: [-1686.6218]
ep_reward: [-1489.4423]
iterate :  10
ep_reward: [-1494.0138]
ep_reward: [-745.04755]
iterate :  11
ep_reward: [-1699.4592]
ep_reward: [-1086.0625]
iterate :  12
trained_start
trained_well
ep_reward: [-1001.25085]
ep_reward: [-1503.9646]
iterate :  13
ep_reward: [-1641.8251]
ep_reward: [-964.12964]
iterate :  14
ep_reward: [-1611.3851]
ep_reward: [-1055.0001]
iterate :  15
ep_reward: [-12

In [31]:
# Close the gym environment.
# NOTE(review): read top to bottom, this runs before the rendering evaluation
# cell that follows, which still calls env.reset()/env.render() through
# run_episode. The execution counts suggest the cells were run out of order;
# confirm the intended order so the notebook survives Restart & Run All.
env.close()

In [8]:
# Five rendered episodes with training switched off.
run_process(
    5,
    double_mode=False,
    train=False,
    render=True,
    train_batch_size=batch_size,
    verbose=verbose,
    reward_normalization=reward_normalization,
    save_point=save_point,
)

iterate :  0
ep_reward: [-1573.197]
iterate :  1
ep_reward: [-1907.1804]
iterate :  2
ep_reward: [-1061.6415]
iterate :  3
ep_reward: [-1152.4176]
iterate :  4
ep_reward: [-1203.1372]


In [9]:
# Close the environment once the rendered evaluation run above has finished.
env.close()