# Reinforcement Learning Practice with OpenAI Gym

In [None]:
# Training the network

import gym
import keras
import random
import math
import numpy as np
from collections import deque

# Training parameters
n_episodes = 10        # number of episodes run() iterates over
n_win_ticks = 195      # mean score that run() treats as "solved"
max_env_steps = None   # optional override of the per-episode step limit

gamma = 1.0            # discount factor applied to the best next-state Q-value
epsilon = 1.0          # upper bound on the exploration rate
epsilon_min = 0.01     # lower bound on the exploration rate
epsilon_decay = 0.995  # scale factor inside the logarithmic epsilon schedule
alpha = 0.01           # learning rate handed to the Adam optimizer
alpha_decay = 0.01     # learning-rate decay handed to the Adam optimizer

batch_size = 64        # maximum number of transitions sampled per replay step
monitor = False        # NOTE(review): not read by any code visible in this notebook
quiet = False          # set to True to silence the progress messages

# Environment parameters
memory = deque(maxlen=100000)  # replay buffer; the oldest entries fall off when full
env = gym.make('CartPole-v0')
if max_env_steps is not None:
    # gym's TimeLimit wrapper keeps its limit in the private
    # `_max_episode_steps` attribute. Assigning `env.max_episode_steps`
    # only creates an unused attribute and leaves the limit unchanged.
    env._max_episode_steps = max_env_steps
   
# Building the neural network
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Model definition: a small fully connected network that maps the 4-value
# observation to one Q-value estimate per action (2 outputs).
model = Sequential()
model.add(Dense(24, input_dim=4, activation='relu'))
model.add(Dense(48, activation='relu'))
# The outputs are regression targets (Q-values) trained with MSE, so the last
# layer must be linear. A ReLU here clamps every prediction at zero and an
# output unit that goes negative stops receiving gradient altogether.
model.add(Dense(2, activation='linear'))
model.compile(loss='mse', optimizer=Adam(lr=alpha, decay=alpha_decay))

# Define necessary functions

def remember(state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) transition to the replay buffer."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)
   
def choose_action(state, epsilon):
    """Pick an action epsilon-greedily.

    With probability `epsilon` (a uniform draw <= epsilon) a random action is
    sampled from the environment's action space; otherwise the index of the
    largest value predicted by the model for `state` is returned.
    """
    explore = np.random.random() <= epsilon
    if explore:
        return env.action_space.sample()

    q_values = model.predict(state)
    return np.argmax(q_values)

def get_epsilon(t):
    """Return the exploration rate for step index `t`.

    The rate follows 1 - log10((t + 1) * epsilon_decay), clipped to the
    closed interval [epsilon_min, epsilon].
    """
    scheduled = 1.0 - math.log10((t + 1) * epsilon_decay)
    capped = min(epsilon, scheduled)
    return max(epsilon_min, capped)
   
def preprocess_state(state):
    """Reshape a 4-value observation into a single-row batch of shape (1, 4)."""
    single_row_batch = (1, 4)
    return np.reshape(state, single_row_batch)

def replay(batch_size, epsilon):
    """Fit the model on a random minibatch drawn from the replay buffer.

    For every sampled transition the target for the action that was taken is
    `reward` when the episode ended, otherwise
    `reward + gamma * max(Q(next_state))`; targets for the other actions are
    the model's current predictions, so they contribute no error.

    Args:
        batch_size: Maximum number of transitions to sample; fewer are used
            while the buffer holds fewer entries.
        epsilon: Unused. Kept so existing callers keep working. The previous
            in-function `epsilon *= epsilon_decay` only rebound this local
            parameter and never affected exploration, which is scheduled by
            `get_epsilon`.

    Returns:
        None. The model is updated in place.
    """
    sample_size = min(len(memory), batch_size)
    if sample_size < 1:
        # Nothing to learn from yet; fitting on empty arrays would fail.
        return

    minibatch = random.sample(memory, sample_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Stored states are (1, 4) rows, so stacking yields (sample_size, 4).
    state_batch = np.vstack(states)
    next_state_batch = np.vstack(next_states)
    action_idx = np.asarray(actions, dtype=int)
    reward_arr = np.asarray(rewards, dtype=float)
    done_arr = np.asarray(dones, dtype=bool)

    # Two batched forward passes instead of two per sampled transition.
    targets = model.predict(state_batch)
    best_next_q = np.max(model.predict(next_state_batch), axis=1)

    # Terminal transitions have no future value to bootstrap from.
    targets[np.arange(sample_size), action_idx] = np.where(
        done_arr, reward_arr, reward_arr + gamma * best_next_q
    )

    model.fit(state_batch, targets, batch_size=sample_size, verbose=0)
       
# Define run function

def run():
    """Train the agent on `env` for up to `n_episodes` episodes.

    Each episode is played to completion with an epsilon-greedy policy while
    every transition is stored in the replay buffer; one replay step is run
    after the episode. The score of an episode is the number of steps it
    lasted, and the running mean is taken over the most recent 100 scores.

    Returns:
        `e - 100` if the mean score reaches `n_win_ticks` at an episode index
        `e >= 100`; otherwise the index of the last episode run (0 when no
        episode was run at all).
    """
    scores = deque(maxlen=100)
    e = 0  # keeps the final report/return valid when n_episodes is 0

    for e in range(n_episodes):
        # The schedule depends only on the episode index, so compute it once
        # per episode instead of once per environment step.
        exploration_rate = get_epsilon(e)

        state = preprocess_state(env.reset())
        done = False
        ticks = 0
        while not done:
            action = choose_action(state, exploration_rate)
            next_state, reward, done, _ = env.step(action)
            env.render()
            next_state = preprocess_state(next_state)
            remember(state, action, reward, next_state, done)
            state = next_state
            ticks += 1

        scores.append(ticks)
        mean_score = np.mean(scores)

        if mean_score >= n_win_ticks and e >= 100:
            if not quiet:
                print('Ran {} episodes. solved after {} trails'.format(e, e - 100))
            return e - 100
        if e % 20 == 0 and not quiet:
            # The mean covers the score window (up to 100 episodes), not the
            # 20-episode reporting interval the old message claimed.
            print('[Episode {}] - Mean survival time over last 100 episodes was {} ticks.'.format(e, mean_score))

        replay(batch_size, exploration_rate)

    if not quiet:
        print('Did not solve after {} episodes'.format(e))
    return e


# Start the training loop.
run()

-------------------------------------------------------------------------------------------------------------------

# Trials for understanding the OpenAI Gym features used for CartPole

In [14]:
import gym
import numpy as np

env = gym.make('CartPole-v0')

# Weights of a random linear policy, drawn once from [-1, 1).
# They were previously re-drawn on every step, so the action never followed
# a single policy and the dot product with the observation was meaningless.
parameters = np.random.rand(4) * 2 - 1

observation = env.reset()
totalreward = 0
for _ in range(200):
    # Push left (0) or right (1) depending on the sign of the weighted observation.
    action = 0 if np.matmul(parameters, observation) < 0 else 1
    observation, reward, done, info = env.step(action)
    totalreward += reward
    if done:
        break
print(totalreward)

[2019-04-25 17:11:56,585] Making new env: CartPole-v0


23.0


In [24]:
# Reset the environment and score the fresh observation with the policy
# weights; `env` and `parameters` must already exist from an earlier cell.
observation = env.reset()
parameters @ observation

-0.027835955353812211

In [1]:
import gym
import numpy as np

env = gym.make('CartPole-v0')

# Play 10 episodes of at most 100 steps each with uniformly random actions.
try:
    for i_episode in range(10):
        observation = env.reset()
        for t in range(100):
            env.render()
            # print(observation)  # uncomment to trace each observation
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)

            if done:
                print("episode finish after {} timestamp".format(t+1))
                break
finally:
    # render() opens a viewer window; close it even if the loop is
    # interrupted, otherwise the window is left hanging.
    env.close()

[2019-05-02 21:15:02,146] Making new env: CartPole-v0
  result = entry_point.load(False)


[-0.02862423  0.01238449 -0.02407212  0.04599617]
[-0.02837654 -0.18238416 -0.0231522   0.33098798]
[-0.03202422  0.01305957 -0.01653244  0.03109482]
[-0.03176303  0.20841466 -0.01591054 -0.26675808]
[-0.02759474  0.01352335 -0.0212457   0.02086437]
[-0.02732427  0.20894345 -0.02082842 -0.27844538]
[-0.0231454   0.40435624 -0.02639733 -0.57762411]
[-0.01505828  0.59983804 -0.03794981 -0.87850456]
[-0.00306152  0.79545455 -0.0555199  -1.18287263]
[ 0.01284757  0.99125124 -0.07917735 -1.49242959]
[ 0.0326726   1.18724238 -0.10902594 -1.8087491 ]
[ 0.05641745  1.38339833 -0.14520092 -2.13322619]
[ 0.08408541  1.18998353 -0.18786545 -1.88869622]
episode finish after 13 timestamp
[ 0.01937387  0.01915979 -0.01639621  0.02156771]
[ 0.01975707  0.21451299 -0.01596485 -0.27624298]
[ 0.02404733  0.01962241 -0.02148971  0.01136218]
[ 0.02443977 -0.17518486 -0.02126247  0.29718813]
[ 0.02093608 -0.36999736 -0.01531871  0.58309013]
[ 0.01353613 -0.56490139 -0.0036569   0.87090835]
[ 0.0022381  -0.

[-0.06939526  0.05091423 -0.03040429 -0.27146638]
[-0.06837697 -0.14376097 -0.03583361  0.01147397]
[-0.07125219  0.05185608 -0.03560413 -0.29229605]
[-0.07021507  0.24746711 -0.04145006 -0.59599205]
[-0.06526573  0.05294901 -0.0533699  -0.31664834]
[-0.06420675 -0.14137375 -0.05970286 -0.04126253]
[-0.06703422  0.05455126 -0.06052811 -0.35216864]
[-0.0659432  -0.13966007 -0.06757149 -0.07916953]
[-0.0687364   0.05636221 -0.06915488 -0.39238284]
[-0.06760915 -0.1377137  -0.07700253 -0.12228015]
[-0.07036343  0.05842217 -0.07944814 -0.43822854]
[-0.06919499 -0.13549066 -0.08821271 -0.17160998]
[-0.0719048   0.06077584 -0.09164491 -0.49076596]
[-0.07068928 -0.13294192 -0.10146023 -0.22831461]
[-0.07334812  0.06347256 -0.10602652 -0.55119935]
[-0.07207867 -0.13001299 -0.11705051 -0.29371423]
[-0.07467893 -0.32328851 -0.12292479 -0.04011903]
[-0.0811447  -0.51645294 -0.12372717  0.21139232]
[-0.09147376 -0.31979915 -0.11949933 -0.11761645]
[-0.09786974 -0.51302425 -0.12185165  0.13510495]


In [16]:
import gym
import numpy as np

env = gym.make('CartPole-v0')

# Show the action space, the observation space, and the observation bounds
# (upper first, then lower), one per line.
space_details = (
    env.action_space,
    env.observation_space,
    env.observation_space.high,
    env.observation_space.low,
)
for detail in space_details:
    print(detail)

[2019-05-02 21:28:52,873] Making new env: CartPole-v0


Discrete(2)
Box(4,)
[  4.80000000e+00   3.40282347e+38   4.18879020e-01   3.40282347e+38]
[ -4.80000000e+00  -3.40282347e+38  -4.18879020e-01  -3.40282347e+38]
