In [None]:
import gym
import time
import numpy as np
import matplotlib.pyplot as plt

# --- Q-learning hyperparameters ---------------------------------------------
LEARNING_RATE = 0.1   # weight given to the new estimate in each Q update
DISCOUNT = 0.95       # weight of the best next-state Q value in the update target
EPISODES = 2000       # number of training episodes
SHOW_EVERY = 500      # render + report statistics every this many episodes

# --- Exploration schedule ---------------------------------------------------
EPSILON = 0.5         # initial probability of taking a random action
START_EPSILON_DECAYING = 1
END_EPISION_DECAYING = EPISODES // 2
# Amount subtracted from epsilon after each episode inside the decay window.
decay_span = END_EPISION_DECAYING - START_EPSILON_DECAYING
epsilon_decay_value = EPSILON / decay_span

# --- Environment ------------------------------------------------------------
env = gym.make("MountainCar-v0")

# --- Q-table ----------------------------------------------------------------
# 20 buckets per observation dimension.
DISCRETE_OS_SIZE = [20 for _ in env.observation_space.high]
# Width of one bucket along each observation dimension.
observation_span = env.observation_space.high - env.observation_space.low
discrete_os_win_size = observation_span / DISCRETE_OS_SIZE

# One Q value per (bucket, ..., bucket, action), initialised uniformly in [-2, 0).
q_table = np.random.uniform(
    low=-2,
    high=0,
    size=[*DISCRETE_OS_SIZE, env.action_space.n],
)

# --- Training statistics ----------------------------------------------------
ep_rewards = []  # total reward of every episode, in order
aggr_ep_reward = {key: [] for key in ("ep", "avg", "min", "max")}


def get_discrete_state(state):
    """Map a continuous observation to its Q-table bucket indices.

    Args:
        state: observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices, one per observation dimension,
        usable directly as an index into ``q_table``.
    """
    scaled = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    indices = scaled.astype(int)
    # An observation exactly on the upper bound would map to bucket
    # ``len == DISCRETE_OS_SIZE[i]`` (out of range) and one below the lower
    # bound to a negative index (which silently wraps around), so clamp.
    indices = np.clip(indices, 0, np.array(DISCRETE_OS_SIZE) - 1)
    return tuple(indices)


# Tabular Q-learning with an epsilon-greedy policy.
# Decay a working copy so the EPSILON constant keeps its configured value.
epsilon = EPSILON

for episode in range(EPISODES):
    episode_reward = 0
    # Every SHOW_EVERY-th episode is rendered and reported.
    is_report_episode = episode % SHOW_EVERY == 0
    if is_report_episode:
        print(episode)

    discrete_state = get_discrete_state(env.reset())

    done = False
    while not done:
        # Epsilon-greedy action selection: exploit the best known action,
        # otherwise explore with a uniformly random one.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)

        state_prime, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(state_prime)

        if is_report_episode:
            env.render()

        q_index = discrete_state + (action,)
        if not done:
            # Blend the old estimate with the bootstrapped target
            # reward + DISCOUNT * max_a' Q(s', a').
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            q_table[q_index] = (
                (1 - LEARNING_RATE) * current_q
                + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            )
        elif state_prime[0] >= env.goal_position:
            # Episode ended by reaching the goal: mark the action that got
            # there with 0, the upper end of the initial Q-value range.
            q_table[q_index] = 0

        discrete_state = new_discrete_state

    # Linear epsilon decay; the window is inclusive at both ends.
    if START_EPSILON_DECAYING <= episode <= END_EPISION_DECAYING:
        epsilon -= epsilon_decay_value
    ep_rewards.append(episode_reward)

    if is_report_episode:
        # Statistics over (at most) the last SHOW_EVERY episodes.
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        lowest_reward = min(recent_rewards)
        highest_reward = max(recent_rewards)
        aggr_ep_reward["ep"].append(episode)
        aggr_ep_reward["avg"].append(average_reward)
        aggr_ep_reward["min"].append(lowest_reward)
        aggr_ep_reward["max"].append(highest_reward)

        print(f"Episode:{episode} avg: {average_reward} min: {lowest_reward} max: {highest_reward} state: {state_prime}")

# Close the environment once, after training. It used to be closed at the end
# of every episode, while the environment was still needed by the next one.
env.close()

fig, ax = plt.subplots()
for stat in ("avg", "min", "max"):
    ax.plot(aggr_ep_reward["ep"], aggr_ep_reward[stat], label=stat)
ax.set_xlabel("episode")
ax.set_ylabel("episode reward")
ax.set_title("MountainCar-v0 Q-learning rewards")
ax.legend(loc=4)
plt.show()

0
Episode:0 avg: -200.0 min: -200.0 max: -200.0 state: [-0.56780628 -0.01296347]


## CARTPOLE 

In [4]:
import numpy as np
import time, gym
import matplotlib.pyplot as plt
import tqdm 
import torch 
import torch.nn as nn


In [5]:
# Create the CartPole environment and print the sizes a model has to match.
env = gym.make("CartPole-v0")
# Seed with an explicit value: a bare env.seed() lets gym choose a random
# seed, so the call did nothing for repeatability.
CARTPOLE_SEED = 0
env.seed(CARTPOLE_SEED)
print(env.observation_space)
print(env.action_space.n)

class create_cartpole(nn.Module):
    """Feature layer for the CartPole agent: one linear layer followed by ReLU.

    Args:
        in_features: size of the observation vector. Defaults to 4, matching
            the ``(4,)`` observation space printed for CartPole-v0.
        hidden_features: width of the linear layer. Defaults to 32, the value
            the layer was originally given.

    NOTE(review): there is no output head mapping the features to the
    2 actions yet — presumably still to be added; confirm intended design.
    """

    def __init__(self, in_features=4, hidden_features=32):
        super(create_cartpole, self).__init__()
        # nn.Linear needs both input and output sizes; passing only 32
        # raised a TypeError as soon as the class was instantiated.
        model = [nn.Linear(in_features, hidden_features),
                 nn.ReLU()]
        self.a = nn.Sequential(*model)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the resulting tensor."""
        # Call the module on the input instead of returning the module itself.
        return self.a(x)
    


Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
2


In [1]:
import gym
# Smoke test for Pendulum: print the space bounds, then step the environment
# with a constant action and print what comes back.
env = gym.make("Pendulum-v0")
print(env.observation_space.high)
print(env.observation_space.low)
print(env.action_space)

MAX_STEPS = 1   # the cell originally forced a single step; raise to run longer
action = (1,)   # constant one-element action applied on every step

# Reset once, before stepping: resetting inside the loop would throw away the
# state reached by the previous step on every iteration.
env.reset()
done = False
number = 0
try:
    # Stop on the environment's own `done` flag or after MAX_STEPS steps,
    # instead of overwriting `done` by hand.
    while not done and number < MAX_STEPS:
        number += 1
        state_prime, reward, done, _ = env.step(action)
        env.render()
        print(state_prime, reward, number)
finally:
    # Release the render window even if a step fails.
    env.close()

[1. 1. 8.]
[-1. -1. -8.]
Box(-2.0, 2.0, (1,), float32)
[-0.50689903 -0.86200544 -0.64531948] -4.288245488572728 1
