In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [2]:
import gym
# Smoke test: drive CartPole-v1 with uniformly random actions for 1000 steps,
# rendering every frame and resetting whenever an episode reports `done`.
env = gym.make("CartPole-v1")
try:
    observation = env.reset()
    for _ in range(1000):
        env.render()
        action = env.action_space.sample()  # your agent here (this takes random actions)
        observation, reward, done, info = env.step(action)

        if done:
            observation = env.reset()
finally:
    # Close the environment even when the cell raises or is interrupted.
    env.close()

In [5]:
# Create a CartPole-v1 environment under the notebook-level name `env`.
env = gym.make("CartPole-v1")

# Hill Climb Test

In [3]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers

In [9]:
def run_episode(env, parameters, max_steps=200, render=True):
    """Run one episode with a linear threshold policy and return its total reward.

    At every step the action is 0 when ``np.matmul(parameters, observation)``
    is negative and 1 otherwise.

    Args:
        env: Environment object providing ``reset()``, ``render()`` and
            ``step(action)``, where ``step`` returns
            ``(observation, reward, done, info)``.
        parameters: Weight vector multiplied with each observation.
        max_steps: Maximum number of steps before the episode is cut off.
            Defaults to 200, the cap that used to be hard-coded.
        render: Whether to call ``env.render()`` before every step. Defaults
            to True (the previous behaviour); pass False to skip drawing.

    Returns:
        The sum of the rewards collected during the episode.
    """
    observation = env.reset()
    totalreward = 0
    for _ in range(max_steps):
        if render:
            env.render()
        # Linear policy: the sign of the weighted observation picks the action.
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        observation, reward, done, info = env.step(action)
        totalreward += reward
        if done:
            break

    return totalreward

In [10]:
def train(submit):
    """Search for CartPole-v0 policy weights with random hill climbing.

    Starting from random weights in [-1, 1], each episode evaluates a randomly
    perturbed copy of the current weights with ``run_episode``. The perturbed
    weights replace the current ones only when they beat the best reward seen
    so far. The search stops after 2000 episodes or as soon as an episode
    scores exactly 200.

    Args:
        submit: When truthy, wrap the environment in ``wrappers.Monitor``
            writing to ``/tmp/CartPole-v0-hill-climbing``.

    Returns:
        The number of episodes that were run.
    """
    env = gym.make('CartPole-v0')
    if submit:
        # Same arguments as the former positional call `..., None, True`.
        env = wrappers.Monitor(
            env, '/tmp/CartPole-v0-hill-climbing', video_callable=None, force=True
        )

    noise_scaling = 0.1
    parameters = np.random.rand(4) * 2 - 1 # random weights between [-1, 1]
    bestreward = 0
    counter = 0

    try:
        for episode in range(2000):
            counter += 1
            # Candidate = current weights plus uniform noise in [-noise_scaling, noise_scaling].
            newparams = parameters + (np.random.rand(4) * 2 - 1) * noise_scaling
            print(episode)
            reward = run_episode(env, newparams)
            if reward > bestreward:
                bestreward = reward
                parameters = newparams
                if reward == 200:
                    print('Yay')
                    break
    finally:
        # Close the environment (and the Monitor wrapper, if any) even when the
        # search is interrupted or raises.
        env.close()
    return counter
        
    

In [11]:
# Run the hill-climbing search with submit=True, i.e. with the Monitor wrapper enabled.
train(True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157


KeyboardInterrupt: 

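Because this is random hill climbing, it is not surprising that it performs poorly: the parameters are only nudged by random noise, and a nudge is kept only when it beats the best reward seen so far.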

# Q-Learning

I follow this: https://dev.to/n1try/cartpole-with-q-learning---first-experiences-with-openai-gym

Q-learning builds a Q-table indexed by discrete (state, action) pairs. Since the observation_space is a 4-tuple of floats, we will need to discretize it. But how many states should we discretize it into? 

Goal: Stay alive for 200 time steps

Well, we take out x and x' because the cart probably won't leave the screen in 200 time steps.

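Now we are only left with theta (angle) and theta' (angular velocity) to worry about. Theta is in [-0.42, 0.42], while theta' is in [-3.4*10<sup>38</sup>, 3.4*10<sup>38</sup>], i.e. effectively unbounded, so the code below clips it to ±`math.radians(50)` before bucketing.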

Q-learning uses one function to fetch the best action from the q-table and another function to update the q-table based on the last action. Rewards are 1 for every time step alive.

The hyperparameters alpha (learning rate), epsilon (exploration rate) and gamma (discount factor) are interesting to choose.

In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from gym import ObservationWrapper
from gym import spaces
import math

Helper code to discretize observation space:

Copied from:
https://github.com/ngc92/space-wrappers/blob/master/space_wrappers/observation_wrappers.py

In [3]:
from space_wrappers import observation_wrappers as ow

Q-learning algorithm following pseudocode from: https://towardsdatascience.com/introduction-to-various-reinforcement-learning-algorithms-i-q-learning-sarsa-dqn-ddpg-72a5e0cb6287
and mainly this dude's: https://dev.to/n1try/cartpole-with-q-learning---first-experiences-with-openai-gym  

Here's his github: https://gist.github.com/n1try/af0b8476ae4106ec098fea1dfe57f578 <br>
Here's the reasoning he followed: https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947

In [3]:
def Qlearning():
    """Train a tabular Q-learning agent on CartPole-v0, printing each episode's score.

    Observations are mapped to a small grid of buckets (1, 1, 6, 12), so the
    first two observation components are collapsed to a single bucket each.
    Actions are chosen epsilon-greedily, and both epsilon and the learning rate
    alpha follow the same logarithmic decay, floored at 0.1. Every 100th
    episode is rendered. Runs for 1000 episodes and returns nothing.
    """
    discount = 1.0  # You don't want to discount since your goal is to survive as long as possible
    num_episodes = 1000
    buckets = (1, 1, 6, 12,)
    num_actions = 2

    env = gym.make('CartPole-v0')

    # The bounds never change, so compute them once instead of on every call
    # to discretize(). Components 1 and 3 use hand-picked limits rather than
    # the ones reported by the observation space.
    upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]

    def discretize(obs):
        """Map a continuous observation to a tuple of bucket indices."""
        ratios = [(obs[i] + abs(lower_bounds[i])) / (upper_bounds[i] - lower_bounds[i]) for i in range(len(obs))]
        new_obs = [int(round((buckets[i] - 1) * ratios[i])) for i in range(len(obs))]
        # Clamp so out-of-range observations fall into the edge buckets.
        new_obs = [min(buckets[i] - 1, max(0, new_obs[i])) for i in range(len(obs))]
        return tuple(new_obs)

    def decayed_rate(ep):
        """Schedule shared by epsilon and alpha: 1.0 early on, decaying to 0.1."""
        return max(0.1, min(1.0, 1.0 - math.log10((ep + 1) / 25)))

    # Initialize a Q-table with one entry per (bucketed state, action).
    qtable = np.zeros(buckets + (num_actions,))

    try:
        # Loop for every episode
        for ep in range(num_episodes):
            epsilon = decayed_rate(ep)
            alpha = decayed_rate(ep)

            state = discretize(env.reset())
            done = False
            score = 0

            # Loop for each step of episode
            while not done:
                if ep % 100 == 0:
                    env.render()
                # Select action using epsilon-greedy policy: either follow the
                # greedy policy or pick a random action.
                action = np.random.choice([np.argmax(qtable[state]), env.action_space.sample()], 1, p=[1-epsilon, epsilon])[0]

                # Do the new action
                observation, reward, done, info = env.step(action)
                new_state = discretize(observation)

                # Update Q Table
                # NOTE(review): the target bootstraps from new_state even when
                # `done` is True; confirm whether terminal transitions should
                # drop the bootstrap term.
                qtable[state][action] = qtable[state][action] + alpha * (reward + discount * np.max(qtable[new_state]) - qtable[state][action])

                score += reward
                state = new_state
            print("Episode {}, Score: {}".format(ep, score))
    finally:
        # Close the environment even when training is interrupted or raises.
        env.close()

In [5]:
# Run the tabular Q-learning loop; it prints one score line per episode.
Qlearning()

Episode 0, Score: 19.0
Episode 1, Score: 9.0
Episode 2, Score: 10.0
Episode 3, Score: 16.0
Episode 4, Score: 19.0
Episode 5, Score: 10.0
Episode 6, Score: 21.0
Episode 7, Score: 23.0
Episode 8, Score: 36.0
Episode 9, Score: 20.0
Episode 10, Score: 16.0
Episode 11, Score: 23.0
Episode 12, Score: 15.0
Episode 13, Score: 8.0
Episode 14, Score: 21.0
Episode 15, Score: 12.0
Episode 16, Score: 12.0
Episode 17, Score: 14.0
Episode 18, Score: 12.0
Episode 19, Score: 23.0
Episode 20, Score: 11.0
Episode 21, Score: 14.0
Episode 22, Score: 17.0
Episode 23, Score: 42.0
Episode 24, Score: 26.0
Episode 25, Score: 18.0
Episode 26, Score: 19.0
Episode 27, Score: 31.0
Episode 28, Score: 20.0
Episode 29, Score: 11.0
Episode 30, Score: 61.0
Episode 31, Score: 19.0
Episode 32, Score: 22.0
Episode 33, Score: 13.0
Episode 34, Score: 23.0
Episode 35, Score: 18.0
Episode 36, Score: 22.0
Episode 37, Score: 16.0
Episode 38, Score: 22.0
Episode 39, Score: 10.0
Episode 40, Score: 25.0
Episode 41, Score: 22.0
Epis

KeyboardInterrupt: 

In [None]:
# TODO: don't forget to add plots of the training statistics (e.g. score per episode) and such