In [3]:
!pip3 install gym

Collecting gym
  Downloading gym-0.18.0.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 1.0 MB/s eta 0:00:01
Collecting cloudpickle<1.7.0,>=1.2.0
  Downloading cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 3.4 MB/s eta 0:00:01
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.18.0-py3-none-any.whl size=1656446 sha256=5ccdacc69affea81fbf11062c7418055347ac87bb40c30f10c1e4b2dc64aa8b3
  Stored in directory: /home/rohit/.cache/pip/wheels/d8/e7/68/a3f0f1b5831c9321d7523f6fd4e0d3f83f2705a1cbd5daaa79
Successfully built gym
Installing collected packages: cloudpickle, pyglet, gym
Successfully installed cloudpickle-1.6.0 gym-0.18.0 pyglet-1.5.0


In [4]:
import gym

# Smoke test: drive CartPole-v0 with uniformly random actions to confirm that
# gym and its renderer work in this environment.
env = gym.make('CartPole-v0')
try:
    env.reset()
    for _step in range(1000):
        env.render()
        # gym 0.18 returns (observation, reward, done, info) from step().
        _obs, _reward, done, _info = env.step(env.action_space.sample())  # take a random action
        if done:
            # Stepping an episode that has already ended is not meaningful; start a new one.
            env.reset()
finally:
    # Release the render window even if a step raises.
    env.close()



In [None]:
import gym
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, Adam
# from keras.optimizers import RMSprop
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from statistics import mean
import h5py

# Hyperparameters read by the Network agent defined below.
LEARNING_RATE = 0.001       # step size handed to the Adam optimizer
MAX_MEMORY = 10 ** 6        # capacity of the replay-memory deque
BATCH_SIZE = 20             # transitions sampled on each experience_replay() call
GAMMA = 0.95                # discount applied to the best next-state Q-value
EXPLORATION_DECAY = 0.995   # factor the exploration rate is multiplied by after each replay
EXPLORATION_MIN = 0.01      # lower bound for the exploration rate


class ScoreEvaluator:
    """Collects per-episode scores in a bounded buffer and plots them.

    Parameters
    ----------
    max_len : int
        Maximum number of ``[episode, score]`` pairs kept; the underlying
        deque drops the oldest entry once this is exceeded.
    average_of_last_runs : int or None
        How many of the most recent runs are averaged for the dashed line.
        ``None`` averages every stored run.
    model : optional
        Object exposing ``summary()`` (a Keras model in this notebook). Its
        summary is shown before plotting; ``None`` prints a placeholder.
    """

    def __init__(self, max_len, average_of_last_runs, model = None):
        self.max_len = max_len
        self.score_table = deque(maxlen=self.max_len)
        self.model = model
        self.average_of_last_runs = average_of_last_runs

    def store_score(self, episode, step):
        """Record that run number ``episode`` ended with a score of ``step``."""
        self.score_table.append([episode, step])

    def plot_evaluation(self, title = "Training"):
        """Show the model summary, then plot score per run and a trailing average.

        Parameters
        ----------
        title : str
            Suffix appended to ``"CartPole-v1 "`` for the figure title.
        """
        if self.model is not None:
            # Keras' summary() prints by itself and returns None, so wrapping it
            # in print() would emit a stray "None" line.
            self.model.summary()
        else:
            print("Model not defined!")

        if not self.score_table:
            # Nothing was stored; averaging or plotting an empty series is meaningless.
            print("No scores recorded; nothing to plot.")
            return

        x = [entry[0] for entry in self.score_table]
        y = [entry[1] for entry in self.score_table]

        average_range = self.average_of_last_runs if self.average_of_last_runs is not None else len(x)
        recent_runs = x[-average_range:]
        recent_scores = y[-average_range:]

        plt.plot(x, y, label="score per run")
        # Horizontal dashed line at the mean of the most recent runs.
        plt.plot(recent_runs, [np.mean(recent_scores)] * len(recent_scores), linestyle="--",
                 label="last " + str(average_range) + " runs average")
        plt.title("CartPole-v1 " + str(title))
        plt.xlabel("Runs")
        plt.ylabel("Score")
        # Without a legend the two labels above are never displayed.
        plt.legend()
        plt.show()

class Network:
    """Epsilon-greedy Q-learning agent backed by a small dense Keras model.

    Parameters
    ----------
    observation_space : int
        Length of one observation vector; used as the model's input size.
    action_space : int
        Number of discrete actions; used as the model's output size.

    Attributes
    ----------
    memory : collections.deque
        Replay buffer of ``(state, action, reward, next_state, done)`` tuples,
        bounded by ``MAX_MEMORY``.
    exploration_rate : float
        Probability of picking a random action; starts at 1.0 and is decayed
        by ``experience_replay``.
    """

    def __init__(self, observation_space, action_space):

        self.action_space = action_space
        self.memory = deque(maxlen=MAX_MEMORY)
        self.exploration_rate = 1.0

        self.model = Sequential()
        self.model.add(Dense(32, input_shape=(observation_space,), activation='relu'))
        self.model.add(Dense(32, activation='relu'))
        self.model.add(Dense(self.action_space, activation='linear'))
        # `lr` is a deprecated alias in Keras; `learning_rate` is the supported keyword.
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=LEARNING_RATE))

    def add_to_memory(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def take_action(self, state):
        """Return an action index: random with probability ``exploration_rate``, else greedy.

        ``state`` is passed straight to ``model.predict``; callers in this file
        reshape it to ``(1, observation_space)`` first.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(0, self.action_space)
        # verbose=0 keeps newer Keras versions from printing a progress bar per call.
        q_values = self.model.predict(state, verbose=0)
        # Plain int so both branches return the same type.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Fit the model on ``BATCH_SIZE`` random transitions, then decay exploration.

        Does nothing until the buffer holds at least ``BATCH_SIZE`` transitions.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        minibatch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, done in minibatch:
            # Terminal transitions use the raw reward; otherwise bootstrap from
            # the best predicted value of the next state.
            target = reward
            if not done:
                target = reward + GAMMA * np.amax(self.model.predict(state_next, verbose=0)[0])
            # Only the taken action's value is replaced; other outputs keep their
            # current prediction as the fit target.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)

        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate * EXPLORATION_DECAY)

    def get_model(self):
        """Return the underlying Keras model."""
        return self.model


class TrainSolver:
    """Trains a ``Network`` agent on CartPole-v1 and replays it with rendering.

    Parameters
    ----------
    max_episodes : int
        Number of training episodes ``train()`` runs.

    Notes
    -----
    ``play()`` without ``load_model`` drives the in-memory agent through
    ``Network.take_action`` and keeps learning from the played episodes, as the
    original implementation did. ``play(load_model=True, ...)`` acts greedily
    with the supplied model and does not train.
    """

    def __init__(self, max_episodes):
        self.max_episodes = max_episodes
        # Not used by this class; kept so existing attribute access keeps working.
        self.score_table = deque(maxlen=400)
        self.average_of_last_runs = None
        self.model = None
        self.play_episodes = 100
        # The environment is only needed here to size the network, so release it right away.
        env = gym.make('CartPole-v1')
        try:
            observation_space = env.observation_space.shape[0]
            action_space = env.action_space.n
        finally:
            env.close()
        self.solver = Network(observation_space, action_space)

    def _run_episode(self, env, observation_space, choose_action, learn, render=False):
        """Play one episode and return the number of steps it lasted.

        ``choose_action`` maps a ``(1, observation_space)`` state to an action.
        When ``learn`` is true every transition is stored in the agent's memory
        and a replay step runs after each non-terminal step.
        """
        state = np.reshape(env.reset(), [1, observation_space])
        step = 0
        while True:
            step += 1
            if render:
                env.render()
            action = choose_action(state)
            state_next, reward, done, info = env.step(action)
            if done:
                # The transition that ends the episode gets its reward negated.
                reward = -reward
            state_next = np.reshape(state_next, [1, observation_space])
            if learn:
                self.solver.add_to_memory(state, action, reward, state_next, done)
            state = state_next
            if done:
                return step
            if learn:
                self.solver.experience_replay()

    def train(self):
        """Run ``max_episodes`` training episodes, then plot the score history."""
        env = gym.make('CartPole-v1')
        observation_space = env.observation_space.shape[0]

        print("---------------------------------")
        print("Solver starts")
        print("---------------------------------")

        self.model = self.solver.get_model()
        episode = 0
        score_eval = ScoreEvaluator(400, 50, self.model)
        try:
            while episode < self.max_episodes:
                episode += 1
                step = self._run_episode(env, observation_space, self.solver.take_action, learn=True)
                print("Run: " + str(episode) + ", exploration: " + str(self.solver.exploration_rate) + ", score: " + str(step))
                score_eval.store_score(episode, step)
        finally:
            env.close()
        score_eval.plot_evaluation("Training")

    def return_trained_model(self):
        """Return the model set by ``train()`` (``None`` before training)."""
        return self.model

    def play(self, play_episodes=100, load_model=False, model_weights_dir=None, trained_model=None):
        """Render ``play_episodes`` episodes and plot their scores.

        Parameters
        ----------
        play_episodes : int
            Number of episodes to play.
        load_model : bool
            When not ``False``, load weights from ``model_weights_dir`` into
            ``trained_model`` and act greedily with that model.
        model_weights_dir : str or None
            Path passed to ``trained_model.load_weights``.
        trained_model : optional
            Model with the matching architecture that receives the weights.

        Raises
        ------
        ValueError
            If ``load_model`` is requested without ``model_weights_dir`` or
            without ``trained_model``.
        """
        self.play_episodes = play_episodes
        if load_model is not False:
            # Fail loudly instead of printing and then crashing on an unbound name.
            if model_weights_dir is None:
                raise ValueError("Can't load specified model")
            if trained_model is None:
                raise ValueError("Please pass a valid model as a parameter")
            model = trained_model
            # Keras models load weights with load_weights(); they have no load() method.
            model.load_weights(model_weights_dir)

            def choose_action(state):
                # Greedy policy driven by the loaded model.
                return int(np.argmax(model.predict(state, verbose=0)[0]))

            learn = False
        else:
            # Fall back to the agent's model when train() has not been called yet.
            model = self.model if self.model is not None else self.solver.get_model()
            choose_action = self.solver.take_action
            learn = True

        env = gym.make('CartPole-v1')
        observation_space = env.observation_space.shape[0]
        episode = 0
        score_eval = ScoreEvaluator(400, 100, model)
        try:
            while episode < self.play_episodes:
                episode += 1
                step = self._run_episode(env, observation_space, choose_action, learn=learn, render=True)
                print("Run: " + str(episode) + ", score: " + str(
                    step))
                score_eval.store_score(episode, step)
        finally:
            # Closes the render window opened by env.render().
            env.close()
        score_eval.plot_evaluation("100 Plays")

    def save_model(self):
        """Save the model to ``cartpole_model.h5`` in the working directory."""
        # self.model is only set by train(); fall back to the agent's model otherwise.
        model = self.model if self.model is not None else self.solver.get_model()
        model.save('cartpole_model.h5')


if __name__ == "__main__":
    # End-to-end run: train the agent, watch it play with rendering, then save the model.
    trainer = TrainSolver(max_episodes=150)
    trainer.train()
    trainer.play(play_episodes=100)
    trainer.save_model()

---------------------------------
Solver starts
---------------------------------
Run: 1, exploration: 0.990025, score: 22
Run: 2, exploration: 0.9322301194154049, score: 13
Run: 3, exploration: 0.8778091417340573, score: 13
Run: 4, exploration: 0.7744209942832988, score: 26
Run: 5, exploration: 0.7292124703704616, score: 13
Run: 6, exploration: 0.6797938283326578, score: 15
Run: 7, exploration: 0.6433260027715241, score: 12
Run: 8, exploration: 0.5937455908197752, score: 17
Run: 9, exploration: 0.5732736268885887, score: 8
Run: 10, exploration: 0.5425201222922789, score: 12
Run: 11, exploration: 0.5134164023722473, score: 12
Run: 12, exploration: 0.4858739637363176, score: 12
Run: 13, exploration: 0.4484282034609769, score: 17
Run: 14, exploration: 0.4222502236424958, score: 13
Run: 15, exploration: 0.4036245882390106, score: 10
Run: 16, exploration: 0.3877593341372176, score: 9
Run: 17, exploration: 0.37251769488706843, score: 9
Run: 18, exploration: 0.3472722151889232, score: 15
Run