In [1]:
import random
import gym
import math
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [30]:
class CartPole():
    def __init__(self, episodes = 1000, monitor = False, epsilon = .1, goal_ticks = 500, passing_pretrain = 150):
        self.env = gym.make('CartPole-v0')
        if monitor: 
            self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True)
        self.epsilon = epsilon
        self.episodes = episodes
        self.monitor = monitor
        self.memory = []
        self.goal_ticks = goal_ticks
        self.passing_pretrain = passing_pretrain
        
        
        self.model = None
        
        
    def model_create(self, input_size):
        
        
        self.model = Sequential()
        self.model.add(Dense(32, input_shape = input_size,  activation='relu'))
        self.model.add(Dense(64, activation = 'relu'))
        self.model.add(Dense(128, activation = 'relu'))
        self.model.add(Dense(256, activation = 'relu'))
        self.model.add(Dense(512, activation = 'relu'))

        self.model.add(Dense(2, activation = 'softmax'))
        self.model.compile(loss='mse', optimizer=Adam())
        
        
    
    def choose_action(self, state, initial = True):
        if self.model is None:
            return ((np.sign((state.sum())) + 1) // 2).astype('int')
        else:
            return np.argmax(self.model.predict(np.array(state)[np.newaxis, ...]))

    
    def preprocess_state(self, state):
        return np.reshape(np.array(state), [1, 4])
    
    
    
    def train(self):
        X_train = [i[0] for i in self.memory]
        y_train = [i[1] for i in self.memory]
        
        if self.model is None:
            self.model_create([len(X_train[0])])
        
        X_train = np.vstack(X_train)
        self.model.fit(X_train, np.array(y_train), epochs = 3, verbose=100)
        self.passing_pretrain = int(self.passing_pretrain + 10)
        self.memory.clear()
        print('new_epoch')
        
            
        
        
    
    def play(self):
        rewards = []
        for episode in range(self.episodes):
            state = self.preprocess_state(self.env.reset())     
            state = state[0]
            scores = []
            over = False
            ticks = 0
            
            game_memory = []
            game_reward = 0
            
            while not over:
                if self.monitor:
                    self.env.render()
                action = self.choose_action(state, )
                next_state, reward, over, _ = self.env.step(action)
                if action == 0:
                    action = [1, 0]
                elif action == 1:
                    action = [0, 1]
                game_memory.append([state, action])
                state = next_state
                ticks += 1
                game_reward += reward
                
            if game_reward >= self.passing_pretrain:
#                 print(game_reward)
                self.memory.extend(game_memory)
            rewards.append(game_reward)
                
            if len(self.memory) >= 500:
#                 print(self.memory)
#                 raise Error
                print(np.mean(rewards))
                if np.mean(rewards) >= 180:
                    self.monitor = True
                rewards.clear()
                self.train()
            
            
            scores.append(ticks)            
            if np.mean(ticks) >= 500:
                print('Hooray')
                
            else: 
                pass
        self.env.close()
                


In [None]:
cp = CartPole(monitor=False)
cp.play()

141.22222222222223
Epoch 1/3
Epoch 2/3
Epoch 3/3
new_epoch
134.6818181818182
Epoch 1/3
Epoch 2/3
Epoch 3/3
new_epoch
130.33333333333334
Epoch 1/3
Epoch 2/3
Epoch 3/3
new_epoch
143.83333333333334
Epoch 1/3
Epoch 2/3
Epoch 3/3
new_epoch
192.75
Epoch 1/3
Epoch 2/3
Epoch 3/3
new_epoch
199.5
Epoch 1/3
Epoch 2/3
Epoch 3/3
new_epoch


In [88]:
X_train = [i[0] for i in cp.memory]
y_train = [i[1] for i in cp.memory]

In [78]:
y_train

[[0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 

In [45]:
for item in X_train:
    print(len(item))

1
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
1
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4


In [49]:
X_train[0]

array([[ 0.0096328 ,  0.04360004, -0.01462325, -0.01136133]])

In [89]:
model = Sequential()
model.add(Dense(32, input_shape = [len(X_train[0])],  activation='relu'))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(2, activation = 'linear'))
model.compile(loss='mse', optimizer=Adam())

In [90]:
X_train = np.vstack(X_train)

model.fit(X_train, np.array(y_train) )

Epoch 1/1


<keras.callbacks.callbacks.History at 0x1f637d4bc88>

In [84]:
y_train

[[0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 [0, 1],
 [0, 1],
 [1, 0],
 [0, 1],
 [1, 0],
 

In [99]:
model.predict(np.array([-0.04336696,  0.01654524, -0.03846717, -0.03514139])[np.newaxis, ...])

array([[0.0677907 , 0.05918679]], dtype=float32)