In [27]:
import numpy as np
from PIL import ImageGrab
from PIL import Image
import cv2
import io
import time
%matplotlib inline 
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (15, 9)
import seaborn as sns

In [28]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Relative path of the local HTML copy of the game; Game.__init__ resolves it
# with os.path.abspath before handing it to the browser.
game_url = "game/dino.html"
# Path of the ChromeDriver executable passed to webdriver.Chrome
# (relative to the notebook's working directory).
chrome_driver_path = "../chromedriver.exe"
class Game:
    """Thin Selenium wrapper around the dino game page.

    Every method forwards to the browser through ``self._driver``: game state is
    read by evaluating JavaScript on ``Runner.instance_`` and key presses are
    sent to the page's ``<body>`` element.
    """
    def __init__(self,custom_config=True):
        """Launch Chrome, load the game page and optionally tweak its config.

        Args:
            custom_config: when True, ``Runner.config.ACCELERATION`` is set to 0
                in the page right after it has loaded.
        """
        chrome_options = Options()
        chrome_options.add_argument("disable-infobars")
        self._driver = webdriver.Chrome(executable_path = chrome_driver_path,chrome_options=chrome_options)
        self._driver.set_window_position(x=-10,y=0)
        self._driver.set_window_size(200, 300)
        self._driver.get(os.path.abspath(game_url))
        if custom_config:
            self._driver.execute_script("Runner.config.ACCELERATION=0")
    def get_crashed(self):
        """Return the page's ``Runner.instance_.crashed`` value."""
        return self._driver.execute_script("return Runner.instance_.crashed")
    def get_playing(self):
        """Return the page's ``Runner.instance_.playing`` value."""
        return self._driver.execute_script("return Runner.instance_.playing")
    def restart(self):
        """Call ``Runner.instance_.restart()`` in the page."""
        return self._driver.execute_script("Runner.instance_.restart()")
    def press_up(self):
        """Send an ARROW_UP key press to the page body."""
        self._driver.find_element_by_tag_name("body").send_keys(Keys.ARROW_UP)
    def press_down(self):
        """Send an ARROW_DOWN key press to the page body."""
        self._driver.find_element_by_tag_name("body").send_keys(Keys.ARROW_DOWN)
    def get_score(self):
        """Return the current score as an int.

        The page exposes the score as a list of digit characters; they are
        concatenated and parsed. An empty or missing list yields 0 instead of
        raising ``ValueError`` from ``int('')``.
        """
        score_array = self._driver.execute_script("return Runner.instance_.distanceMeter.digits")
        # str() keeps the join working even if the driver hands back numbers.
        score = ''.join(str(digit) for digit in (score_array or []))
        return int(score) if score else 0
    def pause(self):
        """Call ``Runner.instance_.stop()`` in the page and return its result."""
        return self._driver.execute_script("return Runner.instance_.stop()")
    def resume(self):
        """Call ``Runner.instance_.play()`` in the page and return its result."""
        return self._driver.execute_script("return Runner.instance_.play()")
    def end(self):
        """Shut the browser session down.

        ``quit()`` is used rather than ``close()``: ``close()`` only closes the
        current window, whereas ``quit()`` also ends the WebDriver session so no
        driver process is left behind.
        """
        self._driver.quit()

In [29]:
class DinoAgent:
    """Player-side facade: turns agent intents into calls on a game object."""

    def __init__(self, game):
        """Remember the game handle and immediately issue one jump."""
        self._game = game
        self.jump()

    def is_running(self):
        """Report whatever the game returns from ``get_playing()``."""
        playing = self._game.get_playing()
        return playing

    def is_crashed(self):
        """Report whatever the game returns from ``get_crashed()``."""
        crashed = self._game.get_crashed()
        return crashed

    def jump(self):
        """Ask the game to press the up key."""
        self._game.press_up()

    def duck(self):
        """Ask the game to press the down key."""
        self._game.press_down()

In [30]:
def process_img(image):
    """Shrink, crop and edge-detect a captured frame.

    The frame is scaled to 15% of its width and 10% of its height, cut down to
    the slice ``[2:38, 10:50]`` (rows, then columns) and passed through Canny
    edge detection with both thresholds at 200. Per the original author's note,
    the game canvas is already grey scale and Canny is used to keep only edges
    and suppress unwanted objects such as clouds.
    """
    shrunk = cv2.resize(image, (0,0), fx = 0.15, fy = 0.10)
    # numpy slicing: first range selects rows (y), second selects columns (x)
    region = shrunk[2:38,10:50]
    edges = cv2.Canny(region, threshold1 = 200, threshold2 = 200)
    return edges

In [31]:
def grab_screen(_driver = None):
    """Capture the current game frame.

    Args:
        _driver: optional WebDriver. When given, its PNG screenshot is returned
            unchanged and the desktop is not captured at all (previously the
            desktop grab was done and then thrown away).

    Returns:
        With no driver: the desktop region ``bbox=(0,180,400,400)`` run through
        ``process_img``. With a driver: the value of
        ``_driver.get_screenshot_as_png()``.
        NOTE(review): the two modes return different kinds of object (processed
        array vs. raw PNG data); callers passing a driver must decode it themselves.
    """
    if _driver is not None:
        return _driver.get_screenshot_as_png()
    screen = np.array(ImageGrab.grab(bbox=(0,180,400,400)))
    return process_img(screen)

In [32]:
# print(grab_screen().shape)
# game = Game()
# dino = DinoAgent(game)
# last_time = time.time()
# while(True):
    
# # #     print('loop took {} seconds'.format(time.time()-last_time))
# # #     last_time = time.time()
# # #     cv2.imwrite("./img_data/dino"+str(time())+".jpg",image)
# # #     dino.duck()
# #     #exit on q pres
# # #     print('{0} {1} '.format(r_t,end_t))
# # #     cv2.imshow('window',game.grab_screen())
    
#     image = grab_screen()
#     cv2.imshow('window',image)
#     print(image.shape[0])
# #     cv2.destroyAllWindows()

# # #     from matplotlib import pyplot as plt
# # #     plt.imshow(image)
# # #     plt.title('my picture')
# # #     plt.show()

# # #     grab_screen()
# #     if(dino.is_crashed()):
# #         #jumping starts the game again if dino has crashed
# # #         temp = (game.get_score())
# #         game.restart()
#     if (cv2.waitKey(25) & 0xFF == ord('q')):
#         cv2.destroyAllWindows()
#         game.end()
# #         cv2.imwrite('dino.jpg',image)
#         break

In [33]:
# Game / training hyper-parameters
ACTIONS = 2               # length of the action vector: index 0 = do nothing, index 1 = jump
GAMMA = 0.5               # discount applied to the next state's max Q-value (originally 0.99)
OBSERVATION = 1000.0      # timesteps to collect transitions before training starts
EXPLORE = 30000           # timesteps over which epsilon is annealed (was 300000.)
FINAL_EPSILON = 1e-4      # epsilon floor
INITIAL_EPSILON = 0.1     # epsilon at the start of training
REPLAY_MEMORY = 20000     # maximum number of stored transitions
BATCH = 32                # minibatch size sampled from replay memory
FRAME_PER_ACTION = 4      # an action is only chosen on timesteps divisible by this
LEARNING_RATE = 0.0001    # learning rate passed to the Adam optimizer

In [34]:
# Frame dimensions fed to the network. buildmodel() passes them as
# input_shape=(img_cols, img_rows, img_channels), i.e. (20, 40, 4).
img_rows = 40
img_cols = 20
# Number of frames stacked along the last axis of one network input.
img_channels = 4

In [35]:
def buildmodel():
    """Build and compile the convolutional Q-network.

    Architecture: three convolution + ReLU stages (32 filters 8x8 stride 4,
    64 filters 4x4 stride 2, 64 filters 3x3 stride 1, all 'same' padding),
    then Flatten, Dense(512) + ReLU and a linear Dense(ACTIONS) output.
    Compiled with mean-squared-error loss and Adam at LEARNING_RATE.

    The convolution layers use Keras 2 argument names (kernel size as a tuple,
    ``strides``, ``padding``) instead of the legacy Keras 1 spelling
    (``subsample``, ``border_mode``), which Keras 2 only accepts with a
    deprecation warning.

    Returns:
        The compiled Sequential model.
    """
    print("Now we build the model")
    model = Sequential()
    # input is (img_cols, img_rows, img_channels), i.e. 20*40*4 with the current settings
    model.add(Convolution2D(32, (8, 8), strides=(4, 4), padding='same',input_shape=(img_cols,img_rows,img_channels)))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2), padding='same'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, (3, 3), strides=(1, 1), padding='same'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    # one linear output per action (Q-value estimate)
    model.add(Dense(ACTIONS))
    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse',optimizer=adam)
    print("We finish building the model")
    return model

In [36]:
def write_log(callback, names, logs, batch_no):
    """Write one scalar summary per (name, value) pair through ``callback.writer``.

    Args:
        callback: object exposing a ``writer`` with ``add_summary`` and ``flush``.
        names: iterable of tags.
        logs: iterable of values, paired positionally with ``names``.
        batch_no: step value passed to ``add_summary`` for every entry.

    The writer is flushed after each individual entry.
    """
    for tag, scalar in zip(names, logs):
        record = tf.Summary()
        entry = record.value.add()
        entry.tag = tag
        entry.simple_value = scalar
        callback.writer.add_summary(record, batch_no)
        callback.writer.flush()

In [37]:
def trainNetwork(model,game_state,observe=False):
    """Run the epsilon-greedy Q-learning loop against the game.

    The loop never terminates on its own; it ends only when an exception
    (for example a keyboard interrupt) propagates out.

    Args:
        model: network object; this function calls ``predict``,
            ``train_on_batch``, ``load_weights``, ``compile``, ``save_weights``
            and ``to_json`` on it.
        game_state: object whose ``get_state(action_vector)`` returns
            ``(frame, reward, terminal)``.
        observe: if True, load "model.h5", fix epsilon at FINAL_EPSILON and
            never train. If False, resume from "model.h5" when that file
            exists (otherwise keep the model's current weights, so a first
            run no longer crashes) and start training once the timestep
            exceeds OBSERVATION.

    Side effects:
        Appends every training loss to the module-level ``loss_df`` and
        rewrites "model.h5" and "model.json" every 1000 timesteps.
    """
    last_time = time.time()
    # replay memory; maxlen drops the oldest transition once REPLAY_MEMORY is reached
    D = deque(maxlen=REPLAY_MEMORY)
    # get the first frame by doing nothing (index 0 of the action vector)
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] =1
    x_t, _, _ = game_state.get_state(do_nothing)

    # initial state: the first frame repeated four times along a new last axis
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # Keras expects a leading batch axis: 1 * height * width * 4
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    if observe :#args['mode'] == 'Run':
        OBSERVE = 999999999    #We keep observe, never train
        epsilon = FINAL_EPSILON
        print ("Now we load weight")
        model.load_weights("model.h5")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse',optimizer=adam)
        print ("Weight load successfully")
    else:                       #We go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON
        # resume from a previous run when a checkpoint is available
        if os.path.exists("model.h5"):
            model.load_weights("model.h5")
        else:
            print("No saved weights found, training from scratch")
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse',optimizer=adam)

    t = 0
    while (True):
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])
        #choose an action epsilon greedy; on the other timesteps a_t stays all
        #zeros, which get_state treats as "no jump"
        if t % FRAME_PER_ACTION == 0:
            if  random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                q = model.predict(s_t)       #input a stack of 4 images, get the prediction
                action_index = np.argmax(q)
                a_t[action_index] = 1
        #We reduced the epsilon gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE #update to original asap

        #run the selected action and observed next state and reward
        x_t1, r_t, terminal = game_state.get_state(a_t)
        print('loop took {} seconds'.format(time.time()-last_time))
        last_time = time.time()
        # newest frame goes first; the oldest of the four stacked frames is dropped
        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))

        #only train if done observing
        if t > OBSERVE:

            #sample a minibatch to train on
            minibatch = random.sample(D, BATCH)
            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))
            targets = np.zeros((inputs.shape[0], ACTIONS))

            #Now we do the experience replay
            for i, (state_t, action_t, reward_t, state_t1, was_terminal) in enumerate(minibatch):
                inputs[i:i + 1] = state_t

                # start from the current predictions so that only the taken
                # action's target differs from the network output
                targets[i] = model.predict(state_t)
                Q_sa = model.predict(state_t1)

                # if terminated, the target only equals the reward
                if was_terminal:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            loss += model.train_on_batch(inputs, targets)
            loss_df.loc[len(loss_df)] = loss
        else:
            # no training on this step: pause instead
            # (presumably to keep step duration close to a training step - TODO confirm)
            time.sleep(0.16)
        s_t = s_t1
        t = t + 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            print("Now we save model")
            model.save_weights("model.h5", overwrite=True)
            # NOTE(review): to_json() already returns a JSON string, so the file
            # holds a JSON-encoded string; readers must json.load it before use.
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)

        # print info
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

In [38]:
import pandas as pd
import numpy as np
import _thread as thread
# Module-level logs appended to while the agent runs. Each frame has a single
# column and starts with two placeholder rows of zeros (index 0 and 1).
loss_df, scores_df, actions_df = (
    pd.DataFrame(data=0, columns=[column], index=range(2))
    for column in ('loss', 'score', 'action')
)

In [39]:
def playGame(observe=False):
    """Open the game, build the agent and model, and run trainNetwork.

    Args:
        observe: forwarded to trainNetwork (True = load weights and only play,
            False = train).

    trainNetwork loops until an exception (e.g. a keyboard interrupt) escapes,
    so the browser is shut down in a ``finally`` block; otherwise the browser
    started by Game() would be left running after every stopped session.
    """
    game = Game()
    try:
        dino = DinoAgent(game)
        game_state = Game_sate(dino,game)
        model = buildmodel()
        trainNetwork(model,game_state,observe=observe)
    finally:
        game.end()
    

In [40]:
class Game_sate:
    """Environment adapter: applies an action and returns (frame, reward, done)."""
    def __init__(self,agent,game):
        """Store the agent and game and start the preview-window coroutine.

        Args:
            agent: object providing ``jump()`` and ``is_crashed()``.
            game: object providing ``get_score()`` and ``restart()``.
        """
        self._agent = agent
        self._game = game
        self._display = show_img()
        # advance the generator to its first yield so send() can be used
        next(self._display)
    def get_state(self,actions):
        """Apply ``actions`` and observe the result.

        Args:
            actions: action vector; the agent jumps when ``actions[1] == 1``.

        Returns:
            ``(image, reward, is_over)``: the frame from ``grab_screen()``, a
            reward of 0.1 (or -5 when the agent has crashed) and whether the
            agent crashed on this step. On a crash the score is appended to
            ``scores_df`` and the game is restarted.

        Also appends ``actions[1]`` to ``actions_df`` on every call.
        """
        actions_df.loc[len(actions_df)] = actions[1]
        reward = 0.1
        is_over = False
        if actions[1] == 1:
            self._agent.jump()

        image = grab_screen()
        self._display.send(image)

        if self._agent.is_crashed():
            # index by scores_df's own length; using len(loss_df) made scores
            # overwrite the same row whenever loss_df was not growing
            scores_df.loc[len(scores_df)] = self._game.get_score()
            self._game.restart()
            reward = -5
            is_over = True
        return image, reward, is_over

In [41]:
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD , Adam
import tensorflow as tf
from keras.callbacks import TensorBoard
from collections import deque
import random
import json

In [42]:
def show_img(graphs = False):
    """
    Coroutine that shows each image sent to it in an OpenCV window.

    Prime it with next() and then send() frames. Every frame is resized to
    800x400 and displayed in a resizable window titled "logs" when ``graphs``
    is true, otherwise "game_play". Pressing 'q' while the window has focus
    destroys all OpenCV windows and finishes the generator.
    """
    window_title = "logs" if graphs else "game_play"
    while True:
        screen = (yield)
        # WINDOW_NORMAL: create window with freedom of dimensions
        cv2.namedWindow(window_title, cv2.WINDOW_NORMAL)
        cv2.imshow(window_title, cv2.resize(screen, (800, 400)))
        pressed = cv2.waitKey(1) & 0xFF
        if pressed == ord('q'):
            cv2.destroyAllWindows()
            return

In [43]:
# Start a session in training mode (observe=False selects trainNetwork's training branch).
playGame(observe=False)

Now we build the model
We finish building the model


  after removing the cwd from sys.path.
  
  


loop took 0.528921365737915 seconds
TIMESTEP 1 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2015235424041748 seconds
TIMESTEP 2 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21889281272888184 seconds
TIMESTEP 3 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20416712760925293 seconds
TIMESTEP 4 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
loop took 0.23114848136901855 seconds
TIMESTEP 5 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20686578750610352 seconds
TIMESTEP 6 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2173473834991455 seconds
TIMESTEP 7 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2178940773010254 seconds
TIMESTEP 8 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 

TIMESTEP 67 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23516321182250977 seconds
TIMESTEP 68 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2497730255126953 seconds
TIMESTEP 69 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21782326698303223 seconds
TIMESTEP 70 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2191925048828125 seconds
TIMESTEP 71 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2182924747467041 seconds
TIMESTEP 72 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.24936485290527344 seconds
TIMESTEP 73 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22009873390197754 seconds
TIMESTEP 74 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20492315292358398 seconds
TIM

loop took 0.23319244384765625 seconds
TIMESTEP 133 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2042698860168457 seconds
TIMESTEP 134 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21798229217529297 seconds
TIMESTEP 135 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22484207153320312 seconds
TIMESTEP 136 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.1974947452545166 seconds
TIMESTEP 137 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21708965301513672 seconds
TIMESTEP 138 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2033395767211914 seconds
TIMESTEP 139 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22130298614501953 seconds
TIMESTEP 140 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss

loop took 0.21672415733337402 seconds
TIMESTEP 198 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20342445373535156 seconds
TIMESTEP 199 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23547053337097168 seconds
TIMESTEP 200 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2333829402923584 seconds
TIMESTEP 201 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23778319358825684 seconds
TIMESTEP 202 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21991586685180664 seconds
TIMESTEP 203 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23183274269104004 seconds
TIMESTEP 204 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23504114151000977 seconds
TIMESTEP 205 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / L

loop took 0.23856329917907715 seconds
TIMESTEP 263 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.19605755805969238 seconds
TIMESTEP 264 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20166635513305664 seconds
TIMESTEP 265 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21881842613220215 seconds
TIMESTEP 266 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21895360946655273 seconds
TIMESTEP 267 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22214269638061523 seconds
TIMESTEP 268 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
loop took 0.2475452423095703 seconds
TIMESTEP 269 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2354743480682373 seconds
TIMESTEP 270 / STATE observe / EPSILON 0.1 / ACTI

loop took 0.2318871021270752 seconds
TIMESTEP 328 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.2231900691986084 seconds
TIMESTEP 329 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2314753532409668 seconds
TIMESTEP 330 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.25142383575439453 seconds
TIMESTEP 331 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2513082027435303 seconds
TIMESTEP 332 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.26317548751831055 seconds
TIMESTEP 333 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23604106903076172 seconds
TIMESTEP 334 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2647411823272705 seconds
TIMESTEP 335 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  

TIMESTEP 393 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21916604042053223 seconds
TIMESTEP 394 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21849775314331055 seconds
TIMESTEP 395 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21849894523620605 seconds
TIMESTEP 396 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23361945152282715 seconds
TIMESTEP 397 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2177448272705078 seconds
TIMESTEP 398 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21950507164001465 seconds
TIMESTEP 399 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2034761905670166 seconds
TIMESTEP 400 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.24947309494018555 se

TIMESTEP 458 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2490699291229248 seconds
TIMESTEP 459 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21901726722717285 seconds
TIMESTEP 460 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.28332972526550293 seconds
TIMESTEP 461 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.23046875 seconds
TIMESTEP 462 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22621488571166992 seconds
TIMESTEP 463 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.19612503051757812 seconds
TIMESTEP 464 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2346489429473877 seconds
TIMESTEP 465 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20431876182556152 seconds
TIME

TIMESTEP 524 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2591571807861328 seconds
TIMESTEP 525 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.194624662399292 seconds
TIMESTEP 526 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20312190055847168 seconds
TIMESTEP 527 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20576786994934082 seconds
TIMESTEP 528 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2501084804534912 seconds
TIMESTEP 529 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21741890907287598 seconds
TIMESTEP 530 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21760940551757812 seconds
TIMESTEP 531 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2572808265686035 second

TIMESTEP 589 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23335051536560059 seconds
TIMESTEP 590 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2351834774017334 seconds
TIMESTEP 591 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23565101623535156 seconds
TIMESTEP 592 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23406338691711426 seconds
TIMESTEP 593 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2403697967529297 seconds
TIMESTEP 594 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.22789621353149414 seconds
TIMESTEP 595 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2169952392578125 seconds
TIMESTEP 596 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.26680779457092285 seco

loop took 0.20528268814086914 seconds
TIMESTEP 656 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2460646629333496 seconds
TIMESTEP 657 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21790790557861328 seconds
TIMESTEP 658 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22212696075439453 seconds
TIMESTEP 659 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21791362762451172 seconds
TIMESTEP 660 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2908642292022705 seconds
TIMESTEP 661 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2572183609008789 seconds
TIMESTEP 662 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.251117467880249 seconds
TIMESTEP 663 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss 

TIMESTEP 722 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23315954208374023 seconds
TIMESTEP 723 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23410797119140625 seconds
TIMESTEP 724 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.25027894973754883 seconds
TIMESTEP 725 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21018481254577637 seconds
TIMESTEP 726 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.21307802200317383 seconds
TIMESTEP 727 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21625256538391113 seconds
TIMESTEP 728 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.24996376037597656 seconds
TIMESTEP 729 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21904706954956055 s

TIMESTEP 788 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
loop took 0.24945974349975586 seconds
TIMESTEP 789 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.235396146774292 seconds
TIMESTEP 790 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.25898194313049316 seconds
TIMESTEP 791 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.20926976203918457 seconds
TIMESTEP 792 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21705293655395508 seconds
TIMESTEP 793 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21875619888305664 seconds
TIMESTEP 794 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2173776626586914 seconds
TIMESTEP 795 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0


TIMESTEP 853 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2331852912902832 seconds
TIMESTEP 854 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.25181007385253906 seconds
TIMESTEP 855 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2481989860534668 seconds
TIMESTEP 856 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
----------Random Action----------
loop took 0.2656080722808838 seconds
TIMESTEP 857 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.251129150390625 seconds
TIMESTEP 858 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2496170997619629 seconds
TIMESTEP 859 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.24971938133239746 seconds
TIMESTEP 860 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
lo

TIMESTEP 918 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21873784065246582 seconds
TIMESTEP 919 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2577028274536133 seconds
TIMESTEP 920 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
----------Random Action----------
loop took 0.22740769386291504 seconds
TIMESTEP 921 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23282527923583984 seconds
TIMESTEP 922 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20664429664611816 seconds
TIMESTEP 923 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2180478572845459 seconds
TIMESTEP 924 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.25026392936706543 seconds
TIMESTEP 925 / STATE observe / EPSILON 0.1 / ACTION 1 / REWARD 0.1 / Q_MAX  0 / Loss  0

loop took 0.20743846893310547 seconds
TIMESTEP 984 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss  0
loop took 0.21325254440307617 seconds
TIMESTEP 985 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.205031156539917 seconds
TIMESTEP 986 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.20256900787353516 seconds
TIMESTEP 987 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.21909403800964355 seconds
TIMESTEP 988 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.23433232307434082 seconds
TIMESTEP 989 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.2190237045288086 seconds
TIMESTEP 990 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD 0.1 / Q_MAX  0 / Loss  0
loop took 0.22684741020202637 seconds
TIMESTEP 991 / STATE observe / EPSILON 0.1 / ACTION 0 / REWARD -5 / Q_MAX  0 / Loss 

TIMESTEP 1038 / STATE explore / EPSILON 0.09987679000000016 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0538316 / Loss  1.31039810181
loop took 0.18572568893432617 seconds
TIMESTEP 1039 / STATE explore / EPSILON 0.09987346000000016 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.887471 / Loss  1.73652029037
loop took 0.18751144409179688 seconds
TIMESTEP 1040 / STATE explore / EPSILON 0.09987013000000017 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.116558 / Loss  0.812381207943
loop took 0.23668980598449707 seconds
TIMESTEP 1041 / STATE explore / EPSILON 0.09986680000000017 / ACTION 1 / REWARD 0.1 / Q_MAX  11.3542 / Loss  1.78666353226
loop took 0.1955556869506836 seconds
TIMESTEP 1042 / STATE explore / EPSILON 0.09986347000000018 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0946717 / Loss  1.83013272285
loop took 0.2105703353881836 seconds
TIMESTEP 1043 / STATE explore / EPSILON 0.09986014000000018 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0842245 / Loss  1.72645068169
loop took 0.21320056915283203 seconds
TIMESTEP 1044 / STATE exp

TIMESTEP 1089 / STATE explore / EPSILON 0.09970696000000037 / ACTION 1 / REWARD 0.1 / Q_MAX  -1.05458 / Loss  0.632514417171
loop took 0.2225954532623291 seconds
TIMESTEP 1090 / STATE explore / EPSILON 0.09970363000000038 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0976732 / Loss  1.27667570114
loop took 0.20958256721496582 seconds
TIMESTEP 1091 / STATE explore / EPSILON 0.09970030000000038 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0345826 / Loss  1.31695103645
loop took 0.17237520217895508 seconds
TIMESTEP 1092 / STATE explore / EPSILON 0.09969697000000038 / ACTION 0 / REWARD 0.1 / Q_MAX  0.129585 / Loss  1.44227087498
loop took 0.23358798027038574 seconds
TIMESTEP 1093 / STATE explore / EPSILON 0.09969364000000039 / ACTION 1 / REWARD 0.1 / Q_MAX  0.00154126 / Loss  0.376496076584
loop took 0.17824149131774902 seconds
TIMESTEP 1094 / STATE explore / EPSILON 0.09969031000000039 / ACTION 0 / REWARD 0.1 / Q_MAX  -1.21187 / Loss  1.00244510174
loop took 0.19451546669006348 seconds
TIMESTEP 1095 / STATE e

loop took 0.21256113052368164 seconds
TIMESTEP 1141 / STATE explore / EPSILON 0.09953380000000059 / ACTION 1 / REWARD 0.1 / Q_MAX  0.141112 / Loss  0.995165288448
loop took 0.1634671688079834 seconds
TIMESTEP 1142 / STATE explore / EPSILON 0.09953047000000059 / ACTION 0 / REWARD 0.1 / Q_MAX  0.116651 / Loss  0.418673753738
loop took 0.21355581283569336 seconds
TIMESTEP 1143 / STATE explore / EPSILON 0.0995271400000006 / ACTION 0 / REWARD 0.1 / Q_MAX  0.00328374 / Loss  0.74675154686
loop took 0.1694502830505371 seconds
TIMESTEP 1144 / STATE explore / EPSILON 0.0995238100000006 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.407874 / Loss  0.836525559425
loop took 0.21984219551086426 seconds
TIMESTEP 1145 / STATE explore / EPSILON 0.0995204800000006 / ACTION 1 / REWARD 0.1 / Q_MAX  0.124849 / Loss  0.765896558762
loop took 0.19273805618286133 seconds
TIMESTEP 1146 / STATE explore / EPSILON 0.09951715000000061 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.00740445 / Loss  0.830469846725
loop took 0.1633701324

TIMESTEP 1191 / STATE explore / EPSILON 0.0993673000000008 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.267485 / Loss  0.939486145973
loop took 0.23257780075073242 seconds
TIMESTEP 1192 / STATE explore / EPSILON 0.0993639700000008 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0267119 / Loss  1.15816259384
loop took 0.26747846603393555 seconds
TIMESTEP 1193 / STATE explore / EPSILON 0.0993606400000008 / ACTION 1 / REWARD 0.1 / Q_MAX  3.45658 / Loss  1.26925039291
loop took 0.22898221015930176 seconds
TIMESTEP 1194 / STATE explore / EPSILON 0.09935731000000081 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.155236 / Loss  1.65209376812
loop took 0.18941855430603027 seconds
TIMESTEP 1195 / STATE explore / EPSILON 0.09935398000000081 / ACTION 0 / REWARD 0.1 / Q_MAX  -1.37433 / Loss  0.687177658081
loop took 0.18248844146728516 seconds
TIMESTEP 1196 / STATE explore / EPSILON 0.09935065000000082 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.316525 / Loss  0.958893418312
loop took 0.2055950164794922 seconds
TIMESTEP 1197 / STATE expl

loop took 0.16543841361999512 seconds
TIMESTEP 1242 / STATE explore / EPSILON 0.09919747000000101 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.0118001 / Loss  0.388026684523
loop took 0.17949771881103516 seconds
TIMESTEP 1243 / STATE explore / EPSILON 0.09919414000000101 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0550981 / Loss  0.878653049469
loop took 0.19852375984191895 seconds
TIMESTEP 1244 / STATE explore / EPSILON 0.09919081000000102 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.105178 / Loss  0.597231924534
loop took 0.20261454582214355 seconds
TIMESTEP 1245 / STATE explore / EPSILON 0.09918748000000102 / ACTION 1 / REWARD 0.1 / Q_MAX  0.104681 / Loss  0.485595524311
loop took 0.20254135131835938 seconds
TIMESTEP 1246 / STATE explore / EPSILON 0.09918415000000103 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.0351887 / Loss  1.00806665421
loop took 0.17751717567443848 seconds
TIMESTEP 1247 / STATE explore / EPSILON 0.09918082000000103 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.0362622 / Loss  0.546875655651
loop took 0.244

loop took 0.2046356201171875 seconds
TIMESTEP 1293 / STATE explore / EPSILON 0.09902764000000122 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.775263 / Loss  0.775719761848
loop took 0.17585277557373047 seconds
TIMESTEP 1294 / STATE explore / EPSILON 0.09902431000000123 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0305114 / Loss  0.623941421509
loop took 0.2552504539489746 seconds
TIMESTEP 1295 / STATE explore / EPSILON 0.09902098000000123 / ACTION 0 / REWARD -5 / Q_MAX  -0.940842 / Loss  0.957114100456
loop took 0.1926279067993164 seconds
TIMESTEP 1296 / STATE explore / EPSILON 0.09901765000000123 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.272675 / Loss  0.7735902071
loop took 0.1859138011932373 seconds
TIMESTEP 1297 / STATE explore / EPSILON 0.09901432000000124 / ACTION 1 / REWARD 0.1 / Q_MAX  -0.739846 / Loss  0.588051497936
loop took 0.2038590908050537 seconds
TIMESTEP 1298 / STATE explore / EPSILON 0.09901099000000124 / ACTION 0 / REWARD 0.1 / Q_MAX  -1.73462 / Loss  0.80109000206
loop took 0.19552159309387

TIMESTEP 1344 / STATE explore / EPSILON 0.09885781000000143 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.360325 / Loss  0.992933750153
loop took 0.21155786514282227 seconds
TIMESTEP 1345 / STATE explore / EPSILON 0.09885448000000144 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.327261 / Loss  0.69052708149
loop took 0.20460867881774902 seconds
TIMESTEP 1346 / STATE explore / EPSILON 0.09885115000000144 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.678958 / Loss  1.4820690155
loop took 0.19188213348388672 seconds
TIMESTEP 1347 / STATE explore / EPSILON 0.09884782000000145 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.414862 / Loss  0.293455868959
loop took 0.19561433792114258 seconds
TIMESTEP 1348 / STATE explore / EPSILON 0.09884449000000145 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.254567 / Loss  0.697984099388
----------Random Action----------
loop took 0.20313358306884766 seconds
TIMESTEP 1349 / STATE explore / EPSILON 0.09884116000000145 / ACTION 0 / REWARD -5 / Q_MAX  -0.715428 / Loss  0.840419054031
loop took 0.154433012008

TIMESTEP 1394 / STATE explore / EPSILON 0.09869131000000164 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0899048 / Loss  0.593283832073
loop took 0.17631864547729492 seconds
TIMESTEP 1395 / STATE explore / EPSILON 0.09868798000000165 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.51634 / Loss  0.805286169052
loop took 0.19651103019714355 seconds
TIMESTEP 1396 / STATE explore / EPSILON 0.09868465000000165 / ACTION 0 / REWARD 0.1 / Q_MAX  0.196959 / Loss  1.9811412096
loop took 0.20754027366638184 seconds
TIMESTEP 1397 / STATE explore / EPSILON 0.09868132000000165 / ACTION 1 / REWARD 0.1 / Q_MAX  -0.477685 / Loss  0.775254011154
loop took 0.20341229438781738 seconds
TIMESTEP 1398 / STATE explore / EPSILON 0.09867799000000166 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0302527 / Loss  1.70806241035
loop took 0.18552160263061523 seconds
TIMESTEP 1399 / STATE explore / EPSILON 0.09867466000000166 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.145645 / Loss  0.461912512779
loop took 0.2225027084350586 seconds
TIMESTEP 1400 / STATE 

loop took 0.18726515769958496 seconds
TIMESTEP 1445 / STATE explore / EPSILON 0.09852148000000185 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0481777 / Loss  0.720978736877
loop took 0.18725347518920898 seconds
TIMESTEP 1446 / STATE explore / EPSILON 0.09851815000000186 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.242328 / Loss  0.87198984623
loop took 0.1694493293762207 seconds
TIMESTEP 1447 / STATE explore / EPSILON 0.09851482000000186 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.48376 / Loss  0.582257628441
loop took 0.1761949062347412 seconds
TIMESTEP 1448 / STATE explore / EPSILON 0.09851149000000187 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.7884 / Loss  0.304948508739
loop took 0.19859623908996582 seconds
TIMESTEP 1449 / STATE explore / EPSILON 0.09850816000000187 / ACTION 1 / REWARD 0.1 / Q_MAX  -0.281362 / Loss  1.19298815727
loop took 0.1714801788330078 seconds
TIMESTEP 1450 / STATE explore / EPSILON 0.09850483000000188 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.0261157 / Loss  1.13314199448
loop took 0.191745519638

TIMESTEP 1496 / STATE explore / EPSILON 0.09835165000000207 / ACTION 0 / REWARD -5 / Q_MAX  0.0567913 / Loss  0.856238365173
loop took 0.22234463691711426 seconds
TIMESTEP 1497 / STATE explore / EPSILON 0.09834832000000207 / ACTION 0 / REWARD 0.1 / Q_MAX  -4.55814 / Loss  0.667400240898
loop took 0.20333456993103027 seconds
TIMESTEP 1498 / STATE explore / EPSILON 0.09834499000000207 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.256724 / Loss  0.884163975716
loop took 0.19154644012451172 seconds
TIMESTEP 1499 / STATE explore / EPSILON 0.09834166000000208 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0230429 / Loss  0.411145865917
loop took 0.1825113296508789 seconds
TIMESTEP 1500 / STATE explore / EPSILON 0.09833833000000208 / ACTION 0 / REWARD 0.1 / Q_MAX  0.0153309 / Loss  0.509860754013
loop took 0.19177699089050293 seconds
TIMESTEP 1501 / STATE explore / EPSILON 0.09833500000000209 / ACTION 1 / REWARD 0.1 / Q_MAX  -0.85703 / Loss  0.225203543901
loop took 0.2165687084197998 seconds
TIMESTEP 1502 / STATE

TIMESTEP 1546 / STATE explore / EPSILON 0.09818515000000227 / ACTION 0 / REWARD 0.1 / Q_MAX  -0.403539 / Loss  1.13104116917


StopIteration: 

In [None]:


def show_plots():
    """Plot the training diagnostics on a 2x2 grid and prime the image display.

    Reads the notebook-level ``loss_df``, ``scores_df`` and ``actions_df``
    objects: the loss curve goes to the top-left axes, the scores to the
    top-right axes and the distribution of actions to the bottom-left axes
    (the bottom-right axes is left empty).

    Side effects:
        * ``loss_df['loss']`` is converted to ``float`` in place.
        * The figure is rasterised into an ``(height, width, 3)`` ``uint8``
          RGB array.
        * ``show_img(graphs=True)`` is created and advanced once.
          NOTE(review): presumably a generator-based coroutine that later
          receives frames through ``send`` -- confirm against ``show_img``.

    Returns:
        None.
    """
    fig, axs = plt.subplots(ncols=2, nrows=2)

    # The plot needs a numeric column, so cast the 'loss' column to float.
    loss_df['loss'] = loss_df['loss'].astype('float')
    loss_df.plot(use_index=True, ax=axs[0, 0])
    scores_df.plot(ax=axs[0, 1])
    sns.distplot(actions_df, ax=axs[1, 0])

    # Force a render so the canvas buffer holds the finished figure.
    fig.canvas.draw()

    # np.frombuffer is the documented replacement for the deprecated binary
    # mode of np.fromstring(..., sep=''). It yields a read-only view over the
    # bytes returned by the canvas instead of a copy.
    graph_img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    # get_width_height() is (width, height); reverse it and append the three
    # colour channels to obtain a (height, width, 3) image.
    graph_img = graph_img.reshape(fig.canvas.get_width_height()[::-1] + (3,))

    disp = show_img(graphs=True)
    # Advance the display object once before anything is sent to it.
    next(disp)
#     while True:
#         disp.send(graph_img)

In [None]:
# Render the diagnostic plots. show_plots() reads the notebook-level
# loss_df, scores_df and actions_df objects, so they must already exist.
show_plots()