In [4]:
import sys
from random import randrange
from PIL import Image
import matplotlib.pyplot as plt
from Game import Game
from copy import deepcopy
from QLearning import *
import numpy as np
import matplotlib.pyplot as plt
import csv

In [1]:
def main(display=False, seed=123, dist_func='euclid', exp_func='eps-greedy', num_episodes=15, csv_name='training_data'):
    """Train a ``QLearning`` agent on ``Game`` and save per-episode results.

    Each episode plays the game until ``game.is_over()``; on every step an
    action is chosen, executed, and the learner's weights are updated from
    the observed reward. After each episode the final frame is plotted, the
    learner's weights and the episode score are printed, and one row
    ``[episode, total_reward, *weights]`` is recorded. Once all episodes are
    done the recorded rows are written to a CSV file named after
    ``csv_name``.

    Action selection depends on ``learner.exp_func``:
      * ``"eps-greedy"``: with probability ``learner.eps`` use
        ``learner.get_eps_greedy_action()``, otherwise the max-Q action.
      * ``"softmax"``: always ``learner.get_softmax_action()``.
      * anything else: ``learner.get_max_q_action()``.

    Args:
        display: Forwarded to ``Game(display=...)``.
        seed: Forwarded to ``Game(random_seed=...)``.
        dist_func: Forwarded to ``QLearning(dist_func=...)``.
        exp_func: Forwarded to ``QLearning(exp_func=...)``.
        num_episodes: Number of episodes to play.
        csv_name: Name of the CSV output file; ``.csv`` is appended when the
            name does not already end with it.

    Returns:
        The ``Game`` instance that was used for training.
    """
    # Initialize game and learner
    game = Game(display=display, random_seed=seed)
    learner = QLearning(game, dist_func=dist_func, exp_func=exp_func)

    # Display run parameters
    print("""Running game with:
                        Seed: {}
                        Distance Function: {}
                        Exploration Function: {}
                        # of Episodes: {}
                        CSV Filename: {}
                """.format(seed, dist_func, exp_func, num_episodes,csv_name))

    # NOTE(review): `minimal_actions` is never read below. The two lines are
    # kept only because pop(1) could be mutating a list owned by the
    # emulator -- confirm against the ALE wrapper and delete if it is a copy.
    minimal_actions = game.ale.getMinimalActionSet()
    minimal_actions.pop(1)

    # One row per finished episode: [episode, total_reward, *weights]
    training_data = []

    # Start training
    for episode in range(num_episodes):
        total_reward = 0

        game.initialize()
        while not game.is_over():
            # Get current state q_values and grad_theta_q values
            curr_state_q = learner.q_func(game)[1]
            curr_state_fevals = np.array(learner.get_distances(game))

            # Get action based on exploration strategy
            if learner.exp_func == "eps-greedy" and np.random.random() < learner.eps:
                best_action = learner.get_eps_greedy_action()
            elif learner.exp_func == "softmax":
                best_action = learner.get_softmax_action()
            else:
                best_action = learner.get_max_q_action()

            # Execute action and update weights based on reward.
            # The emulator reward and the game's own shaped reward are summed.
            # (For step-by-step debugging, call plot(game) before/after act.)
            reward = game.ale.act(best_action[0])
            game.update_RAM()
            reward += game.get_reward()

            learner.update_weights(curr_state_q, curr_state_fevals, best_action, reward)

            total_reward += reward

        plot(game)
        print(learner.weights)
        print("Episode %d ended with score: %d" % (episode, total_reward))

        # Snapshot the weights with list() so later updates cannot alter the row
        training_data.append([episode, total_reward] + list(learner.weights))

        game.reset()

    # Persist the collected rows; previously they were dropped on return.
    _write_training_csv(csv_name, training_data)

    return game


def _write_training_csv(csv_name, rows):
    """Write the per-episode rows to ``csv_name`` (``.csv`` added if missing) and return the path."""
    path = csv_name if csv_name.lower().endswith('.csv') else csv_name + '.csv'
    # Every row is [episode, total_reward, *weights]; size the header from the first row.
    n_weights = len(rows[0]) - 2 if rows else 0
    header = ['episode', 'total_reward'] + ['weight_%d' % i for i in range(n_weights)]
    # newline='' is what the csv module requires to avoid blank lines on Windows.
    with open(path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(rows)
    return path

In [3]:
def plot(game):
    """Show the game's current screen as a matplotlib image.

    ``game.screen`` is handed to ``game.ale.getScreenRGB`` (presumably filled
    in place -- the call's return value is ignored) and then displayed.
    """
    frame = game.screen
    game.ale.getScreenRGB(frame)
    plt.imshow(frame)
    plt.show()

In [None]:
# Train for 1000 episodes with every other setting left at main()'s defaults;
# `g` holds the Game instance that main() returns.
g = main(num_episodes=1000)

In [2]:
# State flag for every block of the 6-row pyramid (row i holds i + 1 blocks).
# All blocks start at 0; the distance cell below only counts blocks in state 0.
block_states = [[0] * row_len for row_len in range(1, 7)]

# Coordinates of each block, laid out row by row to mirror `block_states`
# (presumably screen pixels as (x, y) -- TODO confirm against Game).
BLOCK_POS = [
    [(76, 35)],
    [(64, 63), (92, 63)],
    [(53, 92), (77, 92), (104, 92)],
    [(40, 121), (64, 121), (92, 121), (117, 121)],
    [(29, 150), (52, 150), (76, 150), (105, 150), (128, 150)],
    [(16, 179), (40, 179), (64, 179), (93, 179), (116, 179), (140, 179)],
]

# Coordinates of the two discs, in the same coordinate system as BLOCK_POS.
DISC_POS = [(15, 138), (144, 138)]

# Alternative starting configuration (disabled): only the top block is 0.
# block_states = [[0],
#                 [1,1],
#                 [1,1,1],
#                 [1,1,1,1],
#                 [1,1,1,1,1],
#                 [1,1,1,1,1,1]]

In [17]:
# Scratch check: Manhattan (L1) distance from the top block (76, 35) to the
# point (140, 179). NOTE(review): despite the name, `disc_pos` holds the
# coordinates of the last block in BLOCK_POS, not an entry of DISC_POS.
q_pos = np.array([76, 35])
disc_pos = np.array([140, 179])
# disc_pos = np.array(DISC_POS[0])
np.fabs(disc_pos - q_pos).sum()

208.0

In [5]:
# Upper bound on the block-distance feature: the sum of Manhattan distances
# from the starting position to every block whose state is still 0.
player_pos = (76, 35)
q_pos = np.array(player_pos)

dist = 0
for row_states, row_positions in zip(block_states, BLOCK_POS):
    for block_state, block_xy in zip(row_states, row_positions):
        if block_state == 0:
            dist += np.fabs(np.array(block_xy) - q_pos).sum()

print(dist)

2576.0


In [None]:
# Scratch check: 2 ** 3.2 is roughly 9.19, so this evaluates to 9.0.
np.floor(2 ** 3.2)