Playing Atari games using neat-python and a convolutional autoencoder

# Import

In [2]:
import os, sys, urllib, gzip
os.environ['DISPLAY']=':0'
sys.setrecursionlimit(10000)

from time import time, clock, sleep
import numpy as np
import gym
from gym import wrappers
import neat
import visualize
from __future__ import print_function
from six.moves import cPickle as pickle
import numpy as np
import cv2
import multiprocessing
from multiprocessing import Pool
from multiprocessing.reduction import reduce_connection
from hashlib import sha256
import SharedArray as sa

from functools import partial
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Image as IPImage
from PIL import Image

# Define config

In [3]:
# Template NEAT configuration file; a game-specific copy is derived from it below.
fc_config_filename = 'fc.config'
# The Gym environment id is "<game_name>-<game_version>".
game_name = 'Skiing'
game_version = 'v0'
game = game_name + '-' + game_version
# Number of episodes averaged into one fitness value by evaluate_network().
num_evaluations = 2
# Worker count handed to neat.parallel.ParallelEvaluator.
num_cores = 11
# Copied into config_initial.pop_size before the population is created.
population_size = 60
# Pickled autoencoder for this game, read by get_ae().
encoder_filename = '../collect/{}-enc.pkl'.format(game)
# Length of the encoded feature vector; also used as the NEAT input count.
encoder_outputs = 32
# Number of encoder worker processes started further down.
cuda_processes = 4

# Create environment

In [4]:
# Shared environment instance; evaluate_genome() and the FPS benchmarks use it.
env = gym.make(game)

[2017-02-12 07:49:18,028] Making new env: Skiing-v0


# Autoencoder

In [5]:
def get_ae(encoder_filename):
    """Load the pickled autoencoder stored at ``encoder_filename``.

    Parameters
    ----------
    encoder_filename : str
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object the pickle contains (presumably a nolearn
        ``NeuralNet`` -- verify against the code that wrote the file).
    """
    # Function-scope import: the library is only loaded by processes that
    # actually call this function. The name itself is not referenced below.
    from nolearn.lasagne import NeuralNet
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # point this at encoder files you produced yourself.
    # The context manager closes the handle even if unpickling fails.
    with open(encoder_filename, 'rb') as encoder_file:
        return pickle.load(encoder_file)

In [6]:
def get_f_dense(ae):
    """Compile a Theano function from the network input to the 'encode' layer.

    Parameters
    ----------
    ae : object
        Network exposing a ``layers_`` mapping with 'input0' and 'encode'
        entries.

    Returns
    -------
    callable
        ``theano.function`` taking the 'input0' input variable and returning
        the deterministic output of the 'encode' layer.
    """
    import theano
    from lasagne.layers import get_output
    network_input = ae.layers_['input0'].input_var
    # deterministic=True is forwarded to lasagne's get_output unchanged.
    encoded = get_output(ae.layers_['encode'], deterministic=True)
    return theano.function([network_input], encoded)

In [7]:
def autoencoder(f_dense, picture):
    """Encode one picture with ``f_dense`` and return its single result.

    The picture is wrapped into a one-element float32 batch, passed to
    ``f_dense``, and the first (only) row of the result is returned.
    """
    batch = np.array([picture], dtype=np.float32)
    encoded_batch = f_dense(batch)
    return encoded_batch[0]

In [8]:
# Placeholder for the compiled encoder function; cuda_process() assigns it
# inside each worker process via `global f_dense`.
f_dense = None
# Zero-argument loader with the encoder path already bound.
get_ae_fn = partial(get_ae, encoder_filename = encoder_filename)
#ae = get_ae_fn()

# Cuda processes

In [9]:
def compress_pipe(p):
    """Serialize connection ``p`` so it can travel through a queue.

    Returns the pickled result of ``reduce_connection(p)``;
    ``decompress_pipe`` is the inverse.
    """
    reduced = reduce_connection(p)
    return pickle.dumps(reduced)
def decompress_pipe(pp):
    """Rebuild the object described by the pickled payload ``pp``.

    The payload unpickles to ``(callable, args)``; the callable is invoked
    with the first three entries of ``args`` and its result is returned.
    """
    reduced = pickle.loads(pp)
    rebuild = reduced[0]
    args = reduced[1]
    # Exactly three positional arguments are forwarded, by index.
    return rebuild(args[0], args[1], args[2])

In [10]:
# Manager whose queue proxy is used by both the requesters and the workers.
m = multiprocessing.Manager()
# Request queue read by cuda_process(); transform_observation() puts
# [frame array name, feature array name, pickled pipe end] items on it.
cuda_q = m.Queue()
# Handles of the started encoder worker processes.
cuda_r = []

In [11]:
def cuda_process():
    """Worker loop: encode frames requested through ``cuda_q``.

    Loads the autoencoder once, then serves requests forever. Each request is
    a list ``[frame_name, features_name, pickled_pipe]``:

    * ``frame_name`` / ``features_name`` are SharedArray names; the frame is
      read from the first and the encoded features are written to the second.
    * ``pickled_pipe`` is a connection serialized by ``compress_pipe``; a
      single ``' '`` is sent through it once the features are in place.

    If encoding raises, zeros are written instead so the requester is never
    left waiting; the first such failure per worker is printed so a broken
    encoder does not go unnoticed. This function does not return.
    """
    print("importing...")
    global f_dense
    ae = get_ae_fn()
    f_dense = get_f_dense(ae)

    print("Listening")
    global cuda_q
    encode_failure_reported = False
    while True:
        [sn_frame, sn_features, p] = cuda_q.get(block = True)
        fr = sa.attach(sn_frame)
        ft = sa.attach(sn_features)
        p = decompress_pipe(p)

        try:
            ft[:] = autoencoder(f_dense, fr)
        except Exception as error:
            # Best effort: fall back to zero features, but say so once.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the worker.
            if not encode_failure_reported:
                print("cuda_process: encoding failed, sending zero features: " + repr(error))
                encode_failure_reported = True
            ft[:] = np.zeros(encoder_outputs)

        try:
            p.send(' ')
        except Exception:
            # The requester is gone; there is nobody left to notify.
            pass

In [12]:
# Start workers until cuda_r holds `cuda_processes` handles; re-running the
# cell therefore does not start additional ones.
while len(cuda_r) < cuda_processes:
    cuda_process_p = multiprocessing.Process(target = cuda_process, args = ())
    # Daemon processes are terminated when the parent process exits.
    cuda_process_p.daemon = True
    cuda_process_p.start()
    cuda_r.append(cuda_process_p)
    # Pause one second between launches.
    sleep(1)

importing...
importing...
importing...
importing...


Using gpu device 1: GeForce GTX 980 (CNMeM is disabled, cuDNN 5105)
INFO (theano.gof.compilelock): Waiting for existing lock by process '15273' (I am process '15291')
[2017-02-12 07:49:39,326] Waiting for existing lock by process '15273' (I am process '15291')
INFO (theano.gof.compilelock): To manually release the lock, delete /home/etoestja/.theano/compiledir_Linux-3.16--amd64-x86_64-with-debian-8.6--2.7.9-64/lock_dir
[2017-02-12 07:49:39,333] To manually release the lock, delete /home/etoestja/.theano/compiledir_Linux-3.16--amd64-x86_64-with-debian-8.6--2.7.9-64/lock_dir
INFO (theano.gof.compilelock): Waiting for existing lock by process '15273' (I am process '15328')
[2017-02-12 07:49:40,155] Waiting for existing lock by process '15273' (I am process '15328')
INFO (theano.gof.compilelock): To manually release the lock, delete /home/etoestja/.theano/compiledir_Linux-3.16--amd64-x86_64-with-debian-8.6--2.7.9-64/lock_dir
[2017-02-12 07:49:40,162] To manually release the lock, delete /h

Listening


INFO (theano.gof.compilelock): Waiting for existing lock by process '15291' (I am process '15328')
[2017-02-12 07:49:46,937] Waiting for existing lock by process '15291' (I am process '15328')
INFO (theano.gof.compilelock): To manually release the lock, delete /home/etoestja/.theano/compiledir_Linux-3.16--amd64-x86_64-with-debian-8.6--2.7.9-64/lock_dir
[2017-02-12 07:49:46,942] To manually release the lock, delete /home/etoestja/.theano/compiledir_Linux-3.16--amd64-x86_64-with-debian-8.6--2.7.9-64/lock_dir
Using gpu device 1: GeForce GTX 980 (CNMeM is disabled, cuDNN 5105)
Using gpu device 1: GeForce GTX 980 (CNMeM is disabled, cuDNN 5105)


Listening
Listening
Listening


In [13]:
# Show the worker process handles and their state.
print(cuda_r)

[<Process(Process-2, started daemon)>, <Process(Process-3, started daemon)>, <Process(Process-4, started daemon)>, <Process(Process-5, started daemon)>]


# Create neat-python population

In [14]:
# Load the template NEAT configuration from fc_config_filename.
config_initial = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                             neat.DefaultSpeciesSet, neat.DefaultStagnation, fc_config_filename)

# Override the values that depend on this run: one input per encoder output,
# one output per action of the environment, and the chosen population size.
config_initial.genome_config.num_inputs = encoder_outputs
config_initial.genome_config.num_outputs = env.action_space.n
config_initial.pop_size = population_size

In [15]:
# Game-specific configuration file name, e.g. 'fc-Skiing-v0.config'.
game_fc_config_filename = 'fc-' + game + '.config'

# Write the overridden configuration to disk ...
config_initial.save(game_fc_config_filename)

# ... and load it back, so `config` is built from the saved file.
config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation, game_fc_config_filename)

# Create the population, which is the top-level object for a NEAT run.
p = neat.Population(config)

# Add reporters

In [16]:
# Add a stdout reporter to show progress in the terminal.
p.add_reporter(neat.StdOutReporter())
# Keep a handle on the statistics reporter; the plotting cell reads it.
stats = neat.StatisticsReporter()
p.add_reporter(stats)
# NOTE(review): 5 is presumably the checkpoint interval in generations --
# confirm against the installed neat-python version.
p.add_reporter(neat.Checkpointer(5))

# Define fitness via game score

In [17]:
# Weight of the newest frame difference in get_reward()'s running average.
alpha = 0.6
# Frames are resized to sz x sz before being encoded.
sz = 28
# Re-initialised here; the same name is also set to None in the autoencoder section.
f_dense = None

# Names and handles of the per-process shared arrays; filled by init_shared().
shared_name_frame = None
shared_name_features = None
shared_array_frame = None
shared_array_features = None

def _delete_shared(shared_name):
    """Best-effort removal of the shared array registered as ``shared_name``."""
    try:
        # Same call the cleanup has always used: the name without its
        # 6-character "shm://" prefix.
        sa.delete(shared_name[6:])
    except Exception:
        # Nothing to remove (or already removed). A real problem will
        # surface when the array is created afterwards.
        pass


def init_shared(obj):
    """Create this process's shared frame/feature arrays for ``obj``.

    The array names are built from the current pid and the first 32 bits of
    the SHA-256 of ``str(obj)``. The names and the array handles are stored
    in the module-level ``shared_name_*`` / ``shared_array_*`` globals, which
    ``transform_observation`` reads.

    Arrays created by an earlier call in the same process are removed first;
    otherwise every call would leave two more shared arrays behind.

    Parameters
    ----------
    obj : object
        Anything with a string representation (a genome, or a label such as
        'testing').
    """
    global shared_name_frame, shared_name_features, shared_array_frame, shared_array_features

    own_prefix = "shm://" + str(os.getpid())

    # Only remove arrays this very process created: after a fork the globals
    # may still hold names that belong to the parent process.
    for stale_name in (shared_name_frame, shared_name_features):
        if stale_name is not None and stale_name.startswith(own_prefix + '_'):
            _delete_shared(stale_name)

    text = str(obj)
    if not isinstance(text, bytes):
        # hashlib needs bytes; on Python 2 str already is bytes.
        text = text.encode('utf-8')
    seed = np.frombuffer(sha256(text).digest(), dtype='uint32')

    shared_name_frame = own_prefix + '_frame' + str(seed[0])
    shared_name_features = own_prefix + '_features' + str(seed[0])

    # Clear leftovers under the new names (e.g. the same obj initialised twice).
    _delete_shared(shared_name_frame)
    _delete_shared(shared_name_features)

    shared_array_frame = sa.create(shared_name_frame, (3, sz, sz))
    shared_array_features = sa.create(shared_name_features, (encoder_outputs))

def transform_observation(observation):
    """Encode ``observation`` through one of the encoder worker processes.

    The observation is scaled by 1/255 and written into the shared frame
    array. A request naming both shared arrays plus a pickled pipe end is then
    queued on ``cuda_q``, and the call blocks until the worker answers on the
    pipe.

    ``init_shared`` must have been called in this process beforehand.

    Returns
    -------
    The shared feature array itself (not a copy); the next call overwrites it.

    NOTE(review): there is no timeout -- if no worker ever answers, ``recv()``
    blocks indefinitely.
    """
    # Encoding in-process (autoencoder(f_dense, observation)) is replaced by
    # the round trip through the worker below.
    global shared_array_frame, shared_name_frame, shared_name_features
    global cuda_q
    reply_end, worker_end = multiprocessing.Pipe()
    try:
        pickled_worker_end = compress_pipe(worker_end)
        shared_array_frame[:] = np.array(observation * 1./255)
        cuda_q.put([shared_name_frame, shared_name_features, pickled_worker_end])
        reply_end.recv()
    finally:
        # Release both local pipe ends deterministically, also when recv()
        # raises, instead of relying on garbage collection.
        worker_end.close()
        reply_end.close()
    return(shared_array_features)

# Greedy choice: a = argmax_a Q(s, a)
def predict_action(observation, network):
    """Return the index of the largest network output for ``observation``.

    The observation is first encoded with ``transform_observation``; the
    resulting features are fed to ``network.activate``.
    """
    features = transform_observation(observation)
    scores = network.activate(features)
    return np.argmax(scores)

# Fitness of a network: mean total reward over num_evaluations episodes.
def evaluate_network(env, network):
    """Play ``num_evaluations`` episodes in ``env`` and return the mean reward."""
    episode_rewards = [get_reward(env, network) for _ in range(num_evaluations)]
    return np.array(episode_rewards).mean()

In [18]:
# Running average of frame differences; get_reward() rebinds it every step.
delta = None

# Play one episode with `network` choosing every action.
def get_reward(env, network):
    """Run a single episode and return the summed reward.

    The network never sees raw frames. Its input is ``delta``, an
    exponentially weighted average (newest weight ``alpha``) of the
    differences between consecutive frames resized to ``sz`` x ``sz``. The
    first action is chosen from an all-zero ``delta``.
    """
    global sz, alpha, delta
    frame = cv2.resize(env.reset(), (sz, sz))
    delta = np.zeros((3, sz, sz))

    total_reward = 0
    done = False
    while not done:
        action = predict_action(delta, network)
        previous_frame = frame
        raw_frame, reward, done, info = env.step(action)
        frame = cv2.resize(raw_frame, (sz, sz))
        # NOTE(review): if the frames are unsigned 8-bit arrays (not visible
        # here), this subtraction wraps around instead of going negative.
        # Confirm what the encoder was trained on before changing it.
        # Moving the last axis to the front matches delta's (3, sz, sz) layout.
        frame_change = np.transpose(frame - previous_frame, (2, 0, 1))
        delta = alpha * frame_change + (1 - alpha) * delta
        total_reward += reward
        # A disabled early exit used to sit here:
        # total_reward < -12000 or more than 8000 steps.

    return total_reward

def evaluate_genome(genome, config):
    """Fitness function handed to the parallel evaluator.

    Sets up the shared arrays for this genome, builds its feed-forward
    network and returns the mean reward obtained in the module-level ``env``.
    """
    init_shared(genome)
    phenotype = neat.nn.FeedForwardNetwork.create(genome, config)
    return evaluate_network(env, phenotype)

# Evaluator that calls evaluate_genome(genome, config) using num_cores workers.
evaluator = neat.parallel.ParallelEvaluator(num_workers = num_cores, eval_function = evaluate_genome, timeout = None)

In [22]:
# Smoke test: evaluate a single genome taken from species 1 in this process.
print(evaluate_genome(p.species.get_species(1).members[1], config))

-12003.0


# Run evolution

In [None]:
# Run evolution with the parallel evaluator.
# NOTE(review): 33 is presumably the maximum number of generations -- confirm
# against neat.Population.run.
winner = p.run(evaluator.evaluate, 33)

# Print results

In [26]:
# Display the winning genome (disabled).
#print('\nBest genome:\n{!s}'.format(winner))

# Build the feed-forward network of the winning genome.
winner_network = neat.nn.FeedForwardNetwork.create(winner, config)

In [None]:
# Draw the winning network and plot the statistics collected by `stats`.
visualize.draw_net(config, winner, False)
visualize.plot_stats(stats, ylog = False, view = False)
visualize.plot_species(stats, view = False)

# Disabled: resume a run from a saved checkpoint.
#p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-4')
#p.run(eval_genomes, 10)

# Evaluate from checkpoint & send to OpenAI

In [19]:
def get_winner(p):
    """Return the genome with the highest fitness in ``p.population``.

    Parameters
    ----------
    p : object
        Population object whose ``population`` attribute maps keys to genomes
        carrying a ``fitness`` attribute.

    Returns
    -------
    The best genome, or ``None`` when the population is empty or no genome
    has a fitness. On ties the genome encountered first wins.

    The best fitness found is printed (``-inf`` if there is none).
    """
    max_fitness = float('-inf')
    best_genome = None
    for genome in p.population.values():
        fitness = genome.fitness
        # A fitness of None (presumably a genome that was never evaluated)
        # cannot be ranked, and comparing it with a number raises on Python 3.
        if fitness is None:
            continue
        if best_genome is None or fitness > max_fitness:
            max_fitness = fitness
            best_genome = genome
    print(max_fitness)
    return best_genome

In [20]:
# Replace `p` with the population stored in the checkpoint file for this game.
p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-' + game)

In [21]:
# Pick the fittest genome of the restored population and build its network.
# `config` is the configuration created earlier in the notebook.
winner = get_winner(p)
winner_network = neat.nn.FeedForwardNetwork.create(winner, config)

-6528.0


In [None]:
# Shared arrays for this (notebook) process; get_reward() needs them.
init_shared('testing')
# Separate environment wrapped in a Monitor that records into monitor_path.
env_eval = gym.make(game)
monitor_path = '/tmp/' + game + '-eval'
env_eval = wrappers.Monitor(env_eval, monitor_path)
def evaluate_with_video(network, num_episodes = 100):
    """Play episodes in the monitored ``env_eval`` and print each total reward.

    Parameters
    ----------
    network : object
        Network handed to ``get_reward`` for every episode.
    num_episodes : int, optional
        Number of episodes to play. Defaults to 100, the count that used to
        be hard-coded.
    """
    for i_episode in range(num_episodes):
        print(get_reward(env_eval, network))

In [None]:
# Record the winning network playing in the monitored environment.
evaluate_with_video(winner_network)

In [None]:
# Close the monitored environment before uploading its recordings.
env_eval.close()

In [None]:
# The API key is read from the environment instead of being stored in the
# notebook. The key that used to be hard-coded here has been published with
# the notebook and should be treated as leaked and revoked.
# Raises KeyError if OPENAI_GYM_API_KEY is not set.
gym.upload(monitor_path, api_key=os.environ['OPENAI_GYM_API_KEY'])

# Generate video

In [None]:
# Shared arrays for this (notebook) process; get_reward() needs them.
init_shared('testing')
env_eval = gym.make(game)
monitor_path = '/tmp/' + game + '-eval'
# NOTE(review): force = True presumably lets the monitor overwrite earlier
# recordings in monitor_path -- confirm in the gym documentation.
env_eval = wrappers.Monitor(env_eval, monitor_path, force = True)
# Fixed seed for this recording.
env_eval.seed(0)
# Play one recorded episode with the winning network.
get_reward(env_eval, winner_network)

[2017-02-12 07:52:01,468] Making new env: Skiing-v0
[2017-02-12 07:52:01,814] Starting new video recorder writing to /tmp/Skiing-v0-eval/openaigym.video.1.15239.video000000.mp4


# Measure performance

In [None]:
# FPS total: one emulator step plus one encoder round trip per frame.
t_initial = time()
env.reset()
# Number of frames to time.
F = 100
# Not used below.
buf = []
for i in range(F):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    # A scalar stands in for the frame; only the round-trip time matters here.
    # Requires init_shared() to have been called in this process.
    features = transform_observation(1)
t_end = time()
print("FPS: " + str(1. * F / (t_end - t_initial)))

In [None]:
# FPS emulator-only: same loop as the total measurement, without the encoder call.
t_initial = time()
env.reset()
# Number of frames to time.
F = 100
# Not used below.
buf = []
for i in range(F):
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
t_end = time()
print("FPS: " + str(1. * F / (t_end - t_initial)))