Playing ATARI-ram games using python-neat

# Import

In [2]:
import numpy as np
import gym
from gym import wrappers
from __future__ import print_function
import os
%matplotlib inline
import neat
import visualize
os.environ['DISPLAY']=':0'

# Define config

In [3]:
fc_config_filename = 'fc.config'
game = 'MsPacman-ram-v0'
num_evaluations = 3
num_cores = 10
population_size = 60

# Create environment

In [4]:
env = gym.make(game)

[2017-02-08 17:59:35,339] Making new env: MsPacman-ram-v0


# Create neat-python population

In [5]:
# Load configuration.
config_initial = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation, fc_config_filename)

config_initial.genome_config.num_inputs = env.observation_space.shape[0]
config_initial.genome_config.num_outputs = env.action_space.n
config_initial.pop_size = population_size

In [6]:
game_fc_config_filename = 'fc-' + game + '.config'

config_initial.save(game_fc_config_filename)

config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation, game_fc_config_filename)

# Create the population, which is the top-level object for a NEAT run.
p = neat.Population(config)

# Add reporters

In [7]:
# Add a stdout reporter to show progress in the terminal.
p.add_reporter(neat.StdOutReporter())
stats = neat.StatisticsReporter()
p.add_reporter(stats)
p.add_reporter(neat.Checkpointer(5))

# Define fitness via game score

In [8]:
def transform_observation(observation):
    observation = observation / 255. * 2 - 1
    return observation

# a = argmax_a Q(s,a)
def predict_action(observation, network):
    observation = transform_observation(observation)
    output = network.activate(observation)
    action = np.argmax(output)
    return(action)

# play num_evaluations games, take mean
def evaluate_network(env, network):
    rewards = []
    i = 0
    while i < num_evaluations:
        rewards += [get_reward(env, network)]
        i += 1

    res = np.array(rewards).mean()
    return res

# play 1 game with network
def get_reward(env, network):
    observation = env.reset()
    done = False
    iteration, total_reward = 0, 0
        
    while not done:
        #env.render()
        action = predict_action(observation, network)
        observation, reward, done, info = env.step(action)
        total_reward += reward

        #if iteration % 500 == 0:
        #    print(str(iteration))

        #if total_reward < -12000 or iteration >= 8000:
        #    break

        iteration += 1
        
    return total_reward

def evaluate_genome(genome, config):
    network = neat.nn.FeedForwardNetwork.create(genome, config)
    fitness = evaluate_network(env, network)
    return fitness
    
#print(evaluate_genome(p.species.get_species(1).members[1], config))

evaluator = neat.parallel.ParallelEvaluator(num_workers = num_cores, eval_function = evaluate_genome, timeout = None)

# Run evolution

In [None]:
# Run evolution
winner = p.run(evaluator.evaluate, 200)


 ****** Running generation 200 ****** 

Population's average fitness: 763.21637 stdev: 505.88581
Best fitness: 1760.00000 - size: (10, 6) - species 79 - id 1520
Species length: 19 totaling 57 individuals
Species no improv: {21: 58, 56: 40, 59: 39, 60: 36, 64: 52, 65: 10, 67: 35, 68: 56, 69: 51, 70: 34, 71: 23, 72: 28, 73: 27, 74: 21, 75: 16, 76: 23, 77: 17, 78: 9, 79: 0}
Average adjusted fitness: 0.414
Spawn amounts: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
Species fitness  : [0.49477124183006527, 0.5725490196078431, 0.45424836601307195, 0.0, 0.25751633986928102, 0.44509803921568625, 0.40784313725490201, 0.68496732026143781, 0.42222222222222217, 0.27516339869281042, 0.40849673202614373, 0.39869281045751637, 0.0, 0.62679738562091514, 0.46601307189542479, 0.14575163398692811, 0.59411764705882353, 0.55555555555555547, 0.64967320261437911]
Mean genetic distance 2.1703075175, std dev 0.502056699817
Total extinctions: 0
Generation time: 42.563 sec (45.883 average)

 ****** 

In [None]:
winner = p.run(evaluator.evaluate, 2000)


 ****** Running generation 400 ****** 

Population's average fitness: 1020.50505 stdev: 561.70113
Best fitness: 1820.00000 - size: (10, 7) - species 100 - id 3108
Species length: 21 totaling 66 individuals
Species no improv: {91: 45, 96: 24, 100: 11, 102: 38, 103: 49, 104: 57, 105: 56, 106: 23, 107: 60, 108: 57, 109: 42, 112: 31, 113: 15, 114: 26, 115: 1, 116: 22, 118: 5, 119: 1, 120: 3, 121: 0, 122: 0}

Species 107 with 3 members is stagnated: removing it
Average adjusted fitness: 0.538
Spawn amounts: [3, 3, 5, 3, 4, 3, 3, 3, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 3, 3]
Species fitness  : [0.71338383838383834, 0.79797979797979801, 0.81818181818181823, 0.5757575757575758, 0.64725378787878796, 0.75441919191919182, 0.66792929292929282, 0.61237373737373724, 0.085227272727272721, 0.056818181818181816, 0.42108585858585851, 0.0, 0.58459595959595956, 0.72222222222222221, 0.75568181818181823, 0.69823232323232309, 0.23611111111111113, 0.40214646464646459, 0.55239898989898983, 0.65404040404040398]
Mean 

# Print results

In [11]:
# Display the winning genome.
#print('\nBest genome:\n{!s}'.format(winner))

# Show output of the most fit genome against training data.
winner_network = neat.nn.FeedForwardNetwork.create(winner, config)
visualize.draw_net(config, winner, False)
visualize.plot_stats(stats, ylog = False, view = False)
visualize.plot_species(stats, view = False)

#p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-4')
#p.run(eval_genomes, 10)

ValueError: max() arg is an empty sequence

# Evaluate from checkpoint & send to OpenAI

In [10]:
p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-1244')
p.run(evaluator.evaluate, 1)
winner = p.best_genome

Mean genetic distance 2.44789332906, std dev 0.572318014762


In [43]:
env_eval = gym.make(game)
monitor_path = '/tmp/' + game + '-eval'
env_eval = wrappers.Monitor(env_eval, monitor_path)
def evaluate_with_video(game, network):
    for i_episode in range(100):
        observation = env_eval.reset()
        total_reward = 0
        t = 0
        while True:
            env_eval.render()
            action = predict_action(observation, network)
            observation, reward, done, info = env_eval.step(action)
            total_reward += reward
            t += 1
            if done:
                print("Episode finished after {0} timesteps reward = {1}".format(t+1, total_reward))
                break

In [None]:
evaluate_with_video(game, winner_network)

In [None]:
env_eval.close()

In [None]:
gym.upload(monitor_path, api_key='sk_ciz2F0csRzCkpESayoRuug')