# Modifying Rewards

I'm a bit disappointed that the agents produced by the genetic algorithm tend to drift too far from the center. If I add a light penalty for distance, will they avoid this?

Reading [the wiki](https://github.com/openai/gym/wiki/CartPole-v0) helps a lot in figuring this out! That page has the parameters for cart-pole v0, which is just a shorter version of the cart-pole v1 we use.

In [1]:
# Display GIFs in Jupyter
from IPython.display import HTML

# OpenAI gym
import gym

# Import local script
import agents

# numpy
import numpy as np

# To speed up the algorithm
from multiprocessing import Pool
# Size of the multiprocessing Pool used when a generation of agents is trialled
n_jobs = 4 # Set your number of cores here

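Here I add a penalty of `abs(observation[0]/10)` on every step. Observation 0 is the cart's position relative to the center; its absolute value is the distance from the center, and an episode ends once the cart leaves `[-2.4, 2.4]`.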

In [2]:
def trial_agent(agent, trials=100, limit=1000):
    """Play several episodes with an agent and summarise its penalised scores.

    On every step that does not end the episode, the score grows by
    ``reward - abs(observation[0]/10)``: the environment's reward minus a
    light penalty on the magnitude of the first observation value. The step
    on which the environment reports ``done`` adds nothing to the score.

    Parameters
    ----------
    agent
        Object exposing ``game`` (handed to ``gym.make``),
        ``predict(observation)``, ``w`` and ``pedigree``.
    trials : int
        Number of episodes to play.
    limit : int
        Maximum number of steps taken in one episode.

    Returns
    -------
    dict
        ``"agent"``, ``"weights"`` and ``"pedigree"`` describe the agent;
        ``"minimum"``, ``"maximum"`` and ``"mean"`` summarise the episode
        scores.
    """
    env = gym.make(agent.game)

    scores = []
    try:
        for _ in range(trials):
            observation = env.reset()
            score = 0
            for _ in range(limit):
                action = agent.predict(observation)
                observation, reward, done, _info = env.step(action)
                if done:
                    break
                # Add a light penalty for distance
                score += reward - abs(observation[0]/10)
            scores.append(score)
    finally:
        # Always release the environment, even when an episode raises
        env.close()

    return {
        "agent" : agent,
        "weights" : agent.w,
        "pedigree" : agent.pedigree,
        "minimum" : min(scores),
        "maximum" : max(scores),
        "mean" : sum(scores)/len(scores)
    }

In [3]:
def _selection_probabilities(scores, n_required):
    """Convert mean scores into sampling probabilities that sum to 1.

    Every score is shifted by ``abs(min(scores))``. For an all-positive
    population this is the same ``mean + min`` weighting as before; when the
    minimum is negative the shift lifts every score to zero or above, so the
    probabilities can never be negative. If the shifted scores sum to zero,
    or fewer than ``n_required`` of them are positive (sampling without
    replacement needs that many), uniform probabilities are returned.
    """
    offset = abs(min(scores))
    shifted = [score + offset for score in scores]
    total = sum(shifted)
    positive = sum(1 for value in shifted if value > 0)
    if total <= 0 or positive < n_required:
        return [1.0 / len(scores)] * len(scores)
    return [value / total for value in shifted]


def genetic_algorithm(results, old=5, new=95, n_parents=2, generations=25, mutation_rate=0.01, mutation_amount=0.5, order=1, max_score=499.0, cartpole=True):
    """Evolve a population of linear agents for a number of generations.

    Parameters
    ----------
    results : list of dict
        Current population as returned by ``trial_agent``; each entry needs
        the keys ``"agent"``, ``"weights"``, ``"pedigree"`` and ``"mean"``.
    old : int
        Number of top agents carried over unchanged (elitism).
    new : int
        Number of offspring bred per generation.
    n_parents : int
        Number of distinct parents sampled for each offspring.
    generations : int
        Maximum number of generations to run.
    mutation_rate : float
        Per-gene probability of a mutation.
    mutation_amount : float
        Standard deviation of the normal noise added by a mutation.
    order, cartpole
        Forwarded to ``agents.LinearAgent`` when offspring are created.
    max_score : float
        Stop early once the best mean score reaches this value.

    Returns
    -------
    list of dict
        Trial results of the final generation, sorted by descending mean
        score. With ``generations=0`` the input is returned untouched.
    """
    for generation in range(generations):
        # Sort agents by score (fitness)
        top_scores = sorted(results, key=lambda x: x["mean"], reverse=True)

        # The survival of the fittest. Wikipedia calls this "elitism".
        # The top agents of a generation are carried over to the next
        survivors = top_scores[:old]

        # To start breeding new agents, I'll mix weights (genes)
        gene_pool = [i["weights"] for i in top_scores]
        pedigree_list = [i["pedigree"] for i in top_scores]
        genome_size = len(gene_pool[0])

        # Scores can be negative, so they are shifted to be non-negative
        # They also need to sum to 1 for random sampling
        probs = _selection_probabilities([i["mean"] for i in top_scores], n_parents)

        # For each new agent, randomly select parents
        # Higher-fitness agents are likelier to sire new agents
        children = []
        for _ in range(new):
            parents = np.random.choice(np.arange(len(gene_pool)),
                             size=n_parents,
                             replace=False,
                             p=probs)

            # The offspring get a mix of each parent's weights
            # The weights (genes) are simply copied over
            mix = np.random.randint(0, high=n_parents, size=genome_size)

            weights = []
            pedigree = []
            for i in range(genome_size):
                donor = parents[mix[i]]
                gene = gene_pool[donor][i]
                lineage = pedigree_list[donor][i]
                # A mutation happens rarely and adds a bit of noise to a gene
                if np.random.random() < mutation_rate:
                    gene += np.random.normal(0, mutation_amount)
                    lineage += "M"
                weights.append(gene)
                pedigree.append(lineage)

            children.append({"weights" : weights, "pedigree" : pedigree})

        # Elitism: the top agents survive to fight another day
        new_agents = [i["agent"] for i in survivors]

        # The offspring are added in
        # With the pedigree variable their ancestors are tracked
        for child in children:
            new_agents.append(agents.LinearAgent(child["weights"],
                pedigree=child["pedigree"],
                order=order,
                cartpole=cartpole))

        # Trial the agents using multiple CPU cores; the with-block shuts
        # the worker processes down even if a trial raises
        with Pool(n_jobs) as pool:
            results = pool.map(trial_agent, new_agents)

        results = sorted(results, key=lambda x: x["mean"], reverse=True)

        print(f"[{generation+1:3}] Population average: {sum([i['mean'] for i in results])/len(results):5.1f}")
        print(f"[{generation+1:3}] Best mean score:    {results[0]['mean']:5.1f}, Pedigree: {'-'.join(results[0]['pedigree'])}")
        print()

        # End early if maximum is reached
        if results[0]['mean'] >= max_score:
            print(f"[{generation+1:3}] Best score reached, ending early")
            break
    return results

## Extreme fifth-order agent

I'm going to try a fifth-order agent (`order=5`). And this time I'll take a peek at the results every five generations.

In [4]:
# Starting population: trial 25 agents created without preset weights
results = [
    trial_agent(agents.LinearAgent(None, order=5, id=agent_id))
    for agent_id in range(25)
]

# max() returns the first entry with the highest mean, which is the same
# entry a descending stable sort would put in front
winner = max(results, key=lambda entry: entry["mean"])

print(winner)

# Render one episode of the winner and show the resulting GIF
gif_path = winner["agent"].render("mod_cart_0.gif", episodes=1)
HTML(f"<img src='{gif_path}'>")

{'agent': <agents.LinearAgent object at 0x7ff76d92be10>, 'weights': array([ 0.01094896, -0.45413246, -0.43642856,  0.67977911,  0.14533926,
        0.06389506, -0.12374572,  0.73809348, -0.2168805 , -0.78608922,
       -0.00992948, -0.8739999 ,  0.84260605,  0.29482506,  0.18337614,
        0.31788015, -0.69486694,  0.76971151, -0.28259501, -0.99907324,
       -0.15805739]), 'pedigree': ['22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22', '22'], 'minimum': 33.81001627834163, 'maximum': 181.52994022624586, 'mean': 74.71578066817105}


In [5]:
# Evolve the population for another five generations
results = genetic_algorithm(results, generations=5, order=5)

# First entry with the highest mean score
winner = max(results, key=lambda entry: entry["mean"])

print(winner)

# Render one episode of the winner and show the resulting GIF
gif_path = winner["agent"].render("mod_cart_5.gif", episodes=1)
HTML(f"<img src='{gif_path}'>")

[  1] Population average:  24.6
[  1] Best mean score:    135.2, Pedigree: 22-6-6-6-6-22-6-22-22-6-22-6-6-6-6-6-6-6-22-22-22

[  2] Population average:  54.2
[  2] Best mean score:    417.5, Pedigree: 22-6-21-21-21-22-6-6-6-6-22-6-21-6-6-6-6-6-22-22-6

[  3] Population average: 114.5
[  3] Best mean score:    419.6, Pedigree: 22-6-21-1-21M-22-6-22-6-6-22-1-1-1-6-6-6-1-22-22-1

[  4] Population average: 188.1
[  4] Best mean score:    428.8, Pedigree: 22-6-21-21-21-22-21-21-21-6-22-12-21-6-21-6-21-6-12-22-10

[  5] Population average: 247.9
[  5] Best mean score:    422.2, Pedigree: 22-8-21-22-21-21-21-21-8-6-8-22-20-21-21-8-22-1-6-1M-1

{'agent': <agents.LinearAgent object at 0x7ff768f32828>, 'weights': [0.010948962265880269, -0.05240362794528752, 0.08920413429256313, 0.6797791137616385, 0.5616888590346676, -0.5017146986289274, -0.31144111412878717, -0.9408170651648431, -0.7189290530501449, -0.3974992856089661, -0.7737071128675692, -0.87399990190714, 0.2894879885647068, -0.079895720808

In [6]:
# Evolve the population for another five generations
results = genetic_algorithm(results, generations=5, order=5)

# First entry with the highest mean score
winner = max(results, key=lambda entry: entry["mean"])

print(winner)

# Render one episode of the winner and show the resulting GIF
gif_path = winner["agent"].render("mod_cart_10.gif", episodes=1)
HTML(f"<img src='{gif_path}'>")

[  1] Population average: 274.7
[  1] Best mean score:    426.4, Pedigree: 22-6-21-1-6M-22-6-22-21-6-22-6-21-1-6-6-6M-1-22-22-1

[  2] Population average: 313.4
[  2] Best mean score:    470.4, Pedigree: 22M-6-21-22-1-8-6-21-21M-21-1-22-21-6-6-21-6-1-21-22-21

[  3] Population average: 348.2
[  3] Best mean score:    476.4, Pedigree: 22M-6-21-22-1-8-6-21-21M-21-1-22-21-6-6-21-6-1-21-22-21

[  4] Population average: 354.5
[  4] Best mean score:    480.6, Pedigree: 22M-6-21-22-1-8-6-21-21M-21-1-22-21-6-6-21-6-1-21-22-21

[  5] Population average: 355.9
[  5] Best mean score:    475.1, Pedigree: 22M-6-21-22-1-8-6-21-21M-21-1-22-21-6-6-21-6-1-21-22-21

{'agent': <agents.LinearAgent object at 0x7ff768ea28d0>, 'weights': [-0.038420072021086625, -0.09894921999799955, 0.08920413429256313, 0.6797791137616385, 0.23848436970449804, 0.21380969198457422, 0.3645687543167686, -0.9408170651648431, 1.2472383690551632, 0.8149320933540891, 0.13858602234176098, -0.87399990190714, -0.622040503387544, 0.623

In [7]:
# Evolve the population for another five generations
results = genetic_algorithm(results, generations=5, order=5)

# First entry with the highest mean score
winner = max(results, key=lambda entry: entry["mean"])

print(winner)

# Render one episode of the winner and show the resulting GIF
gif_path = winner["agent"].render("mod_cart_15.gif", episodes=1)
HTML(f"<img src='{gif_path}'>")

[  1] Population average: 348.9
[  1] Best mean score:    476.0, Pedigree: 22M-6-21-21-6-22-6-21-16-21-22-22-21-6-6-21-22-1-22-22-1

[  2] Population average: 367.3
[  2] Best mean score:    476.4, Pedigree: 22M-6-21-22-6-22-6-21-16-21-22-22-21-6-6-21-22-1-21-22-1

[  3] Population average: 370.2
[  3] Best mean score:    474.3, Pedigree: 22M-6-21-22-1-8-6-21-21M-21-1-22-21-6-6-21-6-1-21-22-21

[  4] Population average: 368.9
[  4] Best mean score:    483.0, Pedigree: 22M-6-21-21-6-6-6-21-16-21-20-22-21-6-6-6-6-1-22-22-21

[  5] Population average: 378.7
[  5] Best mean score:    482.5, Pedigree: 22M-8-21-1-6M-6-1-6-21MMM-21-22-21-0-6-6-6M-6-1-21M-22-16

{'agent': <agents.LinearAgent object at 0x7ff768ec69b0>, 'weights': [-0.038420072021086625, -0.05240362794528752, 0.08920413429256313, 0.5286551489552509, 0.3776596885234489, 0.045363378289105416, 0.7148876400735882, 0.17784020387707478, 2.8041611464509892, 0.8149320933540891, -0.009929481156090159, 0.2162495409357419, -0.2822813625426

In [8]:
# Evolve the population for another five generations
results = genetic_algorithm(results, generations=5, order=5)

# First entry with the highest mean score
winner = max(results, key=lambda entry: entry["mean"])

print(winner)

# Render one episode of the winner and show the resulting GIF
gif_path = winner["agent"].render("mod_cart_20.gif", episodes=1)
HTML(f"<img src='{gif_path}'>")

[  1] Population average: 377.4
[  1] Best mean score:    484.9, Pedigree: 22M-6-21-22-6MM-22-21-6-21MMM-21-22-1-21-1-21M-21-22-6-22-1-1

[  2] Population average: 387.7
[  2] Best mean score:    484.1, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-1-22-22-22-6-22-1-22-6-1

[  3] Population average: 397.8
[  3] Best mean score:    483.4, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-1-22-22-22-6-22-1-22-6-1

[  4] Population average: 392.6
[  4] Best mean score:    484.1, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-1-22-22-22-6-22-1-22-6-1

[  5] Population average: 379.0
[  5] Best mean score:    485.1, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-1-22-22-22-6-22-1-22-6-1

{'agent': <agents.LinearAgent object at 0x7ff768e5ac18>, 'weights': [-0.038420072021086625, -0.09894921999799955, 0.08920413429256313, 0.795518223459416, 0.5616888590346676, 0.0638950568255372, 0.7148876400735882, 0.17784020387707478, 2.8041611464509892, -0.3974992856089661, -0.009929481156090159, 0.32852424110954703, 0.8426

In [9]:
# Evolve the population for another five generations
results = genetic_algorithm(results, generations=5, order=5)

# First entry with the highest mean score
winner = max(results, key=lambda entry: entry["mean"])

print(winner)

# Render one episode of the winner and show the resulting GIF
gif_path = winner["agent"].render("mod_cart_25.gif", episodes=1)
HTML(f"<img src='{gif_path}'>")

[  1] Population average: 395.3
[  1] Best mean score:    484.2, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-22M-21-22-22-6-22-6-22-6-1

[  2] Population average: 387.4
[  2] Best mean score:    484.3, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-1-22-22-22-6-22-1-22-6-1

[  3] Population average: 390.2
[  3] Best mean score:    486.0, Pedigree: 22M-8-21-21-21-22-1-6-21MMM-6-8-1-21-20M-6M-6-22-1-22-22-1

[  4] Population average: 394.2
[  4] Best mean score:    485.1, Pedigree: 22M-6-21-21-21-22-1-6-21MMM-6-22-22MM-0-22-22-6-22-1-22-21-6

[  5] Population average: 408.9
[  5] Best mean score:    485.0, Pedigree: 22M-8-21-21-1-22-1-6-21M-6-21M-1-0M-22-6-6-6-1M-22-6-1

{'agent': <agents.LinearAgent object at 0x7ff768e8f2b0>, 'weights': [-0.038420072021086625, -0.05240362794528752, 0.08920413429256313, 0.795518223459416, 0.23848436970449804, 0.0638950568255372, 0.7148876400735882, 0.17784020387707478, 1.2472383690551632, -0.3974992856089661, -0.33054836357113626, 0.32852424110954703, -0.5703