In [None]:
%cd ..
import gym
import gym_game
import matplotlib.pyplot as plt

# Build the environment once and keep its first observation in `reset`;
# load_agent() uses that observation as a sample input row.
env = gym.make("Parkme")
reset, _ = env.reset()

# Size of the discrete action set and length of the observation vector.
n_actions, states_dim = env.action_space.n, env.observation_space.shape[0]
print(n_actions, states_dim)

In [None]:
from sklearn.neural_network import MLPClassifier
import pickle


def load_agent(path=None):
    """Return an agent, either freshly initialised or unpickled from disk.

    Parameters
    ----------
    path : str or path-like, optional
        Pickle file holding a previously saved agent. When ``None`` (the
        default) a new ``MLPClassifier`` is created instead.

    Returns
    -------
    object
        A new classifier that has received one ``partial_fit`` call, or
        whatever object the pickle file contains.

    Notes
    -----
    The fresh-agent branch reads the notebook-level names ``reset`` (initial
    observation) and ``n_actions``.
    """
    if path is None:
        agent = MLPClassifier(
            # One hidden layer of 128 units. The trailing comma makes this an
            # actual tuple; ``(128)`` is just the int 128 in parentheses.
            hidden_layer_sizes=(128,),
            activation="logistic",
        )
        # Warm-up call: the initial observation repeated once per action, with
        # every action id passed both as a label and as the `classes` argument.
        return agent.partial_fit(
            [reset] * n_actions, range(n_actions), range(n_actions)
        )
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by this notebook or another trusted source.
    with open(path, "rb") as model:
        return pickle.load(model)


# No path is given, so this builds a brand-new agent instead of loading one.
agent = load_agent()

In [None]:
import numpy as np
import random


def generate_session(env, agent, exploration_rate=0.8, max_steps=3000):
    """Play one episode and record the visited states and chosen actions.

    Parameters
    ----------
    env : environment
        Must expose ``action_space.n``, ``reset()`` returning
        ``(observation, info)`` and ``step(action)`` returning
        ``(observation, reward, terminated, truncated, info)``.
    agent : classifier
        Must expose ``predict_proba``; it is called with a one-row batch and
        its first row is used as the action distribution.
    exploration_rate : float, default 0.8
        Probability of sampling the action from the agent's distribution.
        NOTE: despite the name, a uniformly random action is taken with
        probability ``1 - exploration_rate`` (i.e. when a uniform draw
        exceeds this value).
    max_steps : int, default 3000
        Upper bound on the number of environment steps in the episode.

    Returns
    -------
    tuple
        ``(states, actions, total_reward)`` where ``states[k]`` is the
        observation in which ``actions[k]`` was taken.
    """
    try:
        s, _ = env.reset()
    except IndexError:
        # The reset failed; rebuild the environment and retry once.
        # Only the local name is rebound here, so the caller's `env`
        # still refers to the closed instance afterwards.
        env.close()
        env = gym.make("Parkme")
        s, _ = env.reset()

    # Read the action count from the environment itself rather than from a
    # notebook-level global.
    n_actions = env.action_space.n

    states, actions = [], []
    total_reward = 0
    for _ in range(max_steps):
        if random.random() > exploration_rate:
            action = np.random.choice(n_actions)
        else:
            probs = agent.predict_proba([s])[0]
            action = np.random.choice(n_actions, p=probs)
        new_s, r, terminated, truncated, _ = env.step(action)
        states.append(s)
        actions.append(action)
        total_reward += r

        s = new_s
        # Stop on either end-of-episode signal; continuing to step after a
        # truncation would act on an episode the environment has ended.
        if terminated or truncated:
            break
    return states, actions, total_reward

In [None]:
def select_elites(states_batch, actions_batch, rewards_batch, percentile):
    """Collect the states and actions of the best-rewarded sessions.

    A session counts as elite when its reward is greater than or equal to the
    given percentile of ``rewards_batch``. The per-session lists of the elite
    sessions are concatenated in session order.

    Parameters
    ----------
    states_batch : sequence of sequences
        Visited states, one inner sequence per session.
    actions_batch : sequence of sequences
        Chosen actions, aligned with ``states_batch``.
    rewards_batch : sequence of numbers
        Total reward of each session.
    percentile : float
        Percentile (0-100) used as the reward cut-off.

    Returns
    -------
    tuple of lists
        ``(elite_states, elite_actions)`` as two flat lists.
    """
    cutoff = np.percentile(rewards_batch, percentile)
    elite_idx = [
        k for k in range(len(states_batch)) if rewards_batch[k] >= cutoff
    ]
    elite_states = [state for k in elite_idx for state in states_batch[k]]
    elite_actions = [action for k in elite_idx for action in actions_batch[k]]
    return elite_states, elite_actions

In [None]:
from IPython.display import clear_output


def show_progress(rewards_batch, log, percentile, reward_range=(-990, +10)):
    """Print and plot training progress for the latest batch of sessions.

    Parameters
    ----------
    rewards_batch : sequence of numbers
        Total reward of each session in the current batch.
    log : list
        Running history; ``[mean_reward, threshold]`` for this batch is
        appended to it in place.
    percentile : float
        Percentile (0-100) of ``rewards_batch`` reported as the threshold.
    reward_range : pair of numbers, default (-990, 10)
        Lower and upper bound of the reward histogram.

    Side effects
    ------------
    Clears the cell output, prints the summary line and shows a two-panel
    figure: the history of means and thresholds, and a histogram of the
    current rewards with the threshold marked in red.
    """
    mean_reward = np.mean(rewards_batch)
    threshold = np.percentile(rewards_batch, percentile)
    log.append([mean_reward, threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f" % (mean_reward, threshold))

    # Transpose the history once instead of once per curve.
    mean_history, threshold_history = zip(*log)

    fig, (history_ax, hist_ax) = plt.subplots(1, 2, figsize=(8, 4))

    history_ax.plot(mean_history, label="Mean rewards")
    history_ax.plot(threshold_history, label="Reward thresholds")
    history_ax.legend()
    history_ax.grid()

    hist_ax.hist(rewards_batch, range=reward_range)
    # Reuse the threshold computed above rather than recomputing the percentile.
    hist_ax.vlines([threshold], [0], [100], label="percentile", color="red")
    hist_ax.legend()
    hist_ax.grid()

    plt.show()

In [None]:
import pickle

# Write the current agent to disk; load_agent(path) can read this file back.
with open("models_bin/CEM.pkl", "wb") as model_file:
    pickle.dump(agent, model_file)

In [None]:
# Sessions whose reward reaches this percentile of the batch count as elite;
# the training loop runs for n_epochs iterations.
percentile, n_epochs = 85, 80

# One [mean_reward, threshold] entry is appended per epoch by show_progress().
log = list()

# Extreme starting values for the running best / worst epoch mean.
best_reward, worst_reward = -1e12, 1e12

In [None]:
%%time
from tqdm import tqdm
from joblib import parallel_backend

# Number of sessions sampled with the current agent in every epoch.
n_sessions = 100

for epoch in tqdm(range(n_epochs)):
    sessions = [generate_session(env, agent) for _ in range(n_sessions)]
    states_batch, actions_batch, rewards_batch = zip(*sessions)
    mean = np.mean(rewards_batch)

    # Checkpoint BEFORE updating the agent: `mean` was earned by the current
    # weights, and partial_fit below modifies the agent, so saving afterwards
    # would store a different model than the one that achieved this score.
    if mean > best_reward:
        with open("models_bin/CEM.pkl", "wb") as model:
            pickle.dump(agent, model)
        best_reward = mean
    worst_reward = min(worst_reward, mean)

    elite_states, elite_actions = select_elites(
        states_batch, actions_batch, rewards_batch, percentile
    )
    # NOTE(review): confirm that this joblib context has any effect on
    # partial_fit; it is kept as in the original.
    with parallel_backend("loky", n_jobs=-1):
        agent = agent.partial_fit(elite_states, elite_actions)
        # NOTE(review): extra_elit_states / extra_elit_actions are not defined
        # in any visible cell; on a fresh kernel this line raises NameError.
        # Define them in an earlier cell or drop this call.
        agent = agent.partial_fit(extra_elit_states, extra_elit_actions)

    # The histogram range spans the worst and best epoch means seen so far.
    show_progress(
        rewards_batch, log, percentile, reward_range=[worst_reward, best_reward]
    )

In [None]:
# Highest per-epoch mean reward recorded by the training loop.
print(best_reward)