In [10]:
# Evolutionary Agent for LunarLander-v3 using Gymnasium (Improved Version)

!pip install gymnasium[box2d] pygame pyvirtualdisplay > /dev/null 2>&1

import gymnasium as gym
from gymnasium.wrappers import RecordVideo

import numpy as np
import matplotlib.pyplot as plt
import random
import time
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display



In [11]:
# Виртуальный дисплей для рендера в Colab
display = Display(visible=0, size=(1400, 900))
display.start()

def show_video():
    mp4list = glob.glob('video/*.mp4')
    if mp4list:
        with open(mp4list[0], 'rb') as f:
            encoded = base64.b64encode(f.read())
        return HTML(data=f"<video autoplay loop controls style='height: 400px;'>\n"
                         f"<source src='data:video/mp4;base64,{encoded.decode('ascii')}' type='video/mp4' />\n"
                         "</video>")
    else:
        print("Видео не найдено.")



In [12]:
def wrap_env(env):
    return RecordVideo(env, './video')

def normalize_obs(obs):
    return obs / (np.linalg.norm(obs) + 1e-8)

def countValue(popul, env, normalize=True, n_repeat=3):
    reward_list = []
    for bot in popul:
        rewards = []
        for _ in range(n_repeat):
            observation, _ = env.reset()
            done = False
            total_reward = 0
            action = 0
            while not done:
                observation, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                total_reward += reward
                obs = normalize_obs(observation) if normalize else observation
                result = np.dot(obs, bot)
                action = np.argmax(result)
            rewards.append(total_reward)
        reward_list.append(np.mean(rewards))
    return reward_list

def getSurvPopul(popul, val, nsurv, reverse=True):
    indexed = sorted(enumerate(val), key=lambda x: x[1], reverse=reverse)
    elite_indexes = [i for i, _ in indexed[:nsurv]]
    newpopul = [popul[i] for i in elite_indexes]
    sorted_vals = [val[i] for i in elite_indexes]
    return newpopul, sorted_vals

def getParents(curr_popul, nsurv):
    return random.choice(curr_popul[:nsurv]), random.choice(curr_popul[:nsurv])

def crossPointFrom2Parents(botp1, botp2, idx):
    return botp1[idx] if random.random() < 0.5 else botp2[idx]



In [13]:
# === Инициализация ===
env = wrap_env(gym.make('LunarLander-v3', render_mode="rgb_array"))
numBots = 40
popul = [np.random.random((8, 4)) for _ in range(numBots)]

# === Параметры эволюции ===
nsurv = 20
nnew = numBots - nsurv
epochs = 100
mut = 0.4
eph_change_mut = [30, 60, 80, 90]
new_mut = [0.3, 0.25, 0.2, 0.15]
total = []
best_score = -float('inf')
best_bot = None
no_improve_epochs = 0

curr_time = time.time()
for it in range(epochs):
    if it in eph_change_mut:
        mut = new_mut[eph_change_mut.index(it)]
        print(f"Смена мутации на {mut}\n")

    val = countValue(popul, env, normalize=False)
    newpopul, sval = getSurvPopul(popul, val, nsurv)
    print(f"Эпоха {it}, время: {time.time() - curr_time:.2f} c, топ-3: {sval[:3]}")
    total.append(sval[0])
    curr_time = time.time()

    if sval[0] > best_score:
        best_score = sval[0]
        best_bot = newpopul[0]
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1

    if no_improve_epochs >= 15:
        mut = max(mut * 0.9, 0.1)
        no_improve_epochs = 0

    elite = newpopul[:2]
    children = []
    for _ in range(nnew):
        botp1, botp2 = getParents(newpopul, nsurv)
        newbot = []
        for j in range(len(botp1)):
            gene = crossPointFrom2Parents(botp1, botp2, j).copy()
            for t in range(len(gene)):
                if random.random() < mut:
                    gene[t] += random.gauss(0, 0.1)
            newbot.append(gene)
        children.append(np.array(newbot))
    popul = np.array(elite + children)



  logger.warn(


Эпоха 0, время: 5.27 c, топ-3: [np.float64(-136.3967419469433), np.float64(-150.31964109819083), np.float64(-153.3016678859011)]
Эпоха 1, время: 1.48 c, топ-3: [np.float64(-121.2930792941974), np.float64(-124.81850617797748), np.float64(-143.69457901319515)]
Эпоха 2, время: 1.54 c, топ-3: [np.float64(-142.42605603360172), np.float64(-146.17495546579562), np.float64(-156.29643252581602)]
Эпоха 3, время: 1.01 c, топ-3: [np.float64(-122.09705983183301), np.float64(-136.085626246801), np.float64(-136.65751841278492)]
Эпоха 4, время: 1.76 c, топ-3: [np.float64(-87.54034054491076), np.float64(-121.72311573559773), np.float64(-125.27862303876616)]
Эпоха 5, время: 0.48 c, топ-3: [np.float64(-12.040455128944046), np.float64(-79.19956284470862), np.float64(-102.34807948551945)]
Эпоха 6, время: 1.27 c, топ-3: [np.float64(-115.30559249600269), np.float64(-117.08725760089514), np.float64(-117.58222666180647)]
Эпоха 7, время: 0.40 c, топ-3: [np.float64(-69.22374813129056), np.float64(-91.28320452119

In [None]:
# === График ===
plt.plot(total)
plt.title("Эволюция лучшего вознаграждения")
plt.xlabel("Эпоха")
plt.ylabel("Вознаграждение")
plt.grid(True)
plt.show()



In [None]:
# === Тест финального лучшего бота ===
observation, _ = env.reset()
action = 0
done = False
while not done:
    observation, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    result = np.dot(observation, best_bot)
    action = np.argmax(result)

env.close()
show_video()
