# Environment
First we need to describe the environment in which the model is going to be trained (relevant informations to do so [here](https://gymnasium.farama.org/tutorials/gymnasium_basics/environment_creation/))

In [1]:
import numpy as np
import random
import itertools as it
from sympy.combinatorics import Permutation
import gymnasium as gym
from gymnasium import spaces



COLOR_MAP = {
    "W" :0,
    "G" :1,
    "R" :2,
    "B" :3,
    "O" :4,
    "Y" :5
    }

FACES = ["WWWW", "GGGG", "RRRR", "BBBB", "OOOO", "YYYY"]
SOLVED_STATE_COLOR = [''.join(faces) for faces in it.permutations(FACES)]
SOLVED_STATE_INDEX = np.empty((720,24))

for i,s in enumerate(SOLVED_STATE_COLOR):
    for j,c in enumerate(s):
        SOLVED_STATE_INDEX[i,j] = COLOR_MAP[c]

class Cube2x2Env(gym.Env):

    def __init__(self):
        self.move_count = 0
        self._action_to_move = {
            0: Permutation(23)(2, 19, 21, 8)(3, 17, 20, 10)(4, 6, 7, 5),
            1: Permutation(0, 18, 23, 9)(1, 16, 22, 11)(12, 13, 15, 14),
            2: Permutation(1, 5, 21, 14)(3, 7, 23, 12)(8, 10, 11, 9),
            3: Permutation(23)(0, 4, 20, 15)(2, 6, 22, 13)(16, 17, 19, 18),
            4: Permutation(6, 18, 14, 10)(7, 19, 15, 11)(20, 22, 23, 21),
            5: Permutation(23)(0, 1, 3, 2)(4, 16, 12, 8)(5, 17, 13, 9)
        }

        self.steps_from_solved = 1
        
        self.action_space = spaces.Discrete(6)
        self.observation_space = spaces.Box(0, 5, shape=(24,), dtype=np.uint8)
   
        self.state = self.scramble()

    def step(self, action):
        truncated = False
        move = self._action_to_move[action]
        self.state = move(self.state)
        self.move_count += 1
        # Calculate reward
        if np.any(np.all(self.state == SOLVED_STATE_INDEX, axis=1)): 
            reward = 1000
            done = True
        else: 
            done = False
            reward = -1 
        
        if self.move_count > 1000:
            truncated = True
        # Return step information
        return np.array(self.state), reward, done, truncated, {}
    
    def scramble(self):
        state = SOLVED_STATE_INDEX[0]
        for i in range(self.steps_from_solved):
            move = self._action_to_move[random.randint(0,5)]
            state = move(state)
        if np.any(np.all(state == SOLVED_STATE_INDEX, axis=1)):
            self.scramble()
        return np.array(state, dtype=np.uint8)
    
    def reset(self, seed=None):
        # Reset shower temperature
        self.state = self.scramble()
        # Reset shower time
        self.move_count = 0 
        return self.state, {}


        

    def __repr__(self):
        ascii = '''
         +--------+                    
         | {0}    {1} |                    
         |   d1   |                    
         | {2}    {3} |                    
+--------+--------+--------+--------+  
| {16}    {17} | {4}    {5} | {8}    {9} | {12}    {13} |  
|   r1   |   f0   |   r0   |   f1   |  
| {18}    {19} | {6}    {7} | {10}    {11} | {14}    {15} |  
+--------+--------+--------+--------+  
         | {20}    {21} |                    
         |   d0   |                    
         | {22}    {23} |                    
         +--------+                    

      '''
        return ascii.format(*self.state)

# Model
We can initialize a new model

In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
env = Cube2x2Env()
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log='logs/')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  from .autonotebook import tqdm as notebook_tqdm


Or start from a previous version by loading the corresponding file (This one comes from a previous version of this notebook)

In [4]:
model = PPO.load("cube-solver-1.0.zip", env, tensorboard_log='logs/',  learning_rate = 0.00003)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Since solving the 2x2 cube from any state is way too hard for the model at first, my idea is to train the model for 1e6 iterations at n-step from the start, starting from 1 to 20. We save the model after each loop. So far, this (quite naive) idea has not been terrible. While the method is slow, it is getting increasingly better results. 

In [5]:
for i in range(5):
    env.steps_from_solved = i
    model.learn(total_timesteps=10000000)
    model.save('cube-solver-1.' + i)

Logging to logs/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -1e+03   |
| time/              |          |
|    fps             | 690      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------


In [14]:
env.steps_from_solved = 11
obs = env.reset()[0];obs

array([2, 0, 5, 0, 1, 3, 5, 1, 4, 2, 0, 4, 3, 5, 5, 0, 3, 2, 1, 1, 4, 2,
       4, 3], dtype=uint8)

In [15]:
dones = False
steps = 0
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, _, info = env.step(action.item())
    steps += 1 
print(steps)

5231
