In [None]:
import numpy as np
import gym
from stable_baselines3 import PPO
import math
import itertools
import datetime

In [None]:
# Problem size: number of cities in the tour.
N = 5
# Fixed seed so the cost matrix is identical on every run.
rand = np.random.RandomState(1)
# Pairwise travel costs drawn from a normal distribution (mean 5, std 1.5),
# rounded to two decimals.
M = rand.normal(5, 1.5, size=(N, N)).round(2)
# Travelling from a city to itself costs nothing.
np.fill_diagonal(M, 0)
M

In [None]:
# Number of orderings of the N-1 cities other than city 0, i.e. how many
# candidate tours the exhaustive search over permutations has to score.
math.factorial(N-1)

In [None]:
# Exhaustive search: pin city 0 as the start, score every ordering of the
# remaining N-1 cities, and remember the cheapest closed tour seen so far.
x_min, g_min = [], 100000000
print(datetime.datetime.now())
for i, tail in enumerate(itertools.permutations(range(1, N), N-1)):
    x = (0,) + tail
    # Cost of the open path along the tour ...
    path_cost = np.sum(M[x[:-1], x[1:]])
    # ... plus the leg that returns from the last city to the first.
    g = path_cost + M[x[-1], x[0]]
    if g < g_min:
        x_min, g_min = x, g
    # Periodic progress report with the best tour found so far.
    if not i % 1000:
        print(i, x_min, g_min)
print(datetime.datetime.now())
print('Optimal solution:')
print(x_min, g_min)

In [None]:
class MyEnv(gym.Env):
    """Travelling-salesman environment over the module-level cost matrix ``M``.

    The tour starts at city 0. Action ``k`` (``0 <= k < N-1``) means "travel to
    city ``k + 1``". Observations are dicts with two entries:

    * ``'visited'`` -- binary flags for cities ``1 .. N-1``;
    * ``'last'``    -- index of the city the agent is currently in.

    The reward is the negative travel cost of the move. Choosing a city that
    was already visited leaves the state unchanged and yields
    ``REVISIT_PENALTY``. The episode ends once every city has been visited; on
    that step the cost of returning to city 0 is charged as well.

    Uses the classic gym API: ``reset()`` returns the observation and
    ``step()`` returns ``(obs, reward, done, info)``.
    """

    # Reward handed out when the agent picks an already visited city.
    REVISIT_PENALTY = -10

    def __init__(self):
        super().__init__()
        self.n = N-1
        self.action_space = gym.spaces.Discrete(self.n)
        self.observation_space = gym.spaces.Dict({
            'visited': gym.spaces.MultiBinary(self.n),
            'last': gym.spaces.Discrete(N)})

    def _observation(self):
        """Return a snapshot of the state that later steps cannot mutate."""
        return {'visited': self.state['visited'].copy(),
                'last': self.state['last']}

    def reset(self):
        """Start a new tour at city 0 with no other city visited."""
        # int8 matches the dtype of the declared MultiBinary space.
        self.state = {'visited': np.zeros(self.n, dtype=np.int8), 'last': 0}
        return self._observation()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``."""
        slot = int(action)   # index into the 'visited' flags
        city = slot + 1      # city index in the cost matrix
        if self.state['visited'][slot] == 1:
            # Rejected move: the agent stays where it is.
            self.reward = self.REVISIT_PENALTY
        else:
            self.state['visited'][slot] = 1
            self.reward = - M[self.state['last'], city]
            self.state['last'] = city
        if np.all(self.state['visited'] == 1):
            # Close the tour from the agent's actual position, not from the
            # raw action, so the charge stays right even for a rejected move.
            self.reward += - M[self.state['last'], 0]
            self.done = True
        else:
            self.done = False

        return self._observation(), self.reward, self.done, {}

In [None]:
# Single environment instance, used for both PPO training and the rollout.
env = MyEnv()

In [None]:
# Train a PPO agent on the tour environment. The two timestamps bracket the
# run so its wall-clock duration can be read off the output.
print(datetime.datetime.now())
# Seed the agent (the cost matrix is seeded too) so that policy
# initialisation and action sampling repeat from one run to the next.
model = PPO("MultiInputPolicy", env, verbose=1, seed=1)
model.learn(total_timesteps=20000)
print(datetime.datetime.now())

In [None]:
# Roll out the trained policy greedily and report the tour it produces.
obs = env.reset()
g = 0
actions = [0]  # cities in visiting order; the tour starts at city 0
for i in range(100000):
    action, _states = model.predict(obs, deterministic=True)
    city = int(action) + 1
    # The environment rejects a move to an already visited city, so such a
    # move must not be added to the tour or to its cost. Check the flag on the
    # observation the policy acted on, before stepping.
    revisit = obs['visited'][city - 1] == 1
    obs, reward, done, info = env.step(action)
    if revisit:
        # A rejected move leaves the observation unchanged, so the
        # deterministic policy would keep choosing the same city; stop here
        # instead of spinning through the remaining iterations.
        print(f"Policy revisited city {city} after {actions}; no complete tour found")
        break
    g += M[actions[-1], city]
    actions.append(city)
    if done:
        obs = env.reset()
        # Add the closing leg back to the start city.
        g += M[actions[-1], 0]
        print(f"{actions}, g = {g}")
        break
env.close()