In [45]:
import gymnasium as gym
from gymnasium import Env 
from gymnasium.spaces import MultiDiscrete, Discrete
# from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import SubprocVecEnv
import numpy as np
import torch 
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, CallbackList
from stable_baselines3.common.vec_env import VecTransposeImage, DummyVecEnv

In [46]:
# Symbol table used to turn a Hangman board string into integer ids and back.
# Position in `chars` is the id: ' ' (padding) -> 0, '#' (blank) -> 1,
# 'a'..'z' -> 2..27, ',' -> 28, '_' -> 29.
chars = list(' #abcdefghijklmnopqrstuvwxyz,_')

stoi = dict(zip(chars, range(len(chars))))  # string -> integer
itos = dict(enumerate(chars))               # integer -> string


def encode(s):
    """Return the integer ids of `s` after right-padding it with spaces to 40 characters.

    Strings longer than 40 characters are not truncated; every character must
    be a key of `stoi`, otherwise a KeyError is raised.
    """
    padded = s.ljust(40)
    return [stoi[c] for c in padded]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return ''.join(itos[i] for i in l)


# Smoke test: round-trip a sample string and show the lookup table.
sample = 'hi#there,a'
print(encode(sample))
print(decode(encode(sample)))
print(stoi)

[9, 10, 1, 21, 9, 6, 19, 6, 28, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
hi#there,a                              
{' ': 0, '#': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, ',': 28, '_': 29}


In [48]:

class HangmanServer(Env):
    """Hangman as a Gymnasium environment.

    Each episode plays one word from 'words_250000_train.txt'.  The board is
    a list of characters: '#' for every unrevealed letter, the letter itself
    once guessed, followed by a single ','.  Observations are that board run
    through the module-level `encode` (padded to `OBS_LEN` integer ids).

    Actions are integers 0..25 meaning 'a'..'z'.  A guess that reveals at
    least one blank earns reward 1; any other guess earns 0 and costs one of
    `MAX_WRONG_GUESSES` lives.  Revealing the last blank earns reward 10.

    Args:
        auto_reset: when True (default, the historical behaviour) `step`
            calls `reset` itself as soon as an episode ends and returns the
            first observation of the next word, so a plain rollout loop never
            has to call `reset`.  NOTE(review): wrappers that also reset
            after a finished episode will then simply skip one word; pass
            False to get the usual "caller resets" contract.
    """

    MAX_WRONG_GUESSES = 6
    OBS_LEN = 40      # must match the padding width used by `encode`
    N_SYMBOLS = 29    # ids 0..28: ' ', '#', 'a'..'z', ','

    def __init__(self, auto_reset=True):
        with open('words_250000_train.txt') as f:
            words = [line.strip() for line in f.read().splitlines()]
        # Drop blank lines (an empty word would be an instant win) and words
        # whose board (letters + ',') would not fit in the observation.
        self.test_words = [w for w in words if w and len(w) < self.OBS_LEN]
        if not self.test_words:
            raise ValueError("words_250000_train.txt contains no usable words")

        # Per-instance generator seeded from fresh entropy, so environments
        # built in forked worker processes do not share one shuffle order.
        self._rng = np.random.default_rng()
        self.shuffle()

        self.action_space = Discrete(26)
        self.observation_space = MultiDiscrete([self.N_SYMBOLS] * self.OBS_LEN)
        self.auto_reset = auto_reset
        self.itr = 0
        self.state = self._blank_state()
        self.rem_guesses = self.MAX_WRONG_GUESSES

    def _blank_state(self):
        """Return a fresh all-blank board for the current word."""
        return list('#' * len(self.test_words[self.itr]) + ',')

    def render(self):
        """Rendering is not implemented; pass `out=True` to `step` to print the board."""
        pass

    def shuffle(self):
        """Shuffle the word list in place using this environment's generator."""
        self._rng.shuffle(self.test_words)

    def reset(self, seed = None, options = None):
        """Advance to the next word and return `(observation, info)`.

        Args:
            seed: optional integer.  When given, the generator is re-seeded
                and the word list is sorted and reshuffled, so the sequence
                of words that follows is reproducible.
            options: accepted for Gymnasium API compatibility; unused.
        """
        if seed is not None:
            self._rng = np.random.default_rng(seed)
            self.test_words.sort()   # fixed starting order => reproducible shuffle
            self.shuffle()
            self.itr = -1            # the increment below lands on word 0

        self.itr += 1
        if self.itr >= len(self.test_words):
            # Word list exhausted: start another pass in a new order.
            self.shuffle()
            self.itr = 0

        self.rem_guesses = self.MAX_WRONG_GUESSES
        self.state = self._blank_state()
        return encode("".join(self.state)), {}

    def step(self, action = None, out = False):
        """Apply one letter guess.

        Args:
            action: integer (or size-1 NumPy array) in 0..25; None guesses a
                uniformly random letter.
            out: when True, print the board and the guessed letter.

        Returns:
            `(observation, reward, terminated, truncated, info)`.
            `truncated` is always False because episodes only end by win or
            by running out of guesses.  With `auto_reset` enabled, the
            observation returned on termination is the first observation of
            the next word.
        """
        if action is None:
            letter = chr(int(self._rng.integers(ord('a'), ord('z') + 1)))
        else:
            # .item() accepts Python ints, NumPy scalars and size-1 arrays.
            letter = chr(int(np.asarray(action).item()) + ord('a'))

        word = self.test_words[self.itr]
        hit = False
        for pos, ch in enumerate(word):
            if self.state[pos] == '#' and ch == letter:
                self.state[pos] = letter
                hit = True

        reward = 1 if hit else 0
        if not hit:
            self.rem_guesses -= 1

        if out:
            print("".join(self.state), letter)

        solved = '#' not in self.state
        terminated = solved or self.rem_guesses <= 0
        if solved:
            reward = 10

        if terminated and self.auto_reset:
            observation, _ = self.reset()
        else:
            observation = encode("".join(self.state))
        return observation, reward, terminated, False, {}
            


        
# Build one environment instance at module level.  The commented-out loop
# below steps it with random guesses (step() with no action) and prints the
# board, the done flag and the encoded observation.
env = HangmanServer()



# for itr in range(20):
#     state, reward, done, _, _ = env.step(out = True)
#     print(done)
#     print(state)


In [53]:
import os 
log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)
num_envs = 8


def make_env(rank):
    """Return a zero-argument factory for the monitored environment of worker `rank`.

    Each worker gets its own Monitor file prefix (<log_dir>/<rank>), so the
    parallel environments do not all write their episode statistics to the
    same file, which is what happens when every Monitor is handed the bare
    log directory.
    """
    def _init():
        return Monitor(HangmanServer(), os.path.join(log_dir, str(rank)))
    return _init


vec_env = SubprocVecEnv([make_env(rank) for rank in range(num_envs)], start_method='fork')
# for itr in range(2):
#     state, reward, done, _ = vec_env.step([1]*2)
#     print(done)
#     print(state)

In [50]:
# Network configuration passed to the recurrent policy: separate 256 -> 128
# ReLU heads for the actor (pi) and the critic (vf), and a single 128-unit
# LSTM layer shared between them (hence no dedicated critic LSTM).
policy_kwargs = {
    "activation_fn": torch.nn.ReLU,
    "net_arch": {"pi": [256, 128], "vf": [256, 128]},
    "lstm_hidden_size": 128,
    "n_lstm_layers": 1,
    "shared_lstm": True,
    "enable_critic_lstm": False,
}
# model = RecurrentPPO(
#     "MlpLstmPolicy",
#     vec_env,
#     policy_kwargs=policy_kwargs,
#     learning_rate=1e-4,
#     n_steps=256,
#     batch_size=128,
#     n_epochs=4,
#     gamma=0.99,
#     gae_lambda=0.95,
#     clip_range=0.2,
#     ent_coef=0.01,
#     vf_coef=0.5,
#     max_grad_norm=0.5,
#     verbose=0)
# # Train the agent
# model.learn(total_timesteps=10)

In [54]:
# del model
# Recurrent PPO agent trained on the vectorised Hangman environments.
model = RecurrentPPO(
    "MlpLstmPolicy",
    vec_env,
    policy_kwargs=policy_kwargs,
    learning_rate=1e-5,
    n_steps=256,
    batch_size=128,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    verbose=0,
    tensorboard_log=log_dir)

# model.set_parameters(load_path_or_dict="ppo_recurrent")

# Separate single-process environment used only for periodic evaluation.
# It gets its own Monitor file prefix (<log_dir>/eval) so its episode
# statistics are not written to the same file as the training environments'.
eval_env = DummyVecEnv([lambda: Monitor(HangmanServer(), os.path.join(log_dir, "eval"))])
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=log_dir,
    log_path=log_dir,
    # eval_freq is divided by num_envs here so that an evaluation runs
    # about every 5000 environment timesteps in total.
    eval_freq=5000//num_envs,
    deterministic=True,
    render=False,
    verbose=1,
    n_eval_episodes = 100
    
)

# 5. Create callback list (add more callbacks if needed)
callbacks = CallbackList([eval_callback])



In [55]:
# 6. Run training; the callback list triggers the periodic evaluation and
#    the run is logged to TensorBoard under the name "ppo_recurrent".
TOTAL_TIMESTEPS = 200000
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=callbacks, tb_log_name="ppo_recurrent")



Eval num_timesteps=5000, episode_reward=0.74 +/- 0.48
Episode length: 6.74 +/- 0.48
New best mean reward!
Eval num_timesteps=10000, episode_reward=0.71 +/- 0.45
Episode length: 6.71 +/- 0.45
Eval num_timesteps=15000, episode_reward=0.66 +/- 0.47
Episode length: 6.66 +/- 0.47
Eval num_timesteps=20000, episode_reward=0.67 +/- 0.47
Episode length: 6.67 +/- 0.47
Eval num_timesteps=25000, episode_reward=0.70 +/- 0.46
Episode length: 6.70 +/- 0.46
Eval num_timesteps=30000, episode_reward=0.70 +/- 0.46
Episode length: 6.70 +/- 0.46
Eval num_timesteps=35000, episode_reward=0.61 +/- 0.49
Episode length: 6.61 +/- 0.49
Eval num_timesteps=40000, episode_reward=0.60 +/- 0.49
Episode length: 6.60 +/- 0.49
Eval num_timesteps=45000, episode_reward=0.72 +/- 0.45
Episode length: 6.72 +/- 0.45
Eval num_timesteps=50000, episode_reward=0.63 +/- 0.48
Episode length: 6.63 +/- 0.48
Eval num_timesteps=55000, episode_reward=0.67 +/- 0.47
Episode length: 6.67 +/- 0.47
Eval num_timesteps=60000, episode_reward=0.6

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x149926c90>

In [32]:
# Save the current model under the name "ppo_recurrent" (the same name the
# commented-out load / set_parameters calls in this notebook refer to).
model.save("ppo_recurrent")

In [31]:
# Manual evaluation: play `ngames` games with the deterministic policy and
# report the win rate and the mean reward per game.  The loop never calls
# env.reset() itself; it relies on step() handing back the first observation
# of the next game whenever a game ends.
env = HangmanServer()
obs, _ = env.reset()

lstm_states = None      # no recurrent state before the first prediction
episode_starts = True   # tells the policy to start from a fresh LSTM state
wins = 0
games = 0
tot_reward = 0
ngames = 500

while games<ngames:
    action, lstm_states = model.predict(
        obs,
        state=lstm_states,
        episode_start=episode_starts,
        deterministic=True,
    )
    obs, reward, done, _, _ = env.step(action = action, out = False)

    tot_reward += reward
    episode_starts = done
    if done != 1:
        continue

    # A game just finished; a final reward of 10 marks a win.
    games += 1
    if reward == 10:
        wins += 1


print('Win Rate: ', wins/games)
print('Episodic Reward: ', tot_reward/games)


Win Rate:  0.092
Episodic Reward:  6.078


In [74]:
# model.save("ppo_recurrent")
# del model # remove to demonstrate saving and loading

# model = RecurrentPPO.load("ppo_recurrent")

In [42]:
# from stable_baselines3.common.evaluation import evaluate_policy

# mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=1000)
# print(f"Mean Reward: {mean_reward}, Std Reward: {std_reward}")