Vocabulary

In [None]:
import nltk
nltk.download('words')
from nltk.corpus import words

# Build the guess/answer vocabulary: every purely alphabetic five-letter entry
# of the NLTK word list, lower-cased and de-duplicated.
# sorted() gives the list a fixed order. Iterating a set of strings depends on
# string hashing, which can change between interpreter sessions, so without the
# sort the word -> index mapping derived from this list would not be reproducible.
vocab = sorted({w.lower() for w in words.words() if len(w) == 5 and w.isalpha()})
print('Length of vocabulary -', len(vocab))

Length of vocabulary - 10


[nltk_data] Downloading package words to /home/plaksha/nltk_data...
[nltk_data]   Package words is already up-to-date!


Wordle Environment

In [8]:
def word_gen_fn():
    """Return one word drawn at random from the module-level ``vocab`` list."""
    from random import choice

    return choice(vocab)

class WordleEnv:
    """Minimal Wordle game used as a reinforcement-learning environment.

    Usage: call ``reset()`` to draw a new hidden answer, then call ``step(guess)``
    until it reports ``done``. ``history`` holds the ``(guess, feedback)`` pairs
    played so far in the current game.

    Args:
        word_gen_fn: Zero-argument callable returning the hidden answer word.
        max_turns: Number of guesses after which a game ends (default 6).
    """

    def __init__(self, word_gen_fn, max_turns=6):
        self.word_gen_fn = word_gen_fn
        self.max_turns = max_turns
        # Defined up front so `history` is always readable and `step` can tell
        # that `reset` has not been called yet.
        self.answer = None
        self.history = []

    def reset(self):
        """Start a new game: draw a fresh answer, clear the history, return ""."""
        self.answer = self.word_gen_fn()
        self.history = []
        return ""  # empty state initially

    def _generate_feedback(self, guess, answer):
        """Return the colour string for ``guess`` against ``answer``.

        One symbol per letter: "🟩" right letter in the right place, "🟨" letter
        occurs elsewhere in the answer (each answer letter is consumed at most
        once, so repeated letters are not over-counted), "⬛" otherwise.

        Raises:
            ValueError: if ``guess`` and ``answer`` differ in length.
        """
        if len(guess) != len(answer):
            raise ValueError(
                f"guess {guess!r} must have {len(answer)} letters to match the answer"
            )

        feedback = ["⬛"] * len(answer)
        unmatched = {}  # answer letter -> count of occurrences not matched as green

        # First pass: correct letters (green); tally the answer letters left over
        for i, (g, a) in enumerate(zip(guess, answer)):
            if g == a:
                feedback[i] = "🟩"
            else:
                unmatched[a] = unmatched.get(a, 0) + 1

        # Second pass: present letters (yellow), left to right, consuming the tally
        for i, g in enumerate(guess):
            if feedback[i] != "🟩" and unmatched.get(g, 0) > 0:
                feedback[i] = "🟨"
                unmatched[g] -= 1

        return "".join(feedback)

    def step(self, guess):
        """Play ``guess`` and return ``(feedback, done, total_reward)``.

        The game is done when the guess equals the answer or ``max_turns``
        guesses have been played. The reward is 0.1 per green plus 0.05 per
        yellow; a solving guess additionally earns ``1 / (1 + turns_used)`` and
        a flat 1.0.

        Raises:
            RuntimeError: if called before ``reset()``.
            ValueError: if ``guess`` does not have the answer's length.
        """
        if self.answer is None:
            raise RuntimeError("reset() must be called before step()")

        feedback = self._generate_feedback(guess, self.answer)
        self.history.append((guess, feedback))
        solved = guess == self.answer
        done = solved or len(self.history) >= self.max_turns

        # Reward shaping
        total_reward = (
            0.1 * feedback.count("🟩") +
            0.05 * feedback.count("🟨")
        )
        if solved:
            total_reward += 1 / (1 + len(self.history))  # earlier solves earn more
            total_reward += 1.0  # binary success bonus

        return feedback, done, total_reward


Policy Network 

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WordlePolicy(nn.Module):
    """GRU policy mapping a tokenized game history to log-probabilities.

    Args:
        vocab_size: Number of output scores. The embedding table has
            ``vocab_size + 1`` rows, i.e. one token id beyond the outputs
            (this notebook uses that extra id, ``vocab_size``, for START).
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        # One embedding row more than there are outputs: the extra id is an
        # input-only token and is never scored by `fc`.
        self.embed = nn.Embedding(num_embeddings=vocab_size + 1, embedding_dim=embed_dim)
        self.rnn = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, state_seq):
        """Return log-softmax scores for ``state_seq`` (token ids of the guess/feedback history)."""
        embedded = self.embed(state_seq)
        _, final_hidden = self.rnn(embedded)
        # The GRU has a single layer, so the last entry along the leading
        # dimension is the only one.
        scores = self.fc(final_hidden[-1])
        return F.log_softmax(scores, dim=-1)

Tokenizer + Vocab Indexing 

In [10]:
# Vocab mappings: word -> integer id (its position in `vocab`) and the reverse lookup
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# Simple tokenizer for (guess, feedback) history
def tokenize_history(history):
    """Encode a list of ``(guess, feedback)`` pairs as a ``(1, seq_len)`` LongTensor.

    Token id layout, with ``n = len(vocab)``:
      * ``0 .. n-1``  word ids taken from ``word2idx``
      * ``n, n+1, n+2``  the feedback colours ⬛, 🟨, 🟩
      * ``n+3``  START, always the first token

    Each pair contributes the guess's word id followed by one colour id per
    feedback symbol, so an empty history yields just ``[[START]]``.
    """
    n_words = len(vocab)
    color_token = {"⬛": n_words, "🟨": n_words + 1, "🟩": n_words + 2}
    start_token = n_words + 3

    ids = [start_token]
    for guess, feedback in history:
        ids.append(word2idx[guess])
        ids += [color_token[mark] for mark in feedback]

    # Wrapping in an outer list adds the leading batch dimension of size 1
    return torch.tensor([ids], dtype=torch.long)

Training Loop

In [11]:
# Build the policy, optimizer and environment.
# vocab_size covers the word ids plus the three feedback-colour ids; WordlePolicy
# adds one more embedding row, which the tokenizer uses for the START token.
policy = WordlePolicy(vocab_size=len(vocab) + 3, embed_dim=32, hidden_dim=64)
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3)
env = WordleEnv(word_gen_fn)

num_episodes = 50000
print_every = 100
total_rewards = []  # last-step reward of every episode, for the running average

for episode in range(num_episodes):
    env.reset()
    log_probs = []
    reward = 0

    # Take the turn budget from the environment instead of repeating the
    # literal 6, so this loop cannot drift from WordleEnv.max_turns.
    for turn in range(env.max_turns):
        state_tokens = tokenize_history(env.history)
        log_action_probs = policy(state_tokens)[..., :len(vocab)]  # restrict to words only

        # Categorical renormalises the sliced scores over the word ids only
        action_dist = torch.distributions.Categorical(logits=log_action_probs)

        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)

        guess_word = idx2word[action.item()]
        feedback, done, reward = env.step(guess_word)

        log_probs.append(log_prob)
        if done:
            break

    # REINFORCE loss: negative log-prob × reward.
    # `reward` is overwritten on every turn, so only the reward returned by the
    # episode's last step weights the summed log-probabilities.
    loss = -torch.stack(log_probs).sum() * reward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_rewards.append(reward)

    if (episode + 1) % print_every == 0:
        avg_reward = sum(total_rewards[-print_every:]) / print_every
        print(f"Episode {episode + 1}, Avg Reward: {avg_reward:.3f}")


Episode 100, Avg Reward: 0.934
Episode 200, Avg Reward: 0.826
Episode 300, Avg Reward: 0.798
Episode 400, Avg Reward: 0.834
Episode 500, Avg Reward: 0.930
Episode 600, Avg Reward: 0.869
Episode 700, Avg Reward: 0.888
Episode 800, Avg Reward: 0.833
Episode 900, Avg Reward: 1.195
Episode 1000, Avg Reward: 0.898
Episode 1100, Avg Reward: 0.841
Episode 1200, Avg Reward: 0.845
Episode 1300, Avg Reward: 0.989
Episode 1400, Avg Reward: 0.957
Episode 1500, Avg Reward: 0.932
Episode 1600, Avg Reward: 0.884
Episode 1700, Avg Reward: 1.109
Episode 1800, Avg Reward: 1.122
Episode 1900, Avg Reward: 1.085
Episode 2000, Avg Reward: 1.325
Episode 2100, Avg Reward: 1.305
Episode 2200, Avg Reward: 1.235
Episode 2300, Avg Reward: 1.140
Episode 2400, Avg Reward: 1.257
Episode 2500, Avg Reward: 1.291
Episode 2600, Avg Reward: 1.315
Episode 2700, Avg Reward: 1.466
Episode 2800, Avg Reward: 1.533
Episode 2900, Avg Reward: 1.376
Episode 3000, Avg Reward: 1.288
Episode 3100, Avg Reward: 1.430
Episode 3200, Avg

In [16]:
def test_agent(policy, env, vocab, idx2word, word2idx, episodes=100):
    policy.eval()
    successes = 0
    total_turns = 0
    total_rewards = []

    with torch.no_grad():
        for _ in range(episodes):
            env.reset()
            for turn in range(6):
                state_tokens = tokenize_history(env.history)
                if state_tokens.shape[1] == 0:
                    state_tokens = torch.tensor([[len(vocab) + 3]], dtype=torch.long)  # START token

                log_action_probs = policy(state_tokens)[..., :len(vocab)]
                action_dist = torch.distributions.Categorical(logits=log_action_probs)
                action = action_dist.sample()
                guess_word = idx2word[action.item()]

                feedback, done, reward = env.step(guess_word)

                if done:
                    if guess_word == env.answer:
                        successes += 1
                        total_turns += len(env.history)
                    total_rewards.append(reward)
                    break

    avg_reward = sum(total_rewards) / episodes
    avg_turns = total_turns / successes if successes > 0 else None
    print(f"\n--- Test Results ---")
    print(f"Success rate: {successes}/{episodes} ({successes/episodes:.2%})")
    print(f"Avg reward: {avg_reward:.3f}")
    if avg_turns is not None:
        print(f"Avg turns (successful games): {avg_turns:.2f}")
    else:
        print("No games were solved.")

    policy.train()  # Reset back to training mode


In [17]:
# Evaluate the trained policy on 100 freshly drawn games
test_agent(
    policy=policy,
    env=env,
    vocab=vocab,
    idx2word=idx2word,
    word2idx=word2idx,
    episodes=100,
)


--- Test Results ---
Success rate: 98/100 (98.00%)
Avg reward: 1.769
Avg turns (successful games): 2.59
