In [4]:
# Read the word list: one entry per line, with only the trailing newline
# removed (other whitespace is kept as-is).
with open("w.txt") as f:
    lines = []
    for raw_line in f:
        lines.append(raw_line.rstrip('\n'))
lines

['aaa',
 'aaaaaa',
 'aaas',
 'aachen',
 'aaee',
 'aag',
 'aahed',
 'aahs',
 'aal',
 'aalesund',
 'aaliis',
 'aalst',
 'aam',
 'aandahl',
 'aao',
 'aapss',
 'aar',
 'aarau',
 'aardvark',
 'aardwolf',
 'aaren',
 'aargh',
 'aarika',
 'aaronic',
 'aaronite',
 'aaronsburg',
 'aarp',
 'aarrghh',
 'aas',
 'aasvogels',
 'aaup',
 'aavso',
 'aba',
 'ababdeh',
 'abac',
 'abacas',
 'abacaxi',
 'abaci',
 'abacination',
 'abaciscus',
 'aback',
 'abaco',
 'abacterial',
 'abactinally',
 'abactor',
 'abaculus',
 'abacuses',
 'abada',
 'abaddon',
 'abadengo',
 'abadite',
 'abaft',
 'abagail',
 'abailard',
 'abaised',
 'abaisse',
 'abaka',
 'abakas',
 'abalation',
 'abalienated',
 'abalienation',
 'abalones',
 'abamp',
 'abamperes',
 'abana',
 'abandon',
 'abandoned',
 'abandonee',
 'abandoners',
 'abandonment',
 'abandons',
 'abanet',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarticular',
 'abas',
 'abased',
 'abasedness',
 'abasements',
 'abasers',
 'abasgi',
 'abashed',
 'abashe

In [None]:
from transformers import BertForMaskedLM, BertConfig
import torch

# Define the character-level tokenizer
class CharacterTokenizer:
    """Character-level tokenizer for lowercase words with '_' blanks.

    Vocabulary layout (every token has its own id):
        'a'..'z' -> 0..25, '[MASK]' -> 26, '[UNK]' -> 27, '[PAD]' -> 28
    """

    def __init__(self):
        # Letters a-z occupy ids 0-25.
        self.vocab = {chr(i): i - 97 for i in range(97, 123)}
        self.vocab['[MASK]'] = len(self.vocab)  # 26
        # [UNK] must not share id 0 with 'a': a shared id makes every unknown
        # character encode as 'a' and makes id 0 decode to '[UNK]', so 'a'
        # could never be recovered from a prediction.
        self.vocab['[UNK]'] = len(self.vocab)  # 27
        self.vocab['[PAD]'] = len(self.vocab)  # 28

    def tokenize(self, word):
        """Split *word* into per-character tokens, mapping '_' to '[MASK]'."""
        return [char if char != '_' else '[MASK]' for char in word]

    def convert_tokens_to_ids(self, tokens):
        """Return the id of each token; tokens not in the vocab map to [UNK]."""
        unk_id = self.vocab['[UNK]']
        return [self.vocab.get(t, unk_id) for t in tokens]

    def convert_ids_to_tokens(self, ids):
        """Return the token for each id; ids not in the vocab map to '[UNK]'."""
        # Built per call so later changes to self.vocab are always reflected.
        inv_vocab = {v: k for k, v in self.vocab.items()}
        return [inv_vocab.get(i, '[UNK]') for i in ids]

# Module-level tokenizer instance; the code below reads its vocab for the
# padding id and for the model's vocabulary size.
tokenizer = CharacterTokenizer()

# Function to truncate or pad input to a fixed length
def pad_or_truncate(input_ids, max_len=15, pad_token_id=tokenizer.vocab['[PAD]']):
    """Return a new list of ids cut or right-padded towards *max_len*.

    Inputs longer than *max_len* are sliced to their first *max_len* ids;
    shorter (or equal-length) inputs get *pad_token_id* appended until they
    reach *max_len*. The input list itself is not modified.
    """
    shortfall = max_len - len(input_ids)
    if shortfall < 0:
        # Too long: keep only the leading ids.
        return input_ids[:max_len]
    # Short enough: append the missing number of padding ids (possibly zero).
    return input_ids + [pad_token_id] * shortfall

# Initialize BERT model configuration with vocab size and maximum word length
max_word_length = 15  # Set this based on your preferred word length or adjust dynamically
config = BertConfig(
    vocab_size=len(tokenizer.vocab),
    max_position_embeddings=max_word_length,  # Fixed to 15 or adjust based on longest word
    # BertConfig defaults pad_token_id to 0, which in this vocabulary is the
    # id of 'a'. The word-embedding layer uses that id as its padding index
    # (zero vector, no gradient updates), so point it at the real [PAD] token.
    pad_token_id=tokenizer.vocab['[PAD]'],
    hidden_size=256,
    num_attention_heads=4,
    num_hidden_layers=4,
)
model = BertForMaskedLM(config)

# Fine-tuning process with padding and truncating
def fine_tune_model(training_data, model, tokenizer, max_len=15):
    """Train *model* in place on masked-character prediction, one word per step.

    Args:
        training_data: Iterable of words (strings).
        model: Masked-LM model; called as ``model(inputs, labels=labels)`` and
            expected to return an object with a ``loss`` attribute.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and a ``vocab`` dict containing '[PAD]' and '[MASK]'.
        max_len: Fixed sequence length every word is padded/truncated to.

    Each non-padding character is masked with probability 0.15. If none is
    selected, one is chosen at random so every word yields a training signal.
    Words with no characters are skipped.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    model.train()

    pad_id = tokenizer.vocab['[PAD]']
    mask_id = tokenizer.vocab['[MASK]']
    ignore_index = -100  # Label value the masked-LM loss skips

    for word in training_data:
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
        # Pad with the id of the tokenizer that was passed in, not whatever
        # default pad_or_truncate captured when it was defined.
        input_ids = pad_or_truncate(token_ids, max_len=max_len, pad_token_id=pad_id)

        real_positions = [pos for pos, tid in enumerate(input_ids) if tid != pad_id]
        if not real_positions:
            continue  # Nothing to predict; an all-ignored loss would be NaN

        # Mask some random characters (15% chance of masking each character)
        masked_positions = [pos for pos in real_positions if torch.rand(1).item() < 0.15]
        if not masked_positions:
            pick = torch.randint(len(real_positions), (1,)).item()
            masked_positions = [real_positions[pick]]

        # Score only the masked positions: padding and still-visible
        # characters would otherwise dominate the loss with trivial targets.
        labels = [ignore_index] * len(input_ids)
        for pos in masked_positions:
            labels[pos] = input_ids[pos]
            input_ids[pos] = mask_id

        # Convert input and labels to tensor format
        inputs = torch.tensor([input_ids])
        labels = torch.tensor([labels])

        # Clear gradients from the previous word; without this they accumulate
        # and every step would apply the sum of all earlier gradients.
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Example fine-tuning process (replace with your own training dictionary).
# Uses the word list loaded into `lines`; fine_tune_model takes one optimizer
# step per word and updates `model` in place.
training_dictionary = lines
fine_tune_model(training_dictionary, model, tokenizer)

# Function to guess the next letter in Hangman
def guess_next_letter(current_word, guessed_letters, model, tokenizer, max_len=15):
    """Return the most likely not-yet-guessed letter for a Hangman board.

    Args:
        current_word: Board string with '_' for unknown letters. Whitespace
            between cells (e.g. '_ e _ _ e t _') is ignored.
        guessed_letters: Container of letters already tried.
        model: Masked-LM model; ``model(inputs).logits`` must have shape
            (1, max_len, vocab_size).
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and a ``vocab`` dict containing '[PAD]'.
        max_len: Fixed sequence length the model input is padded/truncated to.

    Returns:
        A single-character string, or None when the board has no blank within
        the first *max_len* cells or every letter has already been guessed.
    """
    model.eval()

    # Whitespace is not a letter slot; dropping it keeps one token per cell so
    # blank positions line up with the model's output positions.
    board = ''.join(current_word.split())

    # Only the first max_len cells are fed to the model, so blanks beyond that
    # point have no logits to read.
    tokens = tokenizer.tokenize(board)[:max_len]
    blank_positions = [pos for pos, tok in enumerate(tokens) if tok == '[MASK]']
    if not blank_positions:
        return None

    # Only single-character vocabulary entries are valid guesses; this keeps
    # special tokens such as '[PAD]' or '[MASK]' from being returned.
    candidates = {
        tok: tok_id
        for tok, tok_id in tokenizer.vocab.items()
        if len(tok) == 1 and tok not in guessed_letters
    }
    if not candidates:
        return None

    input_ids = pad_or_truncate(
        tokenizer.convert_tokens_to_ids(tokens),
        max_len=max_len,
        pad_token_id=tokenizer.vocab['[PAD]'],
    )
    with torch.no_grad():
        logits = model(torch.tensor([input_ids])).logits[0]

    # Sum each letter's probability over all blanks: a Hangman guess pays off
    # if the letter occurs in any of them, not just the first one.
    letter_scores = torch.softmax(logits[blank_positions], dim=-1).sum(dim=0)
    return max(candidates, key=lambda letter: letter_scores[candidates[letter]].item())

# Example usage in a game loop.
# The board marks unknown letters with '_' and separates cells with spaces.
# NOTE(review): the string is passed to guess_next_letter unchanged — confirm
# the spaces between cells are handled as intended there.
current_word = '_ e _ _ e t _'
guessed_letters = {'e', 't'}  # Letters already tried; never proposed again
next_guess = guess_next_letter(current_word, guessed_letters, model, tokenizer)
print(f"Next guessed letter: {next_guess}")