In [22]:
import numpy as np

# Load the word list, shuffle it with a fixed seed, and split it into a
# training portion (first 95%) and a validation sample (held-out tail).
with open('words_250000_train.txt', 'r') as f:
    words = f.read().splitlines()
rng = np.random.default_rng(89)  # fixed seed keeps the shuffle reproducible

data = rng.permutation(words)
# Debug toggle: uncomment to iterate quickly on a small subset.
# data=data[:500]
split_index = int(0.95 * len(data))
train_words = data[:split_index]
# Take at most 1000 validation words, but only from the part of `data` that is
# NOT in `train_words`. Slicing `data[-1000:]` directly would overlap the
# training split whenever the held-out tail has fewer than 1000 words (for
# example when the debug subset above is enabled).
val_words = data[split_index:][-1000:]


In [19]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Bidirectional, LSTM, Dropout, TimeDistributed, Dense
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-05-16 18:28:21.494419: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-16 18:28:21.494481: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-16 18:28:21.495578: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-16 18:28:21.501213: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# class LSTMWordPredictor:
#     def __init__(self, weights_path="lstm_model6.h5", max_word_length=20):
#         self.chars = list("abcdefghijklmnopqrstuvwxyz0")
#         self.char_to_int = {c: i for i, c in enumerate(self.chars)}
#         self.int_to_char = {i: c for i, c in enumerate(self.chars)}
#         self.vocab_size = len(self.chars)
#         self.max_word_length = max_word_length
#         self.model = self.build_model()
#         self.model.load_weights(weights_path)

#     def build_model(self):
#         model = Sequential()
#         model.add(Embedding(input_dim=self.vocab_size, output_dim=64, trainable=True))
#         model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
#         model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.001))))
#         model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.001))))
#         model.add(Dropout(0.4))
#         model.add(TimeDistributed(Dense(self.vocab_size, activation='softmax')))
#         model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#         return model

#     def predict(self, word_with_missing, guessed_letters):
#         word_encoded = [self.char_to_int.get(char, self.char_to_int['0']) for char in word_with_missing]
#         word_padded = pad_sequences([word_encoded], maxlen=self.max_word_length, padding='post')
#         prediction = self.model.predict(word_padded, verbose=0)[0]
#         return prediction
#         best_char = None
#         best_prob = -1

#         for i, char in enumerate(word_with_missing):
#             if char == '0':
#                 probabilities = prediction[i]
#                 return probabilities
#                 for idx in np.argsort(-probabilities):
#                     predicted_char = self.int_to_char[idx]
#                     if predicted_char != '0' and predicted_char not in guessed_letters:
#                         prob = probabilities[idx]
#                         if prob > best_prob:
#                             best_prob = prob
#                             best_char = predicted_char
#                         break
#         return best_char

In [4]:
import xgboost as xgb
import numpy as np
import pandas as pd
import pickle
import string

class HangmanXGBoostPredictor:
    """Scores every letter a-z for a partially revealed hangman word.

    A collection of per-letter models is loaded from a pickle file. Each model
    is queried with ``predict_proba`` and the probability of class 1 is used
    as the score of "this letter is present in the word". The i-th model (in
    iteration order) is paired with the i-th letter of the alphabet.
    """

    def __init__(self, model_path="xgboost_hangman_models3.pkl", pos_features=65):
        """Load the per-letter models.

        Args:
            model_path: Path of the pickle file that holds the models.
            pos_features: Length of the positional feature vector produced by
                ``_encode_word_state``; words longer than this cannot be
                encoded.
        """
        self.alphabet = list(string.ascii_lowercase)
        self.num_classes = len(self.alphabet)
        self.models = []
        self.pos_features = pos_features
        self._load_models(model_path)

    def _load_models(self, model_path):
        """Replace ``self.models`` with the object unpickled from ``model_path``."""
        # SECURITY: pickle.load can execute arbitrary code embedded in the file.
        # Only load model files that come from a trusted source.
        with open(model_path, 'rb') as f:
            self.models = pickle.load(f)

    def _encode_word_state(self, word: str, guessed_letters: set) -> np.ndarray:
        """Encode a masked word as a ``(1, pos_features)`` int8 feature row.

        Each character is mapped to a code (``'*'`` -> 0, ``'a'`` -> 1, ...,
        ``'z'`` -> 26) and written twice: left-aligned from index 0 and
        right-aligned so the last character lands in the last slot. Slots that
        receive no character stay at -1. ``guessed_letters`` is accepted for
        interface symmetry but is not used by the encoding.

        Raises:
            IndexError: If ``word`` is longer than ``pos_features``.
        """
        feat = np.full(self.pos_features, -1, dtype=np.int8)
        length = len(word)
        if length > self.pos_features:
            # The loop below would fail with an opaque out-of-bounds error;
            # keep the exception type but explain what went wrong.
            raise IndexError(
                f"word of length {length} does not fit into "
                f"{self.pos_features} positional features"
            )
        offset = self.pos_features - length

        for i, ch in enumerate(word):
            code = 0 if ch == '*' else ord(ch) - ord('a') + 1
            feat[i] = code            # left-aligned copy
            feat[offset + i] = code   # right-aligned copy

        return feat.reshape(1, -1)

    def predict_letter(self, word: str, guessed_letters: set) -> list:
        """Return one score per letter of the alphabet for the masked ``word``.

        Args:
            word: Current word state with ``'*'`` for unknown positions.
            guessed_letters: Container of letters already guessed (anything
                supporting ``in``, e.g. a set or a dict keyed by letter).

        Returns:
            A list with one entry per model, in alphabet order: ``-inf`` for
            letters already guessed, otherwise the class-1 probability from
            that letter's model. Despite the method name, this is the full
            score list, not a single letter.
        """
        features = self._encode_word_state(word, guessed_letters)

        # The pickle normally holds a mapping (its values are the models, taken
        # in iteration order); a plain sequence of models is accepted as well.
        if hasattr(self.models, "items"):
            models = [model for _, model in self.models.items()]
        else:
            models = list(self.models)

        scores = []
        for idx, model in enumerate(models):
            letter = self.alphabet[idx]
            if letter in guessed_letters:
                scores.append(-np.inf)
                continue
            prob = model.predict_proba(features)[0][1]  # probability of presence
            scores.append(prob)

        return scores


In [20]:
import torch
from transformers import CanineTokenizer, CanineConfig, CanineForSequenceClassification
from transformers import CanineConfig, CanineTokenizer, AutoModelForSequenceClassification

# # Constants for separating and masking in the CANINE input sequence
# CANINE_SEP_TOKEN   = " [SEP] "
# CANINE_MASK_TOKEN  = "[MASK]"

class CanineHangmanPlayer:
    """Hangman player combining a CANINE sequence classifier with a second letter scorer.

    The CANINE model receives the letters guessed so far plus the masked word
    and produces 26 logits (one per letter). A second scorer (``xgb``, any
    object exposing ``predict_letter(word, guessed) -> 26 scores``) can
    override the CANINE choice when CANINE looks unsure and few letters are
    still missing.

    Attributes:
        nice: Counter of overrides that hit a letter of the word (starts at 1).
        trigger: Counter related to overrides that missed (starts at 1).
        guesses / state: Bookkeeping used only by ``predict`` across calls.
    """

    def __init__(self, pretrained_model_path: str = "google/canine-s", device: torch.device = None,xgb=None):
        """Load tokenizer, config and classification model.

        Args:
            pretrained_model_path: Model name or path passed to
                ``from_pretrained`` for the config and the model weights.
            device: Device to run on. Defaults to CUDA when available,
                otherwise CPU.
            xgb: Secondary scorer exposing ``predict_letter``.
        """
        # The tokenizer is always taken from the base checkpoint, independent
        # of where the (possibly fine-tuned) weights are loaded from.
        self.tokenizer = CanineTokenizer.from_pretrained('google/canine-s')
        self.config = CanineConfig.from_pretrained(pretrained_model_path)
        self.config.num_labels = 26
        self.xgb = xgb
        self.nice = 1
        self.trigger = 1
        self.all_letters = [chr(i) for i in range(ord('a'), ord('z')+1)]
        self.state = False
        self.guesses = {}

        # Honour the caller's device; otherwise fall back to CPU when CUDA is
        # not available instead of failing on a hard-coded "cuda".
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        # Sequence-classification head on top of CANINE.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_path,
            config=self.config).to(self.device)

        # Tokens used when building the game-state string.
        self.CANINE_MASK_TOKEN = self.tokenizer.mask_token
        self.CANINE_SEP_TOKEN = self.tokenizer.sep_token

        # Toggle for self-play finetuning.
        self.training = False

    def eval(self):
        """Put the underlying model into evaluation mode."""
        self.model.eval()

    def _score_state(self, encoded_word, guesses):
        """Return ``(logits, xgb_scores)`` for the current game state.

        ``logits`` is the first row of the model output as a NumPy array;
        ``xgb_scores`` is whatever ``self.xgb.predict_letter`` returns.
        """
        xgb_scores = self.xgb.predict_letter(''.join(encoded_word), guesses)

        state = ''.join(guesses.keys()) + self.CANINE_SEP_TOKEN + encoded_word.replace('*', self.CANINE_MASK_TOKEN)
        enc = self.tokenizer(state, padding="max_length", truncation=True, max_length=64, return_tensors="pt").to(self.device)

        with torch.no_grad():
            logits = self.model(**enc).logits

        return logits.cpu().numpy()[0], xgb_scores

    def _select_guess(self, logits, xgb_scores, guesses, missing_count, org_threshold, rank_threshold):
        """Pick the index of the letter to guess and report diagnostics.

        The secondary scorer overrides CANINE ("trigger") when all of these
        hold: the top logit divided by the sum of all logits is below
        ``org_threshold``, the secondary scorer ranks CANINE's top letter at
        position ``rank_threshold`` or worse, and at most 4 letters are
        missing. Otherwise CANINE's best not-yet-guessed letter is used.

        Returns:
            dict with keys ``guess_idx``, ``triggered``, ``top_idx`` (CANINE's
            raw favourite), ``top_rank`` (its rank in the secondary scorer),
            ``org``, ``verify`` (secondary score of ``top_idx``) and
            ``xgb_rank`` (letter indices sorted by secondary score, best first).
        """
        top_idx = int(np.argmax(logits))
        total = sum(logits)
        # NOTE: this is a logit share, not a softmax probability; the
        # thresholds used by the callers were chosen against this quantity.
        org = logits[top_idx] / total

        xgb_rank = np.argsort(xgb_scores)[::-1].tolist()
        top_rank = xgb_rank.index(top_idx)
        verify = xgb_scores[top_idx]

        triggered = bool(org < org_threshold and top_rank >= rank_threshold and missing_count <= 4)
        if triggered:
            guess_idx = xgb_rank[0]
        else:
            guess_idx = top_idx
            # Skip letters that were already guessed. Rank by the raw logits:
            # ranking by logit/total would invert the order whenever the sum
            # of the logits is negative. The loop is bounded so it cannot spin
            # forever if every letter has been guessed.
            remaining = np.array(logits, dtype=float)
            for _ in range(len(remaining)):
                if self.all_letters[guess_idx] not in guesses:
                    break
                remaining[guess_idx] = -np.inf
                guess_idx = int(np.argmax(remaining))

        return {
            "guess_idx": guess_idx,
            "triggered": triggered,
            "top_idx": top_idx,
            "top_rank": top_rank,
            "org": org,
            "verify": verify,
            "xgb_rank": xgb_rank,
        }

    def simulate_hangman_transformers(self, word: str, max_wrong_guesses: int = 6, verbose: int = 1):
        """Play one game of hangman on ``word`` and report whether it was solved.

        Args:
            word: The hidden word.
            max_wrong_guesses: Number of wrong guesses after which the game is lost.
            verbose: 1 prints every guess and the result, 3 prints only the
                result, anything else prints nothing.

        Returns:
            ``True`` if the word was fully revealed, else ``False``.

        NOTE(review): when ``self.training`` is True the method tries to return
        ``(model_logits_seq, true_dist_seq, success_flag)``, but nothing is
        ever appended to the two lists, so stacking them fails. The training
        path looks unfinished; confirm before using it.
        """
        # Index map of letters -> positions in the hidden word.
        word_idxs = {}
        for i, c in enumerate(word):
            word_idxs.setdefault(c, []).append(i)

        all_letters = self.all_letters
        guesses = {}
        encoded_word = "*" * len(word)
        num_wrong = 0

        self.eval()
        if self.training:
            outputs_model = []
            outputs_true = []

        while encoded_word != word and num_wrong < max_wrong_guesses:
            missing_count = encoded_word.count("*")

            logits, check = self._score_state(encoded_word, guesses)
            choice = self._select_guess(
                logits, check, guesses, missing_count,
                org_threshold=0.005, rank_threshold=7,
            )
            trigger = choice["triggered"]
            guess = all_letters[choice["guess_idx"]]
            pos1 = choice["top_rank"]
            pos2 = all_letters[choice["top_idx"]]   # CANINE's raw favourite
            org = choice["org"]
            verify = choice["verify"]
            xgb_best = all_letters[choice["xgb_rank"][0]]

            # Statistics: the guess differs from CANINE's favourite and neither
            # letter is in the word.
            if pos2 != guess and pos2 not in word_idxs and guess not in word_idxs:
                self.trigger -= 1

            # Apply the guess.
            if guess in word_idxs:
                self.nice += 1 if trigger else 0
                for pos in word_idxs[guess]:
                    encoded_word = encoded_word[:pos] + guess + encoded_word[pos+1:]
            else:
                self.trigger += 1 if trigger else 0
                num_wrong += 1

            guesses[guess] = True
            if verbose == 1:
                print(f"  Guess: {guess.upper():<2} → {encoded_word}  (Wrong: {num_wrong}) {verify:.3f} {org:.3f} {pos1}, 'Org', {pos2} {xgb_best} ")

        success = (encoded_word == word)
        if verbose == 3 or verbose == 1:
            print(f"Result: {'✅ CORRECT' if success else '❌ FAILED'} | Final: {encoded_word} {word}  \n")

        if self.training:
            return torch.vstack(outputs_model), torch.vstack(outputs_true), success
        return success

    def test_accuracy(self, words, verbose=1):
        """Play every word in ``words`` and return the fraction solved.

        A single-line progress report (running accuracy in percent, games
        played / total, and the ``nice`` / ``trigger`` counters) is printed
        after each game. Returns 0.0 for an empty ``words``.
        """
        n = len(words)
        if n == 0:
            return 0.0

        correct = 0
        for played, w in enumerate(words, start=1):
            correct += self.simulate_hangman_transformers(w, verbose=verbose)
            print(f"{(correct / played) * 100:.2f} {played} / {n} {self.nice} {self.trigger} ", end="\r")
        return correct / n

    def predict(self, word):
        """Return the next letter to guess for an externally driven game.

        Args:
            word: Current word state. Spaces are dropped and ``'_'`` is
                treated as an unknown position (``'*'`` is accepted as well).

        Returns:
            The guessed letter. It is also recorded in ``self.guesses`` so
            later calls for the same game do not repeat it.

        Guess history is cleared when a fully masked word is seen while
        ``self.state`` is True. ``self.state`` becomes True as soon as a word
        with at least one revealed letter is seen; callers may also set it to
        True themselves before starting a new game. A game that ends without a
        single revealed letter therefore does not reset the history on its own.
        """
        encoded_word = word.replace(" ", "").replace("_", "*")
        all_masked = set(encoded_word) == {"*"}

        if self.state and all_masked:
            # A fresh, fully masked word after a game in progress: new game.
            self.state = False
            self.guesses = {}
        if not all_masked:
            self.state = True

        missing_count = encoded_word.count("*")
        logits, check = self._score_state(encoded_word, self.guesses)
        choice = self._select_guess(
            logits, check, self.guesses, missing_count,
            org_threshold=0.009, rank_threshold=5,
        )

        guess = self.all_letters[choice["guess_idx"]]
        self.guesses[guess] = True
        return guess

        




In [1]:
# Evaluate the CANINE player, backed by the pickled per-letter scorer, on the
# validation words and print the fraction of words solved.
# NOTE: binding the name `xgb` here shadows the `xgboost` module alias
# (`import xgboost as xgb`) for the rest of the session.
xgb = HangmanXGBoostPredictor(model_path="xgboost_hangman_models2.pkl")
model = CanineHangmanPlayer(pretrained_model_path="10epoch", xgb=xgb)
model.training = False
print(model.test_accuracy(words=val_words, verbose=1))

NameError: name 'HangmanXGBoostPredictor' is not defined

In [None]:
# Same evaluation as the previous cell, but with the LSTM predictor as the
# secondary scorer.
# NOTE(review): in the visible part of this notebook `LSTMWordPredictor` exists
# only as commented-out code, so this cell raises NameError unless the class is
# defined elsewhere. That commented-out class exposes `predict`, while
# CanineHangmanPlayer calls `predict_letter` on its scorer - confirm the
# interface before relying on this cell.
xgb = LSTMWordPredictor(weights_path="lstm_model6.h5")
model = CanineHangmanPlayer(pretrained_model_path="10epoch", xgb=xgb)
model.training = False
print(model.test_accuracy(words=val_words, verbose=1))