In [3]:
import numpy as np

with open('words_250000_train.txt', 'r') as f:
    words = f.read().splitlines()
rng = np.random.default_rng(42)
data = rng.permutation(words)
# data=data[:500]
split_index= int(0.95 * len(data))
train_words = data[:split_index]
val_words = data[-1000:]


In [2]:
import numpy as np
import pandas as pd
import random
from typing import List
from itertools import combinations
from collections import Counter

ALPHABET = "abcdefghijklmnopqrstuvwxyz"
LETTER_TO_INDEX = {ch: i+1 for i, ch in enumerate(ALPHABET)}


def generate_random_subsets(word: str, num_samples: int = 5) -> List[set]:
    unique_letters = list(set(word))
    subsets = set()
    attempts = 0
    
    while len(subsets) < num_samples and attempts < 100:
        sample_size = random.randint(1, len(unique_letters))
        sample = tuple(sorted(random.sample(unique_letters, sample_size)))
        subsets.add(sample)
        attempts += 1
    
    return [set(sub) for sub in subsets]


def create_hangman_df(words: List[str], pos_features: int = 65, samples_per_word: int = 8) -> pd.DataFrame:
    records = []

    for idx, word in enumerate(words):
        print(f"Processing {idx+1}/{len(words)} ({(idx+1)/len(words)*100:.2f}%)", end="\r")

        word_len = len(word)
        if word_len > pos_features:
            continue  # Skip overly long words

        back_offset = pos_features - word_len
        subsets = generate_random_subsets(word, num_samples=samples_per_word)

        for subset in subsets:
            feat = np.full(pos_features, -1, dtype=np.int8)
            labels = np.zeros(len(ALPHABET), dtype=np.int8)

            for i, ch in enumerate(word):
                code = LETTER_TO_INDEX[ch] if ch in subset else 0
                feat[i] = code
                feat[back_offset + i] = code

                if ch not in subset:
                    labels[LETTER_TO_INDEX[ch] - 1] = 1

            records.append(np.concatenate([feat, labels]))

    column_names = [f"X{i}" for i in range(pos_features)] + list(ALPHABET)
    df = pd.DataFrame(records, columns=column_names, dtype=np.int8)
    return df

In [3]:
with open("final.txt") as f:
    word_list = [line.strip().lower() for line in f if line.strip()]

# Generate DataFrame
hangman_df = create_hangman_df(word_list, pos_features=65, samples_per_word=8)

# Save it to disk
# hangman_df.to_parquet("hangman_training_data.parquet")
print("\nSaved 1.2M samples to 'hangman_training_data.parquet'")

Processing 224818/224818 (100.00%)
Saved 1.2M samples to 'hangman_training_data.parquet'


In [None]:
import pandas as pd
import xgboost as xgb
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

# Load the Parquet dataset
# df = pd.read_parquet("hangman_training_data.parquet")  # Change path if needed
df=hangman_df
# Split features and labels
alphabet = 'abcdefghijklmnopqrstuvwxyz'
X = df.drop(columns=list(alphabet))
y = df[list(alphabet)]


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

# Train 26 binary classifiers (one for each letter)
models = {}
accuracies = {}

for letter in alphabet:
    print(f"\nTraining for letter '{letter}'...")
    clf = xgb.XGBClassifier(
        objective="binary:logistic",
        n_estimators=500,          # more trees for fine detail
        max_depth=10,               # deeper splits to capture complex patterns
        learning_rate=0.03,        # lower step size for smoother convergence
        colsample_bytree=0.8,      # use 80% of features per tree
        subsample=0.7,             # use 70% of rows per tree
        gamma=1.0,                 # require min loss reduction to split
        reg_alpha=1.0,             # L1 regularization
        reg_lambda=2.0,            # L2 regularization
        tree_method="hist",        
        verbosity=0,
        n_jobs=-1
    )  
    clf.fit(X_train, y_train[letter])
    models[letter] = clf

    # Evaluate
    preds = clf.predict(X_test)
    acc = balanced_accuracy_score(y_test[letter], preds)
    accuracies[letter] = acc
    print(f"Balanced Accuracy: {acc:.4f}")

# Save all models in one file
with open("xgboost_hangman_models2.pkl", "wb") as f:
    pickle.dump(models, f)



Training for letter 'a'...
Balanced Accuracy: 0.7282

Training for letter 'b'...
Balanced Accuracy: 0.5251

Training for letter 'c'...
Balanced Accuracy: 0.5704

Training for letter 'd'...
Balanced Accuracy: 0.5542

Training for letter 'e'...
Balanced Accuracy: 0.8206

Training for letter 'f'...
Balanced Accuracy: 0.5069

Training for letter 'g'...
Balanced Accuracy: 0.5617

Training for letter 'h'...
Balanced Accuracy: 0.5364

Training for letter 'i'...
Balanced Accuracy: 0.7811

Training for letter 'j'...
Balanced Accuracy: 0.5000

Training for letter 'k'...


In [13]:
#Inference code


import xgboost as xgb
import numpy as np
import pandas as pd
import pickle
import string

class HangmanXGBoostPredictor:
    def __init__(self, model_path="xgboost_hangman_models2.pkl", pos_features=65):
        self.alphabet = list(string.ascii_lowercase)
        self.num_classes = len(self.alphabet)
        self.models = []
        self.pos_features = pos_features
        self._load_models(model_path)

    def _load_models(self, model_path):
        with open(model_path, 'rb') as f:
            self.models = pickle.load(f)

    def _encode_word_state(self, word: str, guessed_letters: set) -> np.ndarray:
        feat = np.full(self.pos_features, -1, dtype=np.int8)
        length = len(word)
        offset = self.pos_features - length

        for i, ch in enumerate(word):
            code = 0 if ch == '*' else ord(ch) - ord('a') + 1
            feat[i] = code
            feat[offset + i] = code

        return feat.reshape(1, -1)

    def predict_letter(self, word: str, guessed_letters: set) -> str:
        features = self._encode_word_state(word, guessed_letters)
        scores = []
        # print(self.models)
        for idx, model in enumerate(self.models.items()):
            # print(idx,model)
            model=model[1]
            letter = self.alphabet[idx]
            if letter in guessed_letters:
                scores.append(-np.inf)
                continue
            prob = model.predict_proba(features)[0][1]  # probability of presence
            scores.append(prob)

        return self.alphabet[np.argmax(scores)]


In [14]:
def simulate_hangman(model, word, max_wrong=6, verbose=1):
    guessed = set()
    masked = ["*"] * len(word)
    wrong = 0

    while ''.join(masked) != word and wrong < max_wrong:
        guess = model.predict_letter(''.join(masked), guessed)
        guessed.add(guess)

        if guess in word:
            for idx, ch in enumerate(word):
                if ch == guess:
                    masked[idx] = guess
        else:
            wrong += 1

        if verbose:
            print(f"Guess: {guess} → {''.join(masked)} | Wrong: {wrong}")

    return ''.join(masked) == word


In [None]:
# words_to_test = ["hangman", "puzzle", "guitar", "machine", "language"]

predictor = HangmanXGBoostPredictor("xgboost_hangman_models2.pkl")
correct = 0
for word in val_words:
    print(f"Testing: {word}")
    success = simulate_hangman(predictor, word, verbose=0)
    print("✅ Correct\n" if success else "❌ Failed\n")
    correct += success

print(f"Accuracy: {correct} ")
