In [None]:
import nltk
from nltk.corpus import words, wordnet
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd
import re

def generate_crossword_words(*, min_length=3, max_length=11, top_n=30000,
                             output_path='crossword_words.csv'):
    """Build a frequency-ranked table of crossword-suitable English words.

    Candidate words come from the NLTK ``words`` corpus. A candidate is kept
    when its length is within ``[min_length, max_length]``, it is purely
    alphabetic, and WordNet has at least one synset for it. Kept words are
    lower-cased and counted in the Brown corpus; words that never occur in
    Brown therefore do not appear in the result. English stopwords are
    removed, and the ``top_n`` most frequent remaining words are returned.

    Args:
        min_length: Shortest word length to keep (inclusive).
        max_length: Longest word length to keep (inclusive).
        top_n: Maximum number of rows in the result.
        output_path: CSV file to write the result to, or ``None`` to skip
            writing.

    Returns:
        A DataFrame with columns ``word``, ``frequency`` and ``length``,
        sorted by descending frequency.
    """
    # Download required NLTK data. 'punkt' is not used by this function; it is
    # still fetched so the function's download side effects stay the same.
    for resource in ('words', 'wordnet', 'brown', 'punkt', 'stopwords'):
        nltk.download(resource)

    def is_suitable_word(word):
        """Return True if ``word`` passes the length, alphabetic and WordNet checks."""
        return (
            min_length <= len(word) <= max_length
            # No digits, hyphens, apostrophes or other non-letters.
            and word.isalpha()
            # WordNet presence is used as a proxy for "in common usage".
            and bool(wordnet.synsets(word))
        )

    # De-duplicate first so each distinct spelling is looked up in WordNet once.
    suitable_words = {
        word.lower() for word in set(words.words()) if is_suitable_word(word)
    }

    # Count how often each suitable word occurs in the Brown corpus.
    from nltk.corpus import brown
    word_freq = Counter(
        token
        for token in (raw.lower() for raw in brown.words())
        if token in suitable_words
    )

    df = pd.DataFrame(list(word_freq.items()), columns=['word', 'frequency'])

    # Filter out stopwords
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    df = df[~df['word'].isin(stop_words)]

    # Sort by frequency and keep the most frequent words.
    df = df.sort_values('frequency', ascending=False).head(top_n)

    # .assign builds a new frame instead of writing a column into the slice
    # returned by .head(), which can trigger pandas' chained-assignment warning.
    df = df.assign(length=df['word'].str.len())

    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df

# Build the word table (the call also writes crossword_words.csv) and preview
# the first ten rows.
df = generate_crossword_words()
df.head(10)

In [None]:
import google.generativeai as genai
import anthropic
from openai import OpenAI
import pandas as pd
from dotenv import load_dotenv
import os
import random
from tabulate import tabulate

# Load variables from a .env file into the process environment; the API keys
# used further down are read from os.environ.
load_dotenv()

def generate_gemini_clue(word, model, style_prompt):
    """Ask a Gemini model for a single crossword clue.

    Args:
        word: The answer word the clue should point to.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        style_prompt: Instruction text describing the clue style.

    Returns:
        The clue with surrounding whitespace and double quotes removed, or
        the string ``"Error: <message>"`` if anything raised.
    """
    try:
        prompt = (
            f"{style_prompt}. Use this style for the word {word}. "
            "Reply only with the clue. "
            "Do not include a number in brackets or emojis."
        )
        raw_text = model.generate_content(prompt).text
        trimmed = raw_text.strip()
        # Models sometimes wrap the clue in quotes; drop them.
        return trimmed.strip('"')
    except Exception as exc:
        return f"Error: {exc}"

def generate_anthropic_clue(word, client, style_prompt):
    """Ask Anthropic's Claude for a single crossword clue.

    Args:
        word: The answer word the clue should point to.
        client: Object exposing ``messages.create(...)``; the result's
            ``content`` is either a list whose first item has a ``text``
            attribute, or a value used directly.
        style_prompt: Instruction text describing the clue style.

    Returns:
        The clue with surrounding whitespace and double quotes removed, or
        the string ``"Error: <message>"`` if anything raised.
    """
    try:
        # The instruction wording matches the Gemini prompt ("brackets or
        # emojis") so the models are compared on the same request.
        prompt = (
            f"{style_prompt}. Use this style for the word {word}. "
            "Reply only with the clue. "
            "Do not include a number in brackets or emojis."
        )
        message = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}],
        )
        content = message.content[0].text if isinstance(message.content, list) else message.content
        return str(content).strip().strip('"')
    except Exception as e:
        return f"Error: {str(e)}"

def generate_openai_clue(word, client, style_prompt):
    """Ask an OpenAI chat model for a single crossword clue.

    Args:
        word: The answer word the clue should point to.
        client: Object exposing ``chat.completions.create(...)`` whose result
            has ``choices[0].message.content``.
        style_prompt: Instruction text describing the clue style.

    Returns:
        The clue with surrounding whitespace and double quotes removed, or
        the string ``"Error: <message>"`` if anything raised.
    """
    try:
        system_message = {
            "role": "system",
            "content": "You are a crossword clue writer. Reply only with the clue.. Do not include a number in brackets or emojis.",
        }
        user_message = {
            "role": "user",
            "content": f"{style_prompt}. Use this style for the word {word}.",
        }
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[system_message, user_message],
        )
        first_choice = completion.choices[0]
        clue_text = first_choice.message.content.strip()
        return clue_text.strip('"')
    except Exception as exc:
        return f"Error: {exc}"

def get_style_prompt():
    """Randomly pick a clue style and return its prompt and label.

    One draw from ``random.random()`` selects the style with these
    probabilities: Easy 60%, Moderate 15%, Fill-in-blank 10%, Wordplay 7.5%,
    Perhaps 7.5%.

    Returns:
        A ``(prompt, label)`` tuple, where ``prompt`` is the instruction text
        for the model and ``label`` is the short style name.
    """
    # (exclusive upper bound on the draw, label, prompt), in ascending order.
    bounded_styles = (
        (0.6, "Easy",
         "Create an easy but clever 2-4 word crossword clue. "
         "The clue should be straightforward but engaging. "
         "For example, 'Night bird' for OWL."),
        (0.75, "Moderate",
         "Create a moderately challenging 2-4 word crossword clue. "
         "The clue should require some thought but not be obscure. "
         "For example, 'Desert wanderer' for CAMEL."),
        (0.85, "Fill-in-blank",
         "Create a fill-in-the-blank crossword clue using ___ notation. "
         "For example, '___ and done' for the answer SAID. "
         "Use exactly this format with the underscores. Only one word should be blank."),
        # Each fragment ends with a space so adjacent sentences do not run
        # together when the literals are concatenated.
        (0.925, "Wordplay",
         "Create an easy wordplay crossword clue that ends with a question mark. "
         "Ensure these clues are relatively easy while still being clever. "
         "For example, 'Flying home?' for the answer NEST. "
         "The question mark signals wordplay to the solver."),
    )
    # Remaining probability mass (draws of 0.925 and above).
    fallback_label = "Perhaps"
    fallback_prompt = (
        "Create a crossword clue by using an example of the target word (if possible). "
        "Follow the clue with ', perhaps' so it is clear it's an example. "
        "For example, 'Fedoras, perhaps' for the answer HATS. "
        "Use exactly this format with ', perhaps' at the end."
    )

    rand = random.random()
    for upper_bound, label, prompt in bounded_styles:
        if rand < upper_bound:
            return prompt, label
    return fallback_prompt, fallback_label

In [None]:
from typing import List, Dict
from tqdm import tqdm


def evaluate_clues(word: str, clue_type: str, clues: Dict[str, str], anthropic_client) -> tuple[str, str, str]:
    """Have Claude pick the best of three clues for ``word``.

    Args:
        word: The answer word the clues were written for.
        clue_type: Style label shown to the evaluator.
        clues: Mapping with the keys ``'Gemini'``, ``'Claude'`` and
            ``'GPT-4o'``; a missing key raises ``KeyError`` before any
            request is made.
        anthropic_client: Object exposing ``messages.create(...)``.

    Returns:
        ``(winner, winning_clue, explanation)`` parsed from the reply. If the
        request fails or the reply lacks one of the expected labelled lines,
        returns ``("Error", "Error in evaluation", <reason>)``.
    """
    evaluation_prompt = f"""You are an expert crossword editor. Evaluate these crossword clues for the word "{word}".
    Clue type: {clue_type}
    
    Clues to evaluate:
    1. Gemini: {clues['Gemini']}
    2. Claude: {clues['Claude']}
    3. GPT-4o: {clues['GPT-4o']}
    
    Consider these criteria:
    - Accuracy and appropriateness
    - Adherence to clue type format
    - Clarity and fairness
    - Avoidance of the answer in the clue
    - Following crossword conventions
    
    Rank these clues from best to worst and explain why the winner is best. If no clue is up to standard, explain why and state None as the winner. 
    Reply in this exact format:
    WINNER: [model name]
    WINNING_CLUE: [the winning clue]
    EXPLANATION: [1-2 sentence explanation]"""

    labels = ('WINNER:', 'WINNING_CLUE:', 'EXPLANATION:')

    try:
        message = anthropic_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            messages=[
                {"role": "user", "content": evaluation_prompt}
            ]
        )
        response = message.content[0].text if isinstance(message.content, list) else message.content

        # Parse response. Lines are stripped first so indented replies and
        # CRLF line endings are still recognised; the first line seen for
        # each label wins.
        fields = {}
        for raw_line in response.splitlines():
            line = raw_line.strip()
            for label in labels:
                if label not in fields and line.startswith(label):
                    # Slice off only the leading label; later occurrences of
                    # the label text in the value are left intact.
                    fields[label] = line[len(label):].strip()
                    break

        missing = [label for label in labels if label not in fields]
        if missing:
            # Raise a descriptive error so the reason reaches the caller
            # rather than an empty message.
            raise ValueError(
                "Evaluation response missing field(s): " + ", ".join(missing)
            )

        return fields['WINNER:'], fields['WINNING_CLUE:'], fields['EXPLANATION:']
    except Exception as e:
        return "Error", "Error in evaluation", str(e)

def generate_comparative_clues(df, start_from=0, num_words=10):
    """Generate clues from all three models for a range of words and have Claude judge them.

    Args:
        df: DataFrame with a ``word`` column; rows are processed by position.
        start_from: ``0`` for a fresh run beginning at row 0. Any other value
            ``N`` resumes from the checkpoint file
            ``model_clue_comparison_with_evaluation_checkpoint_N.csv`` and
            continues with row ``N + 1``.
        num_words: Exclusive upper bound on the row position to process
            (capped at ``len(df)``).

    Returns:
        A DataFrame with one row per processed word, also written to
        ``model_clue_comparison_with_evaluation.csv``.

    Raises:
        KeyError: If ``GEMINI_API_KEY`` is not set in the environment.
    """
    # Initialize all clients
    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    gemini_model = genai.GenerativeModel("gemini-1.5-pro")
    anthropic_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
    openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    checkpoint_template = 'model_clue_comparison_with_evaluation_checkpoint_{}.csv'

    # Initialize results list based on checkpoint or fresh start
    if start_from == 0:
        results = []
        # A fresh run must include row 0; starting at start_from + 1 would
        # silently skip the first word.
        first_index = 0
    else:
        # Read checkpoint file and convert to list of dictionaries
        checkpoint_df = pd.read_csv(checkpoint_template.format(start_from))
        # Drop any unnamed index columns that might have been created
        unnamed_cols = [col for col in checkpoint_df.columns if 'Unnamed' in col]
        results = checkpoint_df.drop(columns=unnamed_cols).to_dict('records')
        # The checkpoint file is named after the last row it was written at.
        first_index = start_from + 1

    end_index = min(num_words, len(df))

    for i in tqdm(range(first_index, end_index)):
        word = df.iloc[i].word

        # Use same style for all models for fair comparison
        style_prompt, style_type = get_style_prompt()

        try:
            # Generate clues from all models
            clues = {
                'Gemini': generate_gemini_clue(word, gemini_model, style_prompt),
                'Claude': generate_anthropic_clue(word, anthropic_client, style_prompt),
                'GPT-4o': generate_openai_clue(word, openai_client, style_prompt)
            }

            # Have Claude evaluate the clues
            winner, winning_clue, explanation = evaluate_clues(word, style_type, clues, anthropic_client)

            results.append({
                'Word': word,
                'Clue Type': style_type,
                'Gemini': clues['Gemini'],
                'Claude': clues['Claude'],
                'GPT-4o': clues['GPT-4o'],
                'Winning Model': winner,
                'Winning Clue': winning_clue,
                'Explanation': explanation
            })

            # Show progress and save a checkpoint on the first processed row
            # and every 50th row. Multiples of 1000 are covered by the
            # every-50 rule, so no separate write is needed for them.
            if (i % 50 == 0) or (i == first_index):
                results_df = pd.DataFrame(results)
                print(results_df.tail())
                results_df.to_csv(checkpoint_template.format(i), index=False)
                print(f"Saved checkpoint at iteration {i}")

        except Exception as e:
            # Best effort: report the failure and move on to the next word.
            print(f"Error generating clues for word '{word}' at index {i}: {str(e)}")
            continue

    # Convert final results to DataFrame and save
    results_df = pd.DataFrame(results)
    results_df.to_csv('model_clue_comparison_with_evaluation.csv', index=False)

    return results_df

# Fresh run (start_from=0 loads no checkpoint) with num_words=1.
results_df = generate_comparative_clues(df, start_from=0, num_words=1)