In [5]:
!pip install requests wikipedia-api sentence-transformers torch pandas nltk notebook


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import pandas as pd

csv_path = "nytcrosswords.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")
df = df[['Clue', 'Word']].dropna()  # Keep only relevant columns
print(f"Dataset Loaded: {len(df)} clues")

Dataset Loaded: 781539 clues


In [12]:
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import wordnet
import nltk

# Load a pre-trained BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Ensure nltk dependencies are available
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sheryldeakin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sheryldeakin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
def get_synonyms(word):
    """ Get a list of synonyms and related terms for a given word using WordNet. """
    synonyms = set([word])  # Include the original word
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            clean_word = lemma.name().replace("_", " ")  # Replace underscores with spaces
            synonyms.add(clean_word)
    
    return list(synonyms)

def get_bert_synonyms(word, top_n=10):
    """ Expand words using BERT similarity search. """
    candidate_words = get_synonyms(word)

    if not candidate_words:  # If no synonyms found, return just the original word
        return [word]

    # Compute BERT embeddings for input word
    word_embedding = bert_model.encode(word, convert_to_tensor=True)
    
    # Compute embeddings for candidate words
    candidate_embeddings = bert_model.encode(candidate_words, convert_to_tensor=True)

    # Compute cosine similarity between input word and candidate words
    similarities = util.pytorch_cos_sim(word_embedding, candidate_embeddings)

    top_n = min(top_n, len(candidate_words))  # Adjust `top_n` dynamically
    if top_n == 0:
        return [word]  # If no candidates are found, return the original word
    # Get top N most similar words
    top_indices = torch.topk(similarities, top_n).indices.tolist()[0]
    
    # Return top similar words
    return [candidate_words[i] for i in top_indices]


In [14]:
def extract_keywords(sentence):
    """ Extract words from the sentence and find their synonyms and related terms. """
    words = re.findall(r'\b\w+\b', sentence)  # Extract words
    expanded_keywords = set()
    for word in words:
        wordnet_synonyms = get_synonyms(word)  # WordNet synonyms
        bert_synonyms = get_bert_synonyms(word)  # BERT synonyms
        expanded_keywords.update(wordnet_synonyms + bert_synonyms)  # Merge both
    return list(expanded_keywords), words  # Return both expanded keywords and original words


In [15]:
def search_wikipedia_articles(keywords, original_words):
    """ 
    Search Wikipedia for articles containing the keywords.
    """
    wiki = wikipediaapi.Wikipedia(
        user_agent="Project/1.0 (contact: your-email@example.com)",  # Replace with your details
        language="en"
    )
    results = {}

    # Use multiple keywords together for better search results
    search_query = " ".join(keywords[:10])  # Use up to 10 keywords for broader search

    search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={search_query}&srlimit=50&format=json"
    response = requests.get(search_url, headers={"User-Agent": "MindStormBot/1.0 (contact: deakin.s@northeastern.edu)"}, timeout=30) 
    
    if response.status_code == 200:
        data = response.json()
        search_results = data.get("query", {}).get("search", [])
        for entry in search_results:
            page_title = entry["title"]

            # Exclude pages where the title is an EXACT match to an original input word
            if any(word.lower() == page_title.lower() for word in original_words):
                continue

            page = wiki.page(page_title)
            if page.exists():
                page_text = page.text.lower()

                # Ensure at least one original word or synonym appears
                word_matches = [word.lower() in page_text for word in original_words]
                if sum(word_matches) >= 1:  # Allow pages with at least one match
                    results[page_title] = page.text  # Store relevant pages

    else:
        print(f"Error fetching Wikipedia results for query: {search_query}")

    return results


In [16]:
def merge_results(expanded_results, original_results):
    """ Merge two result dictionaries, avoiding duplicates. """
    combined_results = {**expanded_results, **original_results}  # Merge without duplicates
    return combined_results


In [17]:
def count_word_frequencies(text, keywords):
    """ Count occurrences of keywords (words & synonyms) in Wikipedia text. """
    words = re.findall(r'\b\w+\b', text.lower())  # Tokenize Wikipedia text
    word_counts = Counter(words)
    return {word: word_counts[word.lower()] for word in keywords if word.lower() in word_counts}


In [20]:
def test_wikipedia_search_all(output_file="wikipedia_search_results_full.csv", batch_size=1000):
    """ 
    Test Wikipedia search on all crossword clues.
    Saves results in batches to avoid losing progress.
    """
    
    results = []  # Store evaluation results
    total_clues = len(df)

    for index, row in df.iterrows():
        clue = row["Clue"]
        answer = row["Word"].strip().lower()  # Normalize the answer

        print(f"\n🔍 ({index+1}/{total_clues}) Searching Wikipedia for: {clue} (Expected answer: {answer})")
        
        # Extract keywords using BERT and WordNet
        keywords, original_words = extract_keywords(clue)
        
        # Perform Wikipedia search
        expanded_results = search_wikipedia_articles(keywords, original_words)
        original_results = search_wikipedia_articles(original_words, original_words)

        # Merge results
        wiki_results = merge_results(expanded_results, original_results)

        # Initialize match conditions
        title_match = False
        content_match = False
        matched_page_title = "No Title Match"  # Default title if no match

        # Check if the answer appears in the Wikipedia page **title** or **content**
        for page_title, page_text in wiki_results.items():
            if answer in page_title.lower():  
                title_match = True  # Answer found in title
                matched_page_title = page_title  # Store the matching title
            
            if answer in page_text.lower():
                content_match = True  # Answer found in Wikipedia content
                
                # If no title match, update the matched page title with content match
                if not title_match:
                    matched_page_title = page_title  

            # Stop early if both conditions are met
            if title_match and content_match:
                break  
        
        # Store the result
        results.append({
            "Clue": clue,
            "Expected Answer": answer,
            "Wikipedia Page Title": matched_page_title,  # Store title if found in content
            "Title Match": title_match,
            "Content Match": content_match
        })

        # Save every `batch_size` samples to prevent data loss
        if (index + 1) % batch_size == 0:
            temp_df = pd.DataFrame(results)
            temp_df.to_csv(output_file, index=False, encoding="utf-8")
            print(f"✅ Progress saved: {index + 1}/{total_clues} clues processed.")

    # Final save
    final_df = pd.DataFrame(results)
    final_df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"\n📁 Final results saved to: {output_file}")

    # Display the results in Jupyter Notebook
    import ace_tools as tools
    tools.display_dataframe_to_user(name="Wikipedia Search Results", dataframe=final_df)

In [24]:
import re
import torch
import requests
import wikipediaapi
import pandas as pd

test_wikipedia_search_all(output_file="nyt_crossword_wiki_results.csv")


🔍 (1/781539) Searching Wikipedia for: Action done while saying "Good dog" (Expected answer: pat)

🔍 (2/781539) Searching Wikipedia for: Mischief-makers (Expected answer: rascals)

🔍 (3/781539) Searching Wikipedia for: It might click for a writer (Expected answer: pen)

🔍 (4/781539) Searching Wikipedia for: Fall mo. (Expected answer: sep)

🔍 (5/781539) Searching Wikipedia for: Kind to Mother Nature (Expected answer: eco)

🔍 (6/781539) Searching Wikipedia for: Harris in the Country Music Hall of Fame (Expected answer: emmylou)

🔍 (7/781539) Searching Wikipedia for: Living ___ (Expected answer: wage)

🔍 (8/781539) Searching Wikipedia for: Member of the superfamily Hominoidea (Expected answer: ape)

🔍 (9/781539) Searching Wikipedia for: Haphazard (Expected answer: hitormiss)

🔍 (10/781539) Searching Wikipedia for: Some crumbly blocks (Expected answer: feta)

🔍 (11/781539) Searching Wikipedia for: Inits. for a theatrical hit (Expected answer: sro)

🔍 (12/781539) Searching Wikipedia for: Se

ReadTimeout: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)