In [25]:
pip install requests beautifulsoup4 pandas

Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.0.1-cp311-cp311-macosx_14_0_arm64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m505.5/505.5 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-

In [23]:
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def fetch_season_urls(base_url, timeout=30):
    """Fetch all season URLs from the J-Archive main page.

    Args:
        base_url: URL of the page that lists the seasons.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of absolute season URLs, in the order the links appear on
        the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    season_links = soup.select('a[href^="showseason.php?season="]')
    # urljoin resolves the relative href against the page URL and copes with a
    # trailing slash on base_url, which plain concatenation turned into '//'.
    return [urljoin(base_url, link['href']) for link in season_links]

def fetch_episode_urls(season_url, timeout=30):
    """Fetch all episode URLs from a season page.

    Args:
        season_url: Absolute URL of a season listing page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute episode URLs, in the order the links appear on
        the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(season_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of treating an error page as "no episodes".
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    episode_links = soup.select('a[href^="showgame.php?game_id="]')
    # Resolve each relative href against the season page URL; this replaces the
    # manual rsplit('/') trick, which misfires if the query string contains '/'.
    return [urljoin(season_url, link['href']) for link in episode_links]

def _first_text(node, selectors):
    """Return the stripped text of the first selector that matches under node, or ''."""
    for selector in selectors:
        match = node.select_one(selector)
        if match is not None:
            return match.get_text(strip=True)
    return ''


def parse_episode_page(url, timeout=30):
    """Parse an episode page to extract categories, questions, and answers.

    Args:
        url: Absolute URL of the episode page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``[category, value, clue_text, correct_response]`` rows, one
        per ``td.clue`` cell found in the ``table.round`` tables. Fields that
        are not present in a cell are returned as empty strings, and a clue
        with no matching header gets the category ``'Unknown'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    data = []
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors
    soup = BeautifulSoup(response.text, 'html.parser')

    for table in soup.select('table.round'):
        table_rows = table.select('tr')
        if not table_rows:
            # A round table without rows has nothing to extract; skipping it
            # avoids an IndexError that would discard the whole episode.
            continue

        # Category names come from the first row of the table.
        headers = table_rows[0].select('td.category_name')
        categories = [header.get_text(strip=True) for header in headers]

        # Clues and answers come from the remaining rows.
        for row in table_rows[1:]:
            for idx, cell in enumerate(row.select('td.clue')):
                try:
                    # A regular value cell takes precedence over the
                    # daily-double one.
                    clue_value = _first_text(cell, ('td.clue_value', 'td.clue_value_daily_double'))
                    clue_text = _first_text(cell, ('.clue_text',))
                    clue_answer = _first_text(cell, ('.correct_response',))
                    # The clue's position within its row picks the category.
                    category_name = categories[idx] if idx < len(categories) else 'Unknown'
                    data.append([category_name, clue_value, clue_text, clue_answer])
                except Exception as e:
                    # Best effort: one malformed clue must not lose the rest.
                    print(f"Error processing clue in row {row}: {e}")

    return data

def save_to_csv(data, filename):
    """Write the scraped rows to ``filename`` as UTF-8 CSV.

    The file is overwritten and always starts with the fixed header
    Season, Episode, Category, Value, Question, Answer, followed by one
    line per item of ``data``.
    """
    column_names = ['Season', 'Episode', 'Category', 'Value', 'Question', 'Answer']
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', encoding='utf-8', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(column_names)
        for record in data:
            out.writerow(record)

def main(max_seasons=1, output_path='jeopardy_data.csv'):
    """Scrape J-Archive clues season by season and write them to a CSV file.

    Args:
        max_seasons: How many of the season links returned by
            ``fetch_season_urls`` to process, taken from the front of the
            list. The default of 1 matches the previous hard-coded limit;
            pass ``None`` to process every season.
        output_path: Path of the CSV file to write.

    A season or episode whose HTTP request fails is reported and skipped, so
    one bad page no longer discards everything collected so far.
    """
    base_url = 'https://www.j-archive.com'
    season_urls = fetch_season_urls(base_url)

    all_data = []
    failed_urls = []

    for season_url in season_urls[:max_seasons]:
        print(f"Processing {season_url}")

        # The season number is whatever follows the last '=' in the URL.
        season_number = season_url.split('=')[-1]

        try:
            episode_urls = fetch_episode_urls(season_url)
        except requests.RequestException as exc:
            print(f"  Skipping season {season_number}: {exc}")
            failed_urls.append(season_url)
            continue

        for episode_url in episode_urls:
            print(f"  Processing {episode_url}")
            try:
                episode_data = parse_episode_page(episode_url)
            except requests.RequestException as exc:
                print(f"  Skipping {episode_url}: {exc}")
                failed_urls.append(episode_url)
                continue

            # The episode id is whatever follows the last '=' in the URL.
            episode_number = episode_url.split('=')[-1].split('.')[0]

            # Prefix every clue row with its season and episode.
            all_data.extend([season_number, episode_number] + row for row in episode_data)

    save_to_csv(all_data, output_path)
    print(f"Data has been saved to {output_path}")
    if failed_urls:
        print(f"{len(failed_urls)} page(s) could not be fetched and were skipped.")

# Run the scraper only when this code is executed directly (as a script or as
# a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Processing https://www.j-archive.com/showseason.php?season=40
  Processing https://www.j-archive.com/showgame.php?game_id=8998
  Processing https://www.j-archive.com/showgame.php?game_id=8997
  Processing https://www.j-archive.com/showgame.php?game_id=8996
  Processing https://www.j-archive.com/showgame.php?game_id=8995
  Processing https://www.j-archive.com/showgame.php?game_id=8994
  Processing https://www.j-archive.com/showgame.php?game_id=8993
  Processing https://www.j-archive.com/showgame.php?game_id=8992
  Processing https://www.j-archive.com/showgame.php?game_id=8991
  Processing https://www.j-archive.com/showgame.php?game_id=8990
  Processing https://www.j-archive.com/showgame.php?game_id=8989
  Processing https://www.j-archive.com/showgame.php?game_id=8988
  Processing https://www.j-archive.com/showgame.php?game_id=8987
  Processing https://www.j-archive.com/showgame.php?game_id=8986
  Processing https://www.j-archive.com/showgame.php?game_id=8985
  Processing https://www.j-a

In [27]:
import pandas as pd

# Single source of truth for the output path, so the file that is written and
# the file named in the message below can never disagree again.
ALL_CATEGORIES_CSV = 'all_categories.csv'

# Load the data from the CSV file
jeopardy_data = pd.read_csv('jeopardy_data.csv')

# Extract the distinct values in the 'Category' column. Missing values are
# dropped first so a blank category does not show up as a NaN entry.
distinct_categories = jeopardy_data['Category'].dropna().unique()

# Create a new DataFrame for the distinct categories. The column is named
# 'valid categories' because that is the column the review step reads back.
valid_categories_df = pd.DataFrame(distinct_categories, columns=['valid categories'])

# Save the full candidate list; the review step reads this file and writes its
# own curated result to a separate file.
valid_categories_df.to_csv(ALL_CATEGORIES_CSV, index=False)

print(f"Valid categories have been saved to '{ALL_CATEGORIES_CSV}'")

Valid categories have been saved to 'valid_categories.csv'


In [4]:
import pandas as pd
import os

# Detect the platform for clearing the console
clear_command = 'cls' if os.name == 'nt' else 'clear'

# Load the candidate categories from the CSV file
categories_df = pd.read_csv('all_categories.csv')
candidates = categories_df['valid categories'].tolist()

# decisions[i] is the verdict for candidates[i]: True = keep, False = delete.
# Recording verdicts by position (instead of appending kept rows to a frame)
# lets 'b' really undo an earlier choice: the old verdict is discarded and
# replaced, rather than duplicated or left behind in the output.
decisions = {}


def _kept_categories_frame():
    """Build the frame of kept categories, in their original order."""
    kept = [candidates[i] for i in sorted(decisions) if decisions[i]]
    return pd.DataFrame({'valid categories': kept})


# Start with an empty result so the name exists even if there is nothing to review
valid_categories_df = _kept_categories_frame()

# Initialize the index
index = 0

while index < len(candidates):
    category = candidates[index]
    print(f"Category: {category}")

    # Wait for user input
    user_input = input("Press Enter to keep, 'd' and Enter to delete, or 'b' and Enter to go back: ").strip().lower()

    # Clear the screen
    os.system(clear_command)

    if user_input == 'b':
        if index > 0:
            # Step back and forget the earlier verdict so it can be re-made
            index -= 1
            decisions.pop(index, None)
            print("Going back to the previous category...")
        else:
            # Previously this fell through and silently kept the first category
            print("Already at the first category; nothing to go back to.")
    elif user_input == 'd':
        # Record the deletion and move on
        decisions[index] = False
        print(f"Deleted category: {category}")
        index += 1
    else:
        # Enter (or any other input) keeps the category and moves on
        decisions[index] = True
        index += 1

    # Persist after every step so an interrupted session loses nothing and the
    # file always reflects the current verdicts, including undone ones.
    valid_categories_df = _kept_categories_frame()
    valid_categories_df.to_csv('valid_categories.csv', index=False)

# Final save to ensure all valid categories are written to the file
valid_categories_df.to_csv('valid_categories.csv', index=False)

print("Finished processing. The valid categories have been saved to 'valid_categories.csv'.")

Category: A DATE IN HISTORY
[H[2JDeleted category: A DATE IN HISTORY
Category: IDIOMS & EXPRESSIONS
[H[2JCategory: A WORD FROM YOUR DOCTOR
[H[2JDeleted category: A WORD FROM YOUR DOCTOR
Category: ALSO A FISHING TERM
[H[2JDeleted category: ALSO A FISHING TERM
Category: A THING FOR FEATS
[H[2JDeleted category: A THING FOR FEATS
Category: WE CRACK OURSELVES UP
[H[2JCategory: CANALS
[H[2JDeleted category: CANALS
Category: "I" + 4
[H[2JDeleted category: "I" + 4
Category: TOO MUCH OF SOMETHING
[H[2JDeleted category: TOO MUCH OF SOMETHING
Category: TICK TOCK
[H[2JDeleted category: TICK TOCK
Category: FACE BOOKS
[H[2JDeleted category: FACE BOOKS
Category: INSTA-GRAHAM
[H[2JDeleted category: INSTA-GRAHAM
Category: FACTS ABOUT WORLD WAR I
[H[2JDeleted category: FACTS ABOUT WORLD WAR I
Category: THEY COME IN PAIRS
[H[2JDeleted category: THEY COME IN PAIRS
Category: WORDS IN A DICKENS TITLE
[H[2JDeleted category: WORDS IN A DICKENS TITLE
Category: PUTTING THE "T" IN T