In [1]:
# Install the scraper's third-party dependencies. `%pip` (rather than a bare
# `pip`, which only works while IPython automagic is enabled and no variable
# named `pip` exists) installs into the environment of the running kernel.
%pip install requests beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import requests
from bs4 import BeautifulSoup
import csv

def fetch_season_urls(base_url, timeout=30):
    """Return absolute URLs for every season link found on the page at ``base_url``.

    Args:
        base_url: Root URL of the site (with or without a trailing slash).
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        A list of URLs, one per anchor whose ``href`` starts with
        ``showseason.php?season=``, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    season_links = soup.select('a[href^="showseason.php?season="]')
    # Normalise the root so a trailing slash does not produce "//" in the result.
    site_root = base_url.rstrip('/')
    return [site_root + '/' + link['href'] for link in season_links]

def fetch_episode_urls(season_url, timeout=30):
    """Return absolute URLs for every episode link found on a season page.

    Args:
        season_url: URL of the season listing page.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        A list of URLs, one per anchor whose ``href`` starts with
        ``showgame.php?game_id=``, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from urllib.parse import urljoin

    response = requests.get(season_url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    episode_links = soup.select('a[href^="showgame.php?game_id="]')
    # Resolve each relative href against the season page the way a browser
    # would; unlike splitting on the last "/", this is not confused by a "/"
    # inside the query string.
    return [urljoin(season_url, link['href']) for link in episode_links]

def parse_episode_page(url, timeout=30):
    """Parse an episode page into ``[category, question, answer]`` rows.

    Each element matching ``.clue`` is paired with a category by its column
    position inside its own board table, instead of by assuming that every
    six consecutive clues belong to one category.

    Args:
        url: URL of the episode page.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        A list of ``[category_name, clue_text, clue_answer]`` lists in
        document order. Clues lacking a ``.clue_text`` or
        ``.correct_response`` element, or whose category cannot be resolved,
        are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    data = []
    for clue in soup.select('.clue'):
        text_cell = clue.select_one('.clue_text')
        answer_cell = clue.select_one('.correct_response')
        # Explicit checks replace the former bare `except`: only the expected
        # "cell has no clue/response" case is skipped, real errors propagate.
        if text_cell is None or answer_cell is None:
            continue
        category_name = _category_for_clue(clue)
        if category_name is None:
            continue
        data.append([category_name, text_cell.text.strip(), answer_cell.text.strip()])

    return data


def _category_for_clue(clue):
    """Return the category name heading the column that ``clue`` sits in, or None."""
    # NOTE(review): assumes each board is a <table> whose first row holds the
    # category cells and whose later rows each hold one `td.clue` per category,
    # left to right -- confirm against a live episode page.
    row = clue.find_parent('tr')
    board = row.find_parent('table') if row is not None else None
    if board is None:
        return None
    # Only direct children count: each clue cell nests its own inner table.
    row_clues = row.find_all('td', class_='clue', recursive=False)
    # Identity comparison: two different cells may have identical markup.
    column = next((i for i, cell in enumerate(row_clues) if cell is clue), None)
    categories = board.select('.category_name')
    if column is None or column >= len(categories):
        return None
    return categories[column].text.strip()

def save_to_csv(data, filename):
    """Write a header line followed by every row of ``data`` to ``filename``.

    The file is created (or overwritten) as UTF-8 text. ``newline=''`` leaves
    line-ending handling to the ``csv`` writer.
    """
    header = ['Season', 'Episode', 'Category', 'Question', 'Answer']
    with open(filename, 'w', encoding='utf-8', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for record in data:
            csv_out.writerow(record)

def main(max_seasons=1, max_episodes=1, output_file='jeopardy_data.csv'):
    """Scrape seasons and episodes from J-Archive and write the clues to a CSV file.

    Args:
        max_seasons: How many season URLs (from the front of the fetched list)
            to process. ``None`` processes all of them. Defaults to 1, the
            limit that was previously hard-coded.
        max_episodes: How many episode URLs per season to process. ``None``
            processes all of them. Defaults to 1, as previously hard-coded.
        output_file: Path of the CSV file to write.
    """
    base_url = 'https://www.j-archive.com'
    season_urls = fetch_season_urls(base_url)

    all_data = []

    # Slicing with None as the bound keeps the whole list.
    for season_url in season_urls[:max_seasons]:
        print(f"Processing {season_url}")
        episode_urls = fetch_episode_urls(season_url)

        # The season identifier is whatever follows the last "=" in the URL.
        season_number = season_url.split('=')[-1]

        for episode_url in episode_urls[:max_episodes]:
            print(f"  Processing {episode_url}")
            episode_data = parse_episode_page(episode_url)

            # The game id follows the last "="; anything after a "." is dropped.
            episode_number = episode_url.split('=')[-1].split('.')[0]

            # Prefix every clue row with the season and episode it came from.
            all_data.extend([season_number, episode_number] + row for row in episode_data)

    save_to_csv(all_data, output_file)
    print(f"Data has been saved to {output_file}")

# Run the scraper only when this code is executed directly (running the cell or
# the file as a script); importing it as a module does not start a scrape.
if __name__ == "__main__":
    main()

Processing https://www.j-archive.com/showseason.php?season=40
  Processing https://www.j-archive.com/showgame.php?game_id=8998
  Processing https://www.j-archive.com/showgame.php?game_id=8997
  Processing https://www.j-archive.com/showgame.php?game_id=8996
  Processing https://www.j-archive.com/showgame.php?game_id=8995
  Processing https://www.j-archive.com/showgame.php?game_id=8994
  Processing https://www.j-archive.com/showgame.php?game_id=8993
  Processing https://www.j-archive.com/showgame.php?game_id=8992
  Processing https://www.j-archive.com/showgame.php?game_id=8991
  Processing https://www.j-archive.com/showgame.php?game_id=8990
  Processing https://www.j-archive.com/showgame.php?game_id=8989
  Processing https://www.j-archive.com/showgame.php?game_id=8988
  Processing https://www.j-archive.com/showgame.php?game_id=8987
  Processing https://www.j-archive.com/showgame.php?game_id=8986
  Processing https://www.j-archive.com/showgame.php?game_id=8985
  Processing https://www.j-a