In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:

# URL for WNBA boxscores
url = "https://www.basketball-reference.com/wnba/boxscores/"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the section containing the boxscores
    boxscores = soup.find_all('div', class_='game_summary expanded nohover')
    
    # Iterate through each boxscore and extract relevant data
    for boxscore in boxscores:
        teams = boxscore.find_all('td', class_='right gamelink')
        for team in teams:
            team_name = team.find('a').text
            print(f"Team: {team_name}")
        
        # Add more extraction logic as needed
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

# You can add further processing and data extraction as needed



In [13]:
class WebCrawler:
    def __init__(self, url):
        self.url = url
        self.html_content = None
        self.soup = None

    def fetch_html(self):
        """Fetches HTML content from the specified URL."""
        response = requests.get(self.url)
        if response.status_code == 200:
            self.html_content = response.text
            self.soup = BeautifulSoup(self.html_content, 'html.parser')
            print("HTML content fetched successfully.")
        else:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")

    def save_html_to_file(self, filename):
        """Saves the fetched HTML content to a file."""
        if self.html_content:
            with open(filename, "w", encoding='utf-8') as file:
                file.write(self.html_content)
            print(f"HTML content saved to {filename}")
        else:
            print("No HTML content to save. Please fetch the HTML content first.")

    def print_html(self):
        """Prints the fetched HTML content to the console."""
        if self.html_content:
            print(self.html_content)
        else:
            print("No HTML content to print. Please fetch the HTML content first.")

    def extract_csk_ids(self):
        """Extracts all 'csk' attributes from the specified elements."""
        if not self.soup:
            print("No HTML content parsed. Please fetch the HTML content first.")
            return []

        # Find all <th> elements with the specified attributes
        th_elements = self.soup.find_all('th', {'scope': 'row', 'class': 'left ', 'data-stat': 'date_game'})

        # Extract the 'csk' attribute values
        csk_ids = [th['csk'] for th in th_elements if 'csk' in th.attrs]
        
        return csk_ids

# Example usage
if __name__ == "__main__":
    # URL for WNBA games
    url = "https://www.basketball-reference.com/wnba/years/2024_games.html"

    # Create an instance of WebCrawler
    crawler = WebCrawler(url)

    # Fetch HTML content
    crawler.fetch_html()

    # Save HTML content to a file
    crawler.save_html_to_file("wnba_2024_games.html")

    # Extract and print all 'csk' ids
    csk_ids = crawler.extract_csk_ids()
    print("Extracted 'csk' IDs:", csk_ids)


HTML content fetched successfully.
HTML content saved to wnba_2024_games.html
Extracted 'csk' IDs: []
