In [1]:
# Install required libraries (uncomment and run if needed)
# !pip install beautifulsoup4 selenium undetected-chromedriver

import os
import time
import json
from bs4 import BeautifulSoup
import undetected_chromedriver as uc


In [2]:
# Define data directories
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data')
PROBLEMS_DIR = os.path.join(DATA_DIR, 'problems')
METADATA_DIR = os.path.join(DATA_DIR, 'metadata')
EDITORIALS_DIR = os.path.join(DATA_DIR, 'editorials')

# Ensure directories exist
os.makedirs(PROBLEMS_DIR, exist_ok=True)
os.makedirs(METADATA_DIR, exist_ok=True)
os.makedirs(EDITORIALS_DIR, exist_ok=True)

print("Directories set up successfully!")


Directories set up successfully!


In [5]:
import requests

url = "https://codeforces.com/problemset"
response = requests.get(url)

if response.status_code == 200:
    with open("Codeforces_Problemset.html", "w", encoding="utf-8") as file:
        file.write(response.text)
    print("HTML content saved successfully.")
else:
    print(f"Failed to fetch HTML. Status code: {response.status_code}")


Failed to fetch HTML. Status code: 403


In [6]:
# Install undetected-chromedriver
!pip install undetected-chromedriver

# Import necessary libraries
import undetected_chromedriver.v2 as uc
from selenium.webdriver.common.by import By

# ... (Rest of your code remains the same) ...

# Set up undetected ChromeDriver
options = uc.ChromeOptions()
options.add_argument("--headless")  # Run Chrome in headless mode (no GUI)
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

# Initialize undetected ChromeDriver
driver = uc.Chrome(options=options)

# Target URL
url = "https://codeforces.com/problemset"

try:
    # Open the URL
    driver.get(url)
    
    # Wait for the page to load (optional)
    driver.implicitly_wait(5)

    # Get the HTML source
    html_content = driver.page_source

    # Save the HTML content to a file
    file_path = "Codeforces_Problemset.html"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(html_content)

    print(f"HTML content saved successfully to {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the browser
    driver.quit()


Defaulting to user installation because normal site-packages is not writeable


ModuleNotFoundError: No module named 'undetected_chromedriver.v2'

In [7]:
from bs4 import BeautifulSoup
import json
import os
import time
import undetected_chromedriver as uc
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def save_to_json(data_entry, contest_id, part_id):
    if os.path.isfile('data.json'):
        with open('data.json', 'r') as json_file:
            existing_data = json.load(json_file)
    else:
        existing_data = {}

    existing_data[f"{contest_id}{part_id}"] = data_entry

    with open('data.json', 'w') as json_file:
        json.dump(existing_data, json_file, indent=4)

def extract_solution(page_url, driver_instance, editorial_page, contest_id, part_id):
    driver_instance.get(editorial_page)
    time.sleep(2)
    
    try:
        WebDriverWait(driver_instance, 6).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'ttypography'))
        )
    except Exception:
        driver_instance.refresh()
        WebDriverWait(driver_instance, 6).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'ttypography'))
        )

    parsed_editorial = BeautifulSoup(driver_instance.page_source, "html.parser")
    
    # Logic to find the specific solution
    return find_specific_solution(parsed_editorial, contest_id, part_id)

def find_specific_solution(editorial_soup, contest_id, part_id):
    solutions_text = editorial_soup.find('div', class_='ttypography').get_text(strip=False)
    start_index = solutions_text.find(f"{contest_id}{part_id}")

    if start_index == -1:
        # Handling different formats for problem identification
        possible_formats = [
            f"{contest_id} {part_id}",
            f"{contest_id}-{part_id}",
            f"{part_id} {contest_id}",
            f"{part_id}-{contest_id}"
        ]
        for format in possible_formats:
            start_index = solutions_text.find(format)
            if start_index != -1:
                break

    if start_index != -1:
        end_index = solutions_text.find(contest_id, start_index + 1)
        return solutions_text[start_index:end_index] if end_index != -1 else solutions_text[start_index:]

    return None

def scrape_codeforces(problem_url, driver_instance):
    try:
        driver_instance.get(problem_url)
        time.sleep(2)
        
        WebDriverWait(driver_instance, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'problem-statement'))
        )

        page_soup = BeautifulSoup(driver_instance.page_source, "html.parser")
        question_title = page_soup.find('div', class_='title').text.strip()
        
        # Extracting description
        description_tags = page_soup.find('div', class_='problem-statement').find_all('p')

        clean_description = []
        for tag in description_tags:
            for unwanted in tag.find_all(['script', 'math']):
                unwanted.decompose()
            clean_description.append(tag.get_text(strip=False))

        description = '\n'.join(clean_description)

        tags = [tag.text.strip() for tag in page_soup.find_all('span', class_='tag-box')]
        time_constraint = page_soup.find(class_="time-limit").get_text(strip=True)
        memory_constraint = page_soup.find(class_="memory-limit").get_text(strip=True)

        parts = problem_url.split('/')
        part_id = parts[-1]
        contest_id = parts[-2]

        editorial_links = []
        for a_tag in page_soup.find_all('a', href=True): 
            title_text = a_tag.get('title', '').lower()
            link_text = a_tag.text.lower()

            if any(re.search(r'(?<!video\s)(tut(orials?)?)', s, re.IGNORECASE) for s in [title_text, link_text]) or \
               any(re.search(r'(?<!video\s)(edit(orials?)?)', s, re.IGNORECASE) for s in [title_text, link_text]):
                editorial_links.append(a_tag['href'])
        
        found_solution = "Solution not available"
        
        for link in editorial_links:
            editorial_url = f"https://codeforces.com{link}" if link.startswith("https://codeforces.com") else link
            try:
                found_solution = extract_solution(problem_url, driver_instance, editorial_url, contest_id, part_id)
                if found_solution:
                    break
            except Exception:
                found_solution = "Solution not found"

            time.sleep(2)

        output_data = {
            'title': question_title,
            'description': description,
            'time_limit': time_constraint,
            'memory_limit': memory_constraint,
            'tags': tags,
            'solution': found_solution
        }
        
        save_to_json(output_data, contest_id, part_id)

    finally:
        pass

def execute_scraping():
    chrome_driver = uc.Chrome()
    processed_links = set()
    try:
        for page_num in range(1, 8):  # Extracting 7 pages
            problem_set_url = f"https://codeforces.com/problemset/page/{page_num}?tags=1000-1200"
            chrome_driver.get(problem_set_url)
            WebDriverWait(chrome_driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'problems'))
            )

            problem_set_soup = BeautifulSoup(chrome_driver.page_source, "html.parser")
            problem_links = problem_set_soup.find_all('a', href=True)

            for link in problem_links:
                if link['href'].startswith("/problemset/problem"):
                    problem_url = f"https://codeforces.com{link['href']}"
                    if problem_url in processed_links:
                        continue
                    
                    print(problem_url)
                    chrome_driver.get(problem_url)
                    WebDriverWait(chrome_driver, 7).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'problem-statement'))
                    )
                    processed_links.add(problem_url)
                    scrape_codeforces(problem_url, chrome_driver)
                    time.sleep(2)
    finally:
        chrome_driver.quit()

if __name__ == "__main__":
    execute_scraping()


https://codeforces.com/problemset/problem/2050/C
https://codeforces.com/problemset/problem/2050/B
https://codeforces.com/problemset/problem/2048/C
https://codeforces.com/problemset/problem/2046/A
https://codeforces.com/problemset/problem/2044/D
https://codeforces.com/problemset/problem/2041/E
https://codeforces.com/problemset/problem/2041/B
https://codeforces.com/problemset/problem/2040/B
https://codeforces.com/problemset/problem/2039/C1
https://codeforces.com/problemset/problem/2039/B
https://codeforces.com/problemset/problem/2037/C
https://codeforces.com/problemset/problem/2036/C
https://codeforces.com/problemset/problem/2034/B
https://codeforces.com/problemset/problem/2032/B
https://codeforces.com/problemset/problem/2030/C
https://codeforces.com/problemset/problem/2029/B
https://codeforces.com/problemset/problem/2027/B
https://codeforces.com/problemset/problem/2025/B
https://codeforces.com/problemset/problem/2024/B
https://codeforces.com/problemset/problem/2021/B
https://codeforces.

TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x0124EC13+23731]
	(No symbol) [0x011DC394]
	(No symbol) [0x010BBE63]
	(No symbol) [0x010FFCE6]
	(No symbol) [0x010FFF2B]
	(No symbol) [0x0113D892]
	(No symbol) [0x01121EA4]
	(No symbol) [0x0113B46E]
	(No symbol) [0x01121BF6]
	(No symbol) [0x010F3F35]
	(No symbol) [0x010F4EBD]
	GetHandleVerifier [0x0152F0D3+3039603]
	GetHandleVerifier [0x01542DEA+3120778]
	GetHandleVerifier [0x0153B592+3089970]
	GetHandleVerifier [0x012E43B0+635984]
	(No symbol) [0x011E4DCD]
	(No symbol) [0x011E2068]
	(No symbol) [0x011E2205]
	(No symbol) [0x011D4FD0]
	BaseThreadInitThunk [0x761F5D49+25]
	RtlInitializeExceptionChain [0x7712CEBB+107]
	RtlGetAppContainerNamedObjectPath [0x7712CE41+561]


In [8]:
from bs4 import BeautifulSoup
import json
import os
import time
import undetected_chromedriver as uc
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def save_to_json(data_entry, contest_id, part_id):
    if os.path.isfile('data.json'):
        with open('data.json', 'r') as json_file:
            existing_data = json.load(json_file)
    else:
        existing_data = {}

    existing_data[f"{contest_id}{part_id}"] = data_entry

    with open('data.json', 'w') as json_file:
        json.dump(existing_data, json_file, indent=4)

def extract_solution(page_url, driver_instance, editorial_page, contest_id, part_id):
    driver_instance.get(editorial_page)
    time.sleep(2)
    
    try:
        WebDriverWait(driver_instance, 15).until(
            EC.visibility_of_element_located((By.CLASS_NAME, 'ttypography'))
        )
    except TimeoutException:
        print(f"Timeout while loading editorial page: {editorial_page}.")
        return "Solution not found."

    parsed_editorial = BeautifulSoup(driver_instance.page_source, "html.parser")
    return find_specific_solution(parsed_editorial, contest_id, part_id)

def find_specific_solution(editorial_soup, contest_id, part_id):
    solutions_text = editorial_soup.find('div', class_='ttypography').get_text(strip=False)
    start_index = solutions_text.find(f"{contest_id}{part_id}")

    if start_index == -1:
        # Checking other possible formats for finding the problem statement
        possible_formats = [
            f"{contest_id} {part_id}",
            f"{contest_id}-{part_id}",
            f"{part_id} {contest_id}",
            f"{part_id}-{contest_id}"
        ]
        for format in possible_formats:
            start_index = solutions_text.find(format)
            if start_index != -1:
                break

    if start_index != -1:
        end_index = solutions_text.find(contest_id, start_index + 1)
        return solutions_text[start_index:end_index] if end_index != -1 else solutions_text[start_index:]

    return "Solution not found."

def scrape_codeforces(problem_url, driver_instance):
    try:
        driver_instance.get(problem_url)
        time.sleep(2)
        
        try:
            WebDriverWait(driver_instance, 15).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'problem-statement'))
            )
        except TimeoutException:
            print(f"Timeout while waiting for problem statement on: {problem_url}. Refreshing...")
            driver_instance.refresh()
            WebDriverWait(driver_instance, 15).until(
                EC.visibility_of_element_located((By.CLASS_NAME, 'problem-statement'))
            )

        page_soup = BeautifulSoup(driver_instance.page_source, "html.parser")
        question_title = page_soup.find('div', class_='title').text.strip()

        # Extracting problem description
        description_tags = page_soup.find('div', class_='problem-statement').find_all('p')

        clean_description = []
        for tag in description_tags:
            for unwanted in tag.find_all(['script', 'math']):
                unwanted.decompose()
            clean_description.append(tag.get_text(strip=False))

        description = '\n'.join(clean_description)

        # Extracting tags
        tags = [tag.text.strip() for tag in page_soup.find_all('span', class_='tag-box')]
        time_constraint = page_soup.find(class_="time-limit").get_text(strip=True)
        memory_constraint = page_soup.find(class_="memory-limit").get_text(strip=True)

        parts = problem_url.split('/')
        part_id = parts[-1]
        contest_id = parts[-2]

        editorial_links = []
        for a_tag in page_soup.find_all('a', href=True): 
            title_text = a_tag.get('title', '').lower()
            link_text = a_tag.text.lower()

            if any(re.search(r'(?<!video\s)(tut(orials?)?)', s, re.IGNORECASE) for s in [title_text, link_text]) or \
               any(re.search(r'(?<!video\s)(edit(orials?)?)', s, re.IGNORECASE) for s in [title_text, link_text]):
                editorial_links.append(a_tag['href'])
        
        found_solution = "Solution not available"
        
        for link in editorial_links:
            editorial_url = f"https://codeforces.com{link}" if link.startswith("https://codeforces.com") else link
            try:
                found_solution = extract_solution(problem_url, driver_instance, editorial_url, contest_id, part_id)
                if found_solution and found_solution != "Solution not found.":
                    break
            except Exception as e:
                print(f"Error while extracting solution from {editorial_url}: {str(e)}")

            time.sleep(2)

        output_data = {
            'title': question_title,
            'description': description,
            'time_limit': time_constraint,
            'memory_limit': memory_constraint,
            'tags': tags,
            'solution': found_solution
        }
        
        save_to_json(output_data, contest_id, part_id)

    except Exception as e:
        print(f"An error occurred while scraping {problem_url}: {str(e)}.")

def execute_scraping():
    chrome_driver = uc.Chrome()
    processed_links = set()
    try:
        for page_num in range(1, 8):  # Extracting 7 pages
            problem_set_url = f"https://codeforces.com/problemset/page/{page_num}?tags=1000-1200"
            print(f"Scraping problem set from: {problem_set_url}")
            chrome_driver.get(problem_set_url)
            WebDriverWait(chrome_driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'problems'))
            )

            problem_set_soup = BeautifulSoup(chrome_driver.page_source, "html.parser")
            problem_links = problem_set_soup.find_all('a', href=True)

            for link in problem_links:
                if link['href'].startswith("/problemset/problem"):
                    problem_url = f"https://codeforces.com{link['href']}"
                    if problem_url in processed_links:
                        continue
                    
                    print(f"Processing problem URL: {problem_url}")
                    scrape_codeforces(problem_url, chrome_driver)
                    processed_links.add(problem_url)
                    time.sleep(2)
    finally:
        chrome_driver.quit()

if __name__ == "__main__":
    execute_scraping()


Scraping problem set from: https://codeforces.com/problemset/page/1?tags=1000-1200
Processing problem URL: https://codeforces.com/problemset/problem/2050/C
Error while extracting solution from /blog/entry/137018: Message: invalid argument
  (Session info: chrome=131.0.6778.205)
Stacktrace:
	GetHandleVerifier [0x0102EC13+23731]
	(No symbol) [0x00FBC394]
	(No symbol) [0x00E9BCD9]
	(No symbol) [0x00E8CF21]
	(No symbol) [0x00E8B84B]
	(No symbol) [0x00E8BE2B]
	(No symbol) [0x00E9E85E]
	(No symbol) [0x00F1BE87]
	(No symbol) [0x00F01E5C]
	(No symbol) [0x00F1B46E]
	(No symbol) [0x00F01BF6]
	(No symbol) [0x00ED3F35]
	(No symbol) [0x00ED4EBD]
	GetHandleVerifier [0x0130F0D3+3039603]
	GetHandleVerifier [0x01322DEA+3120778]
	GetHandleVerifier [0x0131B592+3089970]
	GetHandleVerifier [0x010C43B0+635984]
	(No symbol) [0x00FC4DCD]
	(No symbol) [0x00FC2068]
	(No symbol) [0x00FC2205]
	(No symbol) [0x00FB4FD0]
	BaseThreadInitThunk [0x761F5D49+25]
	RtlInitializeExceptionChain [0x7712CEBB+107]
	RtlGetAppCon

KeyboardInterrupt: 