In [34]:
import requests
from bs4 import BeautifulSoup

# URL of the Codeforces problemset sorted by most solved
URL = "https://codeforces.com/problemset?order=BY_SOLVED_DESC"

# Send a GET request to the webpage
HEADERS = {"User-Agent": "Mozilla/5.0"}  # Prevent request blocking
response = requests.get(URL, headers=HEADERS)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to fetch the webpage: {response.status_code}")
    exit()

# Parse the HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Extract unique problem links
problem_links = {
    f"https://codeforces.com{link['href']}"
    for link in soup.find_all('a', href=True) 
    if 'problemset/problem/' in link['href']
}

# Convert to list and display the first 3 links
print(list(problem_links)[:3])

['https://codeforces.com/problemset/problem/1512/A', 'https://codeforces.com/problemset/problem/155/A', 'https://codeforces.com/problemset/problem/467/A']


In [46]:
def get_problem_statement(url):
    """Fetches and extracts the problem statement from a given Codeforces problem URL."""
    response = requests.get(url, headers=HEADERS)

    if response.status_code != 200:
        print(f"Failed to fetch {url}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    statement_div = soup.find("div", class_="problem-statement")

    if not statement_div:
        print(f"No problem statement found for {url}")
        return None

    # Extract and flatten all text
    return " ".join(statement_div.stripped_strings)


# Extract statements for the first 20 problems
problem_statements = {
    url: get_problem_statement(url) for url in list(problem_links)[:20]
}

# Display first few statements
# for url, statement in problem_statements.items():
    # print(f"🔹 {url}\n{statement[-300:-1]}...\n")  # Show first 300 chars

In [47]:
import os

# Create the directory if it doesn't exist
STATEMENTS_DIR = "scraped/statements"
os.makedirs(STATEMENTS_DIR, exist_ok=True)

def save_problem_statements(problem_statements):
    """Saves problem statements to files named as {id}-{letter}.txt"""
    for url, statement in list(problem_statements.items())[:20]:  # First 20 statements
        # Extract problem ID and letter from URL (e.g., "/problemset/problem/750/A")
        parts = url.split("/")
        problem_id, letter = parts[-2], parts[-1]
        filename = f"{problem_id}-{letter.lower()}.txt"
        
        # Save statement to file
        with open(os.path.join(STATEMENTS_DIR, filename), "w", encoding="utf-8") as file:
            file.write(statement)
        
        print(f"✅ Saved: {filename}")

# Save the first 20 problem statements
save_problem_statements(problem_statements)

✅ Saved: 1512-a.txt
✅ Saved: 155-a.txt
✅ Saved: 467-a.txt
✅ Saved: 443-a.txt
✅ Saved: 750-a.txt
✅ Saved: 1692-a.txt
✅ Saved: 1335-a.txt
✅ Saved: 58-a.txt
✅ Saved: 1807-a.txt
✅ Saved: 451-a.txt
✅ Saved: 1475-a.txt
✅ Saved: 1352-a.txt
✅ Saved: 405-a.txt
✅ Saved: 118-a.txt
✅ Saved: 228-a.txt
✅ Saved: 266-b.txt
✅ Saved: 112-a.txt
✅ Saved: 59-a.txt
✅ Saved: 271-a.txt
✅ Saved: 148-a.txt


In [44]:
### Does not work for now.

import requests
from bs4 import BeautifulSoup

def get_csrf_token(session, url):
    """Fetch the CSRF token from the problem status page."""
    response = session.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}: {response.status_code}")
        return None
    
    soup = BeautifulSoup(response.text, "html.parser")
    token_tag = soup.find("meta", {"name": "X-Csrf-Token"})
    
    return token_tag["content"] if token_tag else None


def get_wrong_answer_submissions(problem_id, index="A"):
    """Fetches submission links for a specific Codeforces problem where the verdict is 'WRONG_ANSWER'."""
    
    # Base URLs
    base_url = f"https://codeforces.com/problemset/status/{problem_id}/problem/{index}"
    
    # Headers & Payload
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:134.0) Gecko/20100101 Firefox/134.0",
        "Referer": base_url,
        "Origin": "https://codeforces.com",
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        # "csrf_token": csrf_token,  # Use dynamically fetched token
        "action": "setupSubmissionFilter",
        "frameProblemIndex": index,
        "verdictName": "WRONG_ANSWER",
        "programTypeForInvoker": "anyProgramTypeForInvoker",
        "comparisonType": "GREATER_OR_EQUAL",
        "judgedTestCount": "10",
        "participantSubstring": "",
        "_tta": "24"
    }

    # Send POST request
    response = requests.post(base_url, headers=headers, data=data)

    # Save response to file for debugging
    with open("codeforces_response.html", "w", encoding="utf-8") as file:
        file.write(response.text)
    
    print("Saved response to codeforces_response.html")

    if response.status_code != 200:
        print(f"Failed to fetch submissions: {response.status_code}")
        return []

    # Parse HTML response
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract all submission links
    submission_links = [
        f"https://codeforces.com{link['href']}"
        for link in soup.find_all("a", href=True)
        if "/problemset/submission/" in link["href"]
    ]

    return submission_links

# Example: Fetch WA submissions for problem 4A (Watermelon)
problem_id = "4"
submission_links = get_wrong_answer_submissions(problem_id)

# Display first 10 submission links
print("\n".join(submission_links[:10]))

Saved response to codeforces_response.html

