In [174]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

import time
import json
import os

## kaggle

- Gather links for each competition

In [120]:
base_url = 'https://www.kaggle.com/competitions?sortOption=reward&page='

# Create a WebDriver instance for Chrome


def kaggle_comp_links(start_page, end_page):
    base_url = 'https://www.kaggle.com/competitions?sortOption=reward&page='
    top_competitions = []

    driver = webdriver.Chrome()
    driver.maximize_window()

    # Note, it seems that we can only scrape 17 pages at a time. Adjust range() to adjust pages you want to scrape
    for page in range(start_page, end_page+1):
        time.sleep(1.5)
        if page == 1:
            url = 'https://www.kaggle.com/competitions?sortOption=reward'  # First page URL
        else:
            url = f'{base_url}{page}'  # Subsequent pages URL
        
        # Visit the page
        driver.get(url)
        
        # Optional: Click all competitions button if needed - assuming this needs to be clicked each time
        if page == 1:
            try:
                button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="site-content"]/div[2]/div/div[4]/div/div[2]/div/div[1]/button[1]'))
                )
                button.click()
            except Exception as e:
                print(f"Failed to click 'All Competitions' button on page {page}: {str(e)}")
        
        # Wait for the links to be visible and collect them
        try:
            competition_links = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="site-content"]/div[2]/div/div[5]/div/div/div/ul/li/div/a'))
            )
            top_competitions.extend([link.get_attribute('href') for link in competition_links])
        except Exception as e:
            print(f"Failed to collect links on page {page}: {str(e)}")
    driver.quit()
    return top_competitions

- Difficulty in scraping all 34 pages at once, split into two part.

In [123]:
top_competitions_1 = kaggle_comp_links(1, 18)
top_competitions_2 = kaggle_comp_links(19, 34)

In [124]:
len(top_competitions_1), len(top_competitions_2)

(360, 315)

In [125]:
top_competitions_1[:2]

['https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2',
 'https://www.kaggle.com/competitions/passenger-screening-algorithm-challenge']

- Scraping data from each competition

In [126]:
def page_extraction(top_competitions):
    competition_data = []

    for url in top_competitions:
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(1)  # Ensure the page loads completely
        
        # Extract competition names
        try:
            competition_name = WebDriverWait(driver, 2).until(
                EC.visibility_of_element_located((By.XPATH, '//*[@id="site-content"]/div[2]/div/div/div[2]/div[2]/div[1]/h1'))
            ).text
        except:
            competition_name = "Competition name not found"

        # Extract the overview text
        try:
            overview_text = WebDriverWait(driver, 2).until(
                EC.visibility_of_element_located((By.XPATH, '//*[@id="abstract"]/div[1]/div[2]/div/p'))
            ).text
        except:
            overview_text = "Overview text not found"

        # Extract all paragraphs in the description section
        try:
            description_paragraphs = WebDriverWait(driver, 2).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="description"]/div/div[2]/div/div/p'))
            )
            description_text = ' '.join([para.text for para in description_paragraphs])
        except:
            description_text = "Description text not found"

        try:
            driver.get(url + '/data')
            time.sleep(1)
            dataset_paragraphs = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@id="site-content"]/div[2]/div/div/div[6]/div[1]/div[1]/div/div[2]/div/div[1]/div/div/div/p'))
            )
            dataset_description = ' '.join([para.text for para in dataset_paragraphs])
        except:
            dataset_description = "Dataset description not found"

        # Store the competition url, overview text, and description text
        competition_data.append({
            'name' : competition_name,
            'url': url,
            'overview_text': overview_text,
            'description_text': description_text,
            'dataset_text' : dataset_description
        })
    return competition_data


In [84]:
competition_data_2 = page_extraction(top_competitions_2)
# Output the data
for data in competition_data_2[:5]:
    print(data)

{'name': 'CHALEARN Gesture Challenge 2', 'url': 'https://www.kaggle.com/competitions/GestureChallenge2', 'overview_text': 'Overview text not found', 'description_text': 'This competition is identitical to the first round of the CHALEARN gesture challenge, the only difference is that is will be judged on new fresh final evaluation data. Keep informed of new data releases and new events, sign up to the gesturechallenge group. This challenge is organized by CHALEARN and is sponsored in part by Microsoft (Kinect for Xbox 360). Other sponsors include Texas Instrument. This effort was initiated by the DARPA Deep Learning program and is supported by the US National Science Foundation (NSF) under grants ECCS 1128436 and ECCS 1128296 , the EU Pascal2 network of excellence. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the sponsors.', 'dataset_text': '  We are portraying a single user in 

In [85]:
output_directory = '../Data'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Define the output file path
output_file_path = os.path.join(output_directory, 'kaggle_data_first_page_2.json')

# Write the data to a JSON file
with open(output_file_path, 'w') as json_file:
    json.dump(competition_data_2, json_file, indent=4)

print(f"Data successfully written to {output_file_path}")

Data successfully written to ../Data/kaggle_data_first_page_2.json


#### Checking the length

In [87]:
with open('../Data/kaggle_data_first_page_2.json', 'r') as file:
    kaggle_comps = json.load(file)

len(kaggle_comps)

334

## Section 2: Eval.Ai

- Open the major website

In [104]:
website = 'https://eval.ai/web/challenges/list'
# Create a WebDriver instance for Chrome
driver = webdriver.Chrome()
driver.maximize_window()
# Visit the website
driver.get(website)

- Gather the competition webpage links

In [105]:
# Wait for the page to load (this might require adjusting depending on page load time)
driver.implicitly_wait(10)  # Adjust the wait time as necessary

base_xpath = '//*[@id="page-wrap"]/div/div/div/ui-view/ui-view/section/div[2]/div'

# List to hold links
competition_links = []

# Loop through the first two competition divs
for i in range(1, 3):  # Since XPath index starts at 1 and we need first two competitions
    competition_xpath = f'{base_xpath}[{i}]/a'
    # Find the <a> element and get the href attribute
    competition_link = driver.find_element(By.XPATH, competition_xpath).get_attribute('href')
    competition_links.append(competition_link)

# Print the links
for data in competition_links:
    print(data)

https://eval.ai/web/challenges/challenge-page/2429
https://eval.ai/web/challenges/challenge-page/2418


- Scraping from each competition

In [106]:
competition_data_eval = []
for url in competition_links:
    driver.get(url)
    time.sleep(1)  # Ensure the page loads completely
    try:
        paragraphs_xpath = '//*[@id="page-wrap"]/div/div/div/ui-view/ui-view/ui-view/section/div/div[2]/div/div/p'

        # Wait until the presence of all paragraph elements is located
        description_paragraphs = WebDriverWait(driver, 2).until(
            EC.presence_of_all_elements_located((By.XPATH, paragraphs_xpath))
        )

        # Extract text from each paragraph
        competition_overview = ' '.join([paragraph.text for paragraph in description_paragraphs])
    except:
        competition_overview = "Overview text not found"

    try: 
        name_xpath = '//*[@id="page-wrap"]/div/div/div/ui-view/ui-view/section/div/div[1]/div[2]/div/h4'
        competition_name = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located((By.XPATH, name_xpath))
        ).text
    except:
        name = "Name of competition not found"
    competition_data_eval.append({'url' : url, 
                                'overview' : competition_overview,
                                'name' : competition_name})        

competition_data_eval
    

[{'url': 'https://eval.ai/web/challenges/challenge-page/2429',
  'overview': 'Surgical action triplet detection To detect surgical activities as triplets of {`instruments, verb, target`} where :',
  'name': 'CholecTriplet Challenge Detection Evaluation'},
 {'url': 'https://eval.ai/web/challenges/challenge-page/2418',
  'overview': "OpenAD is the first open-world 3D object detection benchmark for autonomous driving. We meticulously selected 2,000 scenes from 5 public datasets and annotated 6,597 3D corner cases for these scenes. Together with the original annotations of these scenes, there are 19,761 objects belonging to 206 different categories. You can utilize OpenAD to evaluate your model's open-world capabilities, encompassing scene generalization, cross-vehicle-type adaptability, open-vocabulary proficiency, and corner case detection aptitude. We provide a toolkit to organize data, load data, and evaluate your model with simple commands. Access the data and code here.",
  'name': '

In [139]:
driver.quit()

## drivendata

In [141]:
website = 'https://www.drivendata.org/competitions/search/?sort=total_prize_purse'
# Create a WebDriver instance for Chrome
driver = webdriver.Chrome()
driver.maximize_window()
# Visit the website
driver.get(website)

In [145]:
try:
    # Wait for the competition list div to load and locate it using its ID
    competition_list_div = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, "competition-list"))
    )

    # Find all <a> tags within nested layers of the competition list div that have the specific class
    competition_links = competition_list_div.find_elements(By.XPATH, ".//a[@class='text-decoration-none'][@href]")

    # Extract href attributes from the first five links only (corrected limit comment)
    hrefs = [link.get_attribute('href') for link in competition_links]  # Limit to first five links
    
    # Output the collected links
    for href in hrefs:
        print(href)
except:
    # Clean up: close the browser window
    hrefs = []

https://www.drivendata.org/competitions/group/nist-federated-learning/
https://www.drivendata.org/competitions/group/nih-nia-alzheimers-adrd-competition/
https://www.drivendata.org/competitions/group/uk-federated-learning/
https://www.drivendata.org/competitions/group/reclamation-water-supply-forecast/
https://www.drivendata.org/competitions/group/competition-reclamation-snow-water/
https://www.drivendata.org/competitions/group/image-similarity-challenge/
https://www.drivendata.org/competitions/group/competition-differential-privacy-deid2/
https://www.drivendata.org/competitions/group/meta-video-similarity/
https://www.drivendata.org/competitions/group/hateful-memes/
https://www.drivendata.org/competitions/group/cdc-narratives/
https://www.drivendata.org/competitions/217/cdc-fall-narratives/
https://www.drivendata.org/competitions/63/genetic-engineering-attribution/
https://www.drivendata.org/competitions/group/competition-nasa-airport-pushback/
https://www.drivendata.org/competitions/

In [146]:
hrefs

['https://www.drivendata.org/competitions/group/nist-federated-learning/',
 'https://www.drivendata.org/competitions/group/nih-nia-alzheimers-adrd-competition/',
 'https://www.drivendata.org/competitions/group/uk-federated-learning/',
 'https://www.drivendata.org/competitions/group/reclamation-water-supply-forecast/',
 'https://www.drivendata.org/competitions/group/competition-reclamation-snow-water/',
 'https://www.drivendata.org/competitions/group/image-similarity-challenge/',
 'https://www.drivendata.org/competitions/group/competition-differential-privacy-deid2/',
 'https://www.drivendata.org/competitions/group/meta-video-similarity/',
 'https://www.drivendata.org/competitions/group/hateful-memes/',
 'https://www.drivendata.org/competitions/group/cdc-narratives/',
 'https://www.drivendata.org/competitions/217/cdc-fall-narratives/',
 'https://www.drivendata.org/competitions/63/genetic-engineering-attribution/',
 'https://www.drivendata.org/competitions/group/competition-nasa-airport-

In [147]:
main_competition_links = hrefs

# Dictionary to hold all sub-competition links for each main competition
all_sub_competition_links = {}

# Iterate over each main competition link
for main_link in main_competition_links:
    driver.get(main_link)
    try:
        # Wait for the sub-competition divs to load
        sub_competition_divs = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "competition-subgroup"))
        )
        # Collect all hrefs from <a> tags within each subgroup
        sub_competition_hrefs = []
        for div in sub_competition_divs:
            sub_competition_links = div.find_elements(By.XPATH, ".//a[@href]")
            for link in sub_competition_links:
                href = link.get_attribute('href')
                if href not in sub_competition_hrefs:
                    sub_competition_hrefs.append(href)

        # Store the collected sub-competition links
        all_sub_competition_links[main_link] = sub_competition_hrefs
    except Exception as e:
        print(f"Error processing {main_link}: {str(e)}")
        all_sub_competition_links[main_link] = []

# Output the collected links for each competition
for main_link, sub_links in all_sub_competition_links.items():
    print(f"Main Competition: {main_link}")
    for link in sub_links:
        print(f"  Sub-Competition: {link}")

Error processing https://www.drivendata.org/competitions/217/cdc-fall-narratives/: Message: 

Error processing https://www.drivendata.org/competitions/63/genetic-engineering-attribution/: Message: 

Error processing https://www.drivendata.org/competitions/78/overhead-geopose-challenge/: Message: 

Error processing https://www.drivendata.org/competitions/48/identify-fish-challenge/: Message: 

Error processing https://www.drivendata.org/competitions/96/beluga-whales/: Message: 

Error processing https://www.drivendata.org/competitions/298/literacy-screening/: Message: 

Error processing https://www.drivendata.org/competitions/252/ai-research-assistants/: Message: 

Error processing https://www.drivendata.org/competitions/143/tick-tick-bloom/: Message: 

Error processing https://www.drivendata.org/competitions/97/nasa-mars-gcms/: Message: 

Error processing https://www.drivendata.org/competitions/93/nasa-mars-spectrometry/: Message: 

Error processing https://www.drivendata.org/competiti

KeyboardInterrupt: 

In [149]:
# Assuming you have the webdriver and main_competition_links defined
main_competition_links = hrefs

# Dictionary to hold all sub-competition links for each main competition
all_sub_competition_links = {}

# Setup the webdriver
driver = webdriver.Chrome()  # Adjust this line if you use a different browser

# Iterate over each main competition link
for main_link in main_competition_links:
    driver.get(main_link)
    try:
        # Attempt to find sub-competition divs
        sub_competition_divs = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "competition-subgroup"))
        )
        
        # Collect all hrefs from <a> tags within each subgroup
        sub_competition_hrefs = []
        for div in sub_competition_divs:
            sub_competition_links = div.find_elements(By.XPATH, ".//a[@href]")
            for link in sub_competition_links:
                href = link.get_attribute('href')
                if href not in sub_competition_hrefs:
                    sub_competition_hrefs.append(href)

        # Store the collected sub-competition links
        if sub_competition_hrefs:
            all_sub_competition_links[main_link] = sub_competition_hrefs
        else:
            # If no <a> tags are found within subgroups
            all_sub_competition_links[main_link] = [main_link]

    except Exception as e:
        # If error is thrown, assume no sub-competitions are present
        print(f"No sub-competitions found for {main_link}, using main link as sub-competition.")
        all_sub_competition_links[main_link] = [main_link]

# Close the driver
driver.quit()

# Output the collected links for each competition
for main_link, sub_links in all_sub_competition_links.items():
    print(f"Main Competition: {main_link}")
    for link in sub_links:
        print(f"  Sub-Competition: {link}")

No sub-competitions found for https://www.drivendata.org/competitions/217/cdc-fall-narratives/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/competitions/63/genetic-engineering-attribution/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/competitions/78/overhead-geopose-challenge/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/competitions/48/identify-fish-challenge/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/competitions/96/beluga-whales/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/competitions/298/literacy-screening/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/competitions/252/ai-research-assistants/, using main link as sub-competition.
No sub-competitions found for https://www.drivendata.org/comp

In [161]:
list(all_sub_competition_links.items())[-1]
flattened_links = [item for sublist in all_sub_competition_links.values() for item in sublist]

In [177]:
all_sub_competition_links

{'https://www.drivendata.org/competitions/group/nist-federated-learning/': ['https://www.drivendata.org/competitions/98/nist-federated-learning-1/',
  'https://www.drivendata.org/competitions/search/?category=privacy',
  'https://www.drivendata.org/competitions/search/?type=privacy',
  'https://www.drivendata.org/competitions/105/nist-federated-learning-2-financial-crime-federated/',
  'https://www.drivendata.org/competitions/144/nist-federated-learning-2-financial-crime-centralized/',
  'https://www.drivendata.org/competitions/103/nist-federated-learning-2-pandemic-forecasting-federated/',
  'https://www.drivendata.org/competitions/145/nist-federated-learning-2-pandemic-forecasting-centralized/',
  'https://www.drivendata.org/competitions/139/nist-federated-learning-3-red-teams/'],
 'https://www.drivendata.org/competitions/group/nih-nia-alzheimers-adrd-competition/': ['https://www.drivendata.org/competitions/301/prepare-challenge-phase-2-report-arena/',
  'https://www.drivendata.org/c

In [182]:
filtered_links = [link for link in flattened_links if 'https://www.drivendata.org/competitions/search/' not in link]

In [189]:
# Setting up Chrome options
options = Options()
options.add_argument("--start-maximized")

# Dictionary to hold the description links
description_links = {}

# Function to scrape description links
def scrape_description_links(link, driver):
    try:
        # Set a page load timeout
        driver.set_page_load_timeout(30)  # Timeout after 30 seconds
        driver.get(link)
        
        # List to hold the description section links
        section_links = []
        # Wait for the li elements to be present, or timeout after 15 seconds
        # lis = WebDriverWait(driver, 15).until(
        #     EC.presence_of_all_elements_located((By.XPATH, '//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li'))
        # )
        lis = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="main-container"]/div/div[2]/div[1]/div[contains(@class, "nav-subpages")]/ul/li'))
        )
        for li in lis:
            a_tag = li.find_element(By.TAG_NAME, 'a')
            href = a_tag.get_attribute('href')
            section_links.append(href)
        
        # Store the list of description links under the corresponding competition link
        description_links[link] = section_links
        return True  # Indicate success
    except TimeoutException:
        print(f"Timeout while processing {link}")
        return False  # Indicate failure
    except Exception as e:
        print(f"Error processing {link}: {e}")
        return False  # Indicate failure

# Set up the WebDriver
driver = webdriver.Chrome(options=options)

# Counters for tracking progress
total_competitions = len(filtered_links)
successful_scrapes = 0
failed_scrapes = 0

# Iterate over all links
for index, link in enumerate(filtered_links, start=1):
    print(f"Processing {index} of {total_competitions}: {link}")
    if scrape_description_links(link, driver):
        successful_scrapes += 1
    else:
        failed_scrapes += 1

driver.quit()

# Report the results
print(f"Completed scraping. Successful: {successful_scrapes}, Failed: {failed_scrapes}")


Processing 1 of 100: https://www.drivendata.org/competitions/98/nist-federated-learning-1/
Processing 2 of 100: https://www.drivendata.org/competitions/105/nist-federated-learning-2-financial-crime-federated/
Processing 3 of 100: https://www.drivendata.org/competitions/144/nist-federated-learning-2-financial-crime-centralized/
Processing 4 of 100: https://www.drivendata.org/competitions/103/nist-federated-learning-2-pandemic-forecasting-federated/
Processing 5 of 100: https://www.drivendata.org/competitions/145/nist-federated-learning-2-pandemic-forecasting-centralized/
Processing 6 of 100: https://www.drivendata.org/competitions/139/nist-federated-learning-3-red-teams/
Processing 7 of 100: https://www.drivendata.org/competitions/301/prepare-challenge-phase-2-report-arena/
Processing 8 of 100: https://www.drivendata.org/competitions/299/competition-nih-alzheimers-acoustic-2/
Processing 9 of 100: https://www.drivendata.org/competitions/300/competition-nih-alzheimers-sdoh-2/
Processing 1

In [190]:
description_links

{'https://www.drivendata.org/competitions/98/nist-federated-learning-1/': ['https://www.drivendata.org/competitions/98/nist-federated-learning-1/page/522/',
  'https://www.drivendata.org/competitions/98/nist-federated-learning-1/page/521/',
  'https://www.drivendata.org/competitions/98/nist-federated-learning-1/page/524/',
  'https://www.drivendata.org/competitions/98/nist-federated-learning-1/page/525/',
  'https://www.drivendata.org/competitions/98/nist-federated-learning-1/rules/',
  'https://www.drivendata.org/competitions/98/nist-federated-learning-1/participants/',
  'https://www.drivendata.org/competitions/group/nist-federated-learning/'],
 'https://www.drivendata.org/competitions/105/nist-federated-learning-2-financial-crime-federated/': ['https://www.drivendata.org/competitions/105/nist-federated-learning-2-financial-crime-federated/page/591/',
  'https://www.drivendata.org/competitions/105/nist-federated-learning-2-financial-crime-federated/page/590/',
  'https://www.drivenda

In [191]:
# Dictionary to hold the final text data
final_texts = {}

# Function to scrape description links
def scrape_description_texts(link, driver):
    try:
        driver.set_page_load_timeout(30)
        driver.get(link)

        # Accumulate all texts
        all_texts = []
        # Find the div with class 'prose' and get all direct <p> children
        try:
            prose_div = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.prose'))
            )
            paragraphs = prose_div.find_elements(By.XPATH, './p')
            for paragraph in paragraphs:
                text = paragraph.text
                if text:  # Check if text is not empty
                    all_texts.append(text)
        except NoSuchElementException:
            print(f"No 'prose' class found in {link}")

        return ' '.join(all_texts)  # Return concatenated text
    except TimeoutException:
        print(f"Timeout while processing {link}")
        return ""
    except Exception as e:
        print(f"Error processing {link}: {e}")
        return ""

# Set up the WebDriver
driver = webdriver.Chrome(options=options)

# Iterate over all competition links and their corresponding description page links
for competition_link, links in description_links.items():
    all_competition_texts = []
    for link in links:
        result_text = scrape_description_texts(link, driver)
        if result_text:
            all_competition_texts.append(result_text)
    # Store concatenated texts for each competition link
    final_texts[competition_link] = ' '.join(all_competition_texts)

driver.quit()

# Print or save the result
print(json.dumps(final_texts, indent=4))

Error processing https://www.drivendata.org/competitions/259/reclamation-water-supply-forecast/data/: name 'NoSuchElementException' is not defined
Error processing https://www.drivendata.org/competitions/262/reclamation-water-supply-forecast-final/data/: name 'NoSuchElementException' is not defined
Error processing https://www.drivendata.org/competitions/257/reclamation-water-supply-forecast-hindcast/data/: name 'NoSuchElementException' is not defined
Error processing https://www.drivendata.org/competitions/254/reclamation-water-supply-forecast-dev/data/: name 'NoSuchElementException' is not defined
Error processing https://www.drivendata.org/competitions/90/competition-reclamation-snow-water-eval/data/: name 'NoSuchElementException' is not defined
Error processing https://www.drivendata.org/competitions/86/competition-reclamation-snow-water-dev/data/: name 'NoSuchElementException' is not defined
Error processing https://www.drivendata.org/competitions/256/pale-blue-dot/data/: name 'No

In [193]:
len(final_texts)

100

In [None]:
file_path = '../Data/drivendata_raw_scraping.json'

# Writing the dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(final_texts, json_file, indent=4)

In [197]:
with open('./Data', 'w') as json_file:
    json.dump(description_links, json_file, indent=4)

In [172]:
driver.quit()

## AI CROWD

In [27]:
driver = webdriver.Chrome()
driver.maximize_window()
# Navigate to the website
driver.get('https://www.aicrowd.com/challenges')


In [16]:
# Script to scroll to the bottom of the page

for _ in range(17):
    # Scroll down to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for a short period to ensure the page has loaded the content
    time.sleep(3)  # Adjust the sleep time if necessary based on the page's response time


In [18]:
links = driver.find_elements(By.XPATH, '//*[@id="challenges-div"]//a[contains(@class, "card-img-overlay")]')
# //*[@id="challenges-div"]/div[1]/div/div[2]/div[1]/h5/a
# Extract href attributes from each link element
urls = [link.get_attribute('href') for link in links]

# Output or process the URLs
for url in urls[:5]:
    print(url)

https://www.aicrowd.com/challenges/brick-by-brick-2024
https://www.aicrowd.com/challenges/sounding-video-generation-svg-challenge-2024
https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024
https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms
https://www.aicrowd.com/challenges/generative-interior-design-challenge-2024


In [92]:
len(urls)

317

In [94]:
aicrowd_competitions_data = []
driver = webdriver.Chrome()
driver.maximize_window()

for url in urls:
    driver.get(url)
    time.sleep(1)  # Ensure the page loads completely

    # Extract all paragraph texts under the specified div for overview
    paragraphs = driver.find_elements(By.XPATH, '//*[@data-controller="challenge-overview"]//p')
    overview_text = " ".join([p.text for p in paragraphs if p.text])

    # Navigate to the rules page
    nav_links = driver.find_elements(By.XPATH, '//a[contains(@class, "nav-link")]')
    rules_page_url = None
    for link in nav_links:
        if 'rules' in link.get_attribute('href').lower():  # Assuming the URL contains the word 'rules'
            rules_page_url = link.get_attribute('href')
            break

    if rules_page_url:
        driver.get(rules_page_url)
    time.sleep(1)  # Ensure the rules page loads

    # Extract all paragraph texts for rules
    paragraphs = driver.find_elements(By.XPATH, '/html/body/div[2]/main/div[2]/div/div/div/div/p')
    rules_text = " ".join([p.text for p in paragraphs if p.text])

    # Create a dictionary for this competition and append to the list
    competition_data = {
        'url': url,
        'overview': overview_text,
        'rules': rules_text
    }
    aicrowd_competitions_data.append(competition_data)

# Save data to a JSON file
# with open('competitions_data.json', 'w') as file:
#     json.dump(aicrowd_competitions_data, file, indent=4)

In [98]:
len(aicrowd_competitions_data)

317

In [96]:
driver.quit()

In [97]:
with open('../Data/aicrowd_raw.json', 'w') as outfile:
        json.dump(aicrowd_competitions_data, outfile, indent=4)