In [11]:
!pip install selenium
!apt-get update
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [12]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import json

def get_company_links_with_selenium(target_count):
    company_links = set()
    url = "https://www.ycombinator.com/companies"

    print("Initializing browser...")
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)

    driver.get(url)

    print(f"Starting scroll to find at least {target_count} companies...")
    last_height = driver.execute_script("return document.body.scrollHeight")

    while len(company_links) < target_count:
        print(f"Found {len(company_links)} links so far. Scrolling down for more...")

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        selector = "a[class^='_company_']"
        links = driver.find_elements(By.CSS_SELECTOR, selector)
        for link in links:
            href = link.get_attribute('href')
            if href:
                company_links.add(href)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("Reached the bottom of the page.")
            break
        last_height = new_height

    driver.quit()
    print(f"Finished scrolling. Found a total of {len(company_links)} unique links.")
    return list(company_links)

def scrape_company_data(company_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(company_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        data_div = soup.find('div', attrs={'data-page': True})
        if not data_div:
            return None

        page_data = json.loads(data_div['data-page'])
        company_props = page_data.get('props', {}).get('company', {})

        if not company_props:
            return None

        company_name = company_props.get('name', "Not Found")
        batch = company_props.get('batch_name', "Not Found")
        description = company_props.get('one_liner', "Not Found")

        founders_data = company_props.get('founders', [])
        founder_names = [f.get('full_name', 'N/A') for f in founders_data]
        founder_linkedin_urls = [f.get('linkedin_url', 'N/A') for f in founders_data]

        return {
            "Company Name": company_name,
            "Batch": batch,
            "Short Description": description,
            "Founder Name(s)": ", ".join(founder_names) if founder_names else "Not Found",
            "Founder LinkedIn URL(s)": ", ".join(url for url in founder_linkedin_urls if url) if founder_linkedin_urls else "Not Found"
        }
    except Exception:
        return None

if __name__ == "__main__":
    TARGET_COMPANY_COUNT = 500
    company_links = get_company_links_with_selenium(TARGET_COMPANY_COUNT)

    if not company_links:
        print("\nCould not find any company links to scrape.")
    else:
        print(f"\nSUCCESS! Now scraping the full details for {len(company_links)} companies...")
        all_startup_data = []
        links_to_scrape = company_links[:TARGET_COMPANY_COUNT]

        for i, link in enumerate(links_to_scrape):
            print(f"Scraping {i + 1} of {len(links_to_scrape)}: {link.split('/')[-1]}")
            data = scrape_company_data(link)
            if data:
                all_startup_data.append(data)
            time.sleep(0.1)

        df = pd.DataFrame(all_startup_data)
        df.to_csv("CompaniesDetails.csv", index=False)

        print(f"\nScraping complete! Data for {len(df)} companies saved to 'CompaniesDetails.csv'")
        print("Displaying the first 10 rows of the scraped data:")
        from IPython.display import display
        display(df.head(10))

Initializing browser...
Starting scroll to find at least 500 companies...
Found 0 links so far. Scrolling down for more...
Found 40 links so far. Scrolling down for more...
Found 220 links so far. Scrolling down for more...
Found 420 links so far. Scrolling down for more...
Finished scrolling. Found a total of 740 unique links.

SUCCESS! Now scraping the full details for 740 companies...
Scraping 1 of 500: virtualmin
Scraping 2 of 500: 9gag
Scraping 3 of 500: youlearn
Scraping 4 of 500: reditus-space
Scraping 5 of 500: wefunder
Scraping 6 of 500: qualgent
Scraping 7 of 500: x-zell
Scraping 8 of 500: plangrid
Scraping 9 of 500: wevorce
Scraping 10 of 500: stratify
Scraping 11 of 500: the-robot-learning-company
Scraping 12 of 500: gale
Scraping 13 of 500: coblocks
Scraping 14 of 500: valuemate
Scraping 15 of 500: benchling
Scraping 16 of 500: golf
Scraping 17 of 500: mth-sense
Scraping 18 of 500: getcho
Scraping 19 of 500: shape-shapescale
Scraping 20 of 500: protocol-labs
Scraping 21 of

Unnamed: 0,Company Name,Batch,Short Description,Founder Name(s),Founder LinkedIn URL(s)
0,Virtualmin,Winter 2007,,"Jamie Cameron, Joe Cooper",https://www.linkedin.com/in/jamiecameron2
1,9gag,Summer 2012,Make the world happier.,"Chris Chan, Derek Chan, Ray Chan, Marco Fung, ...","https://linkedin.com/in/raychan, https://www.l..."
2,YouLearn,Spring 2025,AI tutor for each student.,"David Yu, Achyut Krishna Byanjankar, Soami Kap...",https://www.linkedin.com/in/david-yu-871202244...
3,Reditus Space,Winter 2025,Reusable Satellites for zero-g manufacturing,"Stef Crum, Will Sherman","https://www.linkedin.com/in/spcrum/, https://w..."
4,Wefunder,Winter 2013,We're Robinhood for pre-IPO startups. Everyone...,"Nick Tommarello, Greg Belote","https://www.linkedin.com/in/nicktommarello/, h..."
5,QualGent,Spring 2025,AI Mobile App Quality Assurance Tester,"Shivam Agrawal, Aaron Yu","https://www.linkedin.com/in/shivam-agrawal, ht..."
6,X-Zell,Winter 2015,"Early, pain-free cancer detection.","Sebastian Punyaratabandhu Bhakdi, Prapat Suriy...",https://www.linkedin.com/pub/sebastian-chakrit...
7,PlanGrid,Winter 2012,Mobile applications for the construction indus...,"Ralph Gootee, Ryan Sutton-Gee, Tracy Young","https://www.linkedin.com/in/ralphleon/, https:..."
8,Wevorce,Winter 2013,Changing divorce for good by making it a way l...,Michelle Crosby,https://www.linkedin.com/in/michelle-crosby-93...
9,stratify,Spring 2025,AI agents that help you understand your custom...,"Siddhartha Javvaji, Pratham Hombal",https://www.linkedin.com/in/siddhartha-javvaji...
