In [1]:
!pip install pandas requests beautifulsoup4 lxml selenium webdriver-manager




In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import random

# Selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# ---- File configuration ----
INPUT_CSV = "Growth For Impact Data Assignment2_Data.csv"   # change if your file is named differently
OUTPUT_XLSX = "GrowthForImpact_Assignment_Final_SarveshGoswami.xlsx"

# ---- Request headers (so we look like a normal browser) ----
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0 Safari/537.36"
)

HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Language": "en-US,en;q=0.9"
}

# ---- Keywords to detect careers / jobs / ATS links ----
CAREER_KEYWORDS = [
    "career", "careers", "jobs", "job", "join", "team",
    "work", "opening", "openings", "opportunities", "opportunity"
]

ATS_KEYWORDS = [
    "lever.co", "greenhouse.io", "bamboohr.com", "myworkdayjobs.com",
    "zohorecruit", "workday", "recruitee", "freshteam",
    "jazzhr", "applytojob", "smartrecruiters"
]


In [3]:
df = pd.read_csv(INPUT_CSV)
df.head()


Unnamed: 0,Startup,Website URL,Unnamed: 2,Careers Page URL,Job listings page URL,job post1 URL,job post1 title,job post2 URL,job post2 title,job post3 URL,job post3 title
0,Thoughtful Foods,https://www.thoughtfulfood.co/,Example1-,https://fsc.org/en/careers-at-fsc,https://fsc.org/en/careers-at-fsc,https://fsc.jobs.personio.com/job/2262148?lang...,Market Intelligence Manager (m-f-d),https://fsc.jobs.personio.com/job/2268639?lang...,Trademark Manager (m-f-d),https://fsc.jobs.personio.com/job/2262183?lang...,Value Chain Development Manager (m-f-d)
1,Charzer,https://www.charzer.com/home,Example2-,https://www.polestar.com/global/about/careers/,https://polestar.teamtailor.com/jobs,https://polestar.teamtailor.com/jobs/6551682-p...,"Planning, Ordering & Distribution Manager - Pa...",https://polestar.teamtailor.com/jobs/6538269-f...,Financial Accounting Manager - UK,https://polestar.teamtailor.com/jobs/6517579-r...,Retail Operation Manager
2,Pilk,https://www.pilkfoods.com.au/,,,,,,,,,
3,Beano,https://beano.com.sg/,,,,,,,,,
4,RACEnergy,https://race.energy/,,,,,,,,,


In [4]:
def normalize_url(url):
    """Make sure URL has https:// and is not empty."""
    if pd.isna(url) or not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "https://" + url.lstrip("/")
    return url


def fetch_static_html(url, timeout=12):
    """Download HTML using requests. Return text or None."""
    if not url:
        return None
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        if resp.status_code == 200 and resp.text:
            return resp.text
    except Exception as e:
        print("requests error for", url, ":", e)
    return None


In [5]:
def make_driver():
    """Create a headless Chrome WebDriver."""
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1366,768")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

# Create global driver once
driver = make_driver()

def fetch_rendered_html(url, wait=5):
    """Use Selenium to render JS-heavy pages and return HTML."""
    if not url:
        return None
    try:
        driver.get(url)
        time.sleep(wait)
        return driver.page_source
    except Exception as e:
        print("Selenium error for", url, ":", e)
        return None



In [6]:
def find_careers_and_jobs(base_url):
    """
    Given a company's main website URL:
    - Find a careers page (via static HTML, then Selenium if needed)
    - From careers page, find job listings / ATS page
    Return (careers_page_url, job_listings_page_url)
    """
    base_url = normalize_url(base_url)
    if not base_url:
        return None, None

    def scan_links_for_careers_and_ats(html, context_url):
        """Scan all <a> tags and collect possible careers / ATS URLs."""
        soup = BeautifulSoup(html, "lxml")
        candidates = []

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            text = (a.get_text(" ", strip=True) or "").lower()
            full = urljoin(context_url, href)
            combined = (href + " " + text).lower()

            # Careers-like link
            if any(kw in combined for kw in CAREER_KEYWORDS):
                candidates.append(full)

            # ATS-like link
            if any(kw in combined for kw in ATS_KEYWORDS):
                candidates.append(full)

        # Remove duplicates while preserving order
        seen = set()
        final = []
        for url in candidates:
            if url not in seen:
                seen.add(url)
                final.append(url)
        return final

    # ----- 1) Try static homepage -----
    html = fetch_static_html(base_url)
    careers_url = None

    if html:
        links = scan_links_for_careers_and_ats(html, base_url)
        if links:
            careers_url = links[0]

    # ----- 2) Selenium fallback if no careers found -----
    if not careers_url:
        html = fetch_rendered_html(base_url)
        if html:
            links = scan_links_for_careers_and_ats(html, base_url)
            if links:
                careers_url = links[0]

    # If still nothing, give up
    if not careers_url:
        return None, None

    careers_url = normalize_url(careers_url)

    # ----- 3) From careers page, find job listings / ATS -----
    html_c = fetch_static_html(careers_url)
    if not html_c:
        html_c = fetch_rendered_html(careers_url)

    if not html_c:
        return careers_url, None

    soup = BeautifulSoup(html_c, "lxml")
    jobs_url = None
    candidates = []

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        text = (a.get_text(" ", strip=True) or "").lower()
        full = urljoin(careers_url, href)
        combined = (href + " " + text).lower()

        if any(kw in combined for kw in ATS_KEYWORDS) or "job" in combined or "opening" in combined:
            candidates.append(full)

    if candidates:
        jobs_url = candidates[0]
    else:
        # Some sites list jobs directly on the careers page
        jobs_url = careers_url

    return careers_url, jobs_url


In [7]:
def extract_job_posts(jobs_url, max_posts=3):
    """
    Given a job listings page URL, extract up to `max_posts` job URLs and titles.
    Returns: (list_of_urls, list_of_titles), both length == max_posts (padded with None)
    """
    jobs_url = normalize_url(jobs_url)
    if not jobs_url:
        return [None] * max_posts, [None] * max_posts

    def parse_jobs_from_html(html, context_url):
        soup = BeautifulSoup(html, "lxml")
        results = []

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            text = (a.get_text(" ", strip=True) or "").strip()
            if not text or len(text) < 3:
                continue
            combined = (href + " " + text).lower()

            # Heuristic: looks like a job posting
            if ("job" in combined or "apply" in combined or
                "opening" in combined or "position" in combined):
                full = urljoin(context_url, href)
                results.append((full, text))

        # Deduplicate
        seen = set()
        final = []
        for url, title in results:
            if url not in seen:
                seen.add(url)
                final.append((url, title))
        return final

    # Try static first
    html = fetch_static_html(jobs_url)
    posts = parse_jobs_from_html(html, jobs_url) if html else []

    # Selenium fallback
    if not posts:
        html = fetch_rendered_html(jobs_url)
        posts = parse_jobs_from_html(html, jobs_url) if html else []

    posts = posts[:max_posts]
    urls = [p[0] for p in posts]
    titles = [p[1] for p in posts]

    # pad to max_posts length
    while len(urls) < max_posts:
        urls.append(None)
        titles.append(None)

    return urls, titles



In [8]:
df = pd.read_csv(INPUT_CSV)
print("Columns:", df.columns.tolist())
print("Total rows:", len(df))

# Ensure these columns exist in df
needed_cols = [
    "Careers Page URL", "Job listings page URL",
    "job post1 URL", "job post1 title",
    "job post2 URL", "job post2 title",
    "job post3 URL", "job post3 title"
]

for col in needed_cols:
    if col not in df.columns:
        df[col] = None


Columns: ['Startup', 'Website URL', 'Unnamed: 2', 'Careers Page URL', 'Job listings page URL', 'job post1 URL', 'job post1 title', 'job post2 URL', 'job post2 title', 'job post3 URL', 'job post3 title']
Total rows: 356


In [9]:
total = len(df)
print("Total companies:", total)

for i, row in df.iterrows():
    startup = row["Startup"]
    base_url = row["Website URL"]

    print(f"[{i+1}/{total}] Processing:", startup, "-", base_url)

    careers, jobs_page = find_careers_and_jobs(base_url)
    df.at[i, "Careers Page URL"] = careers
    df.at[i, "Job listings page URL"] = jobs_page

    if jobs_page:
        job_urls, job_titles = extract_job_posts(jobs_page)
        df.at[i, "job post1 URL"]   = job_urls[0]
        df.at[i, "job post1 title"] = job_titles[0]
        df.at[i, "job post2 URL"]   = job_urls[1]
        df.at[i, "job post2 title"] = job_titles[1]
        df.at[i, "job post3 URL"]   = job_urls[2]
        df.at[i, "job post3 title"] = job_titles[2]

    # small random delay to be polite
    time.sleep(random.uniform(1.0, 2.0))


Total companies: 356
[1/356] Processing: Thoughtful Foods - https://www.thoughtfulfood.co/
[2/356] Processing: Charzer - https://www.charzer.com/home
[3/356] Processing: Pilk - https://www.pilkfoods.com.au/
[4/356] Processing: Beano - https://beano.com.sg/
[5/356] Processing: RACEnergy - https://race.energy/
requests error for https://race.energy/ : HTTPSConnectionPool(host='race.energy', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002B16F8A2730>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Selenium error for https://race.energy/ : Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=142.0.7444.176)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0xe04103
	0xe04144
	0xc0e71d
	0xc0bc77
	0xbfeec8
	0xc00884
	0xbff396
	0xbfec82
	0xbfe98d
	0xbfc7fe
	0xbfd28b
	0xc124ce
	0xc9fb37
	0xc7c90c
	0xc9ee17
	0xc7c706
	0xc4da30
	0xc4ed54
	0x10757b4
	0x

Selenium error for https://foodurama.co/ : Message: unknown error: net::ERR_SSL_PROTOCOL_ERROR
  (Session info: chrome=142.0.7444.176)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0xe04103
	0xe04144
	0xc0e71d
	0xc0bc77
	0xbfeec8
	0xc00884
	0xbff396
	0xbfec82
	0xbfe98d
	0xbfc7fe
	0xbfd28b
	0xc124ce
	0xc9fb37
	0xc7c90c
	0xc9ee17
	0xc7c706
	0xc4da30
	0xc4ed54
	0x10757b4
	0x107098a
	0xe2c392
	0xe1c4c8
	0xe2324d
	0xe0c478
	0xe0c63c
	0xdf67ca
	0x765bfcc9
	0x77e382ae
	0x77e3827e

[53/356] Processing: climateXcapital - https://www.climatexcapital.com/
[54/356] Processing: Nasoya - nasoya.com
[55/356] Processing: Sustainable Foods (Plan*t Foods) - https://www.plan-t.earth/
[56/356] Processing: Emmay - https://emmay.vn/
requests error for https://emmay.vn/ : HTTPSConnectionPool(host='emmay.vn', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.

[100/356] Processing: Briyas - http://briyas.com/
[101/356] Processing: FoodGen - https://www.foodgen.co.nz/
requests error for https://www.foodgen.co.nz/ : HTTPSConnectionPool(host='www.foodgen.co.nz', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002B16EF01EE0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Selenium error for https://www.foodgen.co.nz/ : Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=142.0.7444.176)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0xe04103
	0xe04144
	0xc0e71d
	0xc0bc77
	0xbfeec8
	0xc00884
	0xbff396
	0xbfec82
	0xbfe98d
	0xbfc7fe
	0xbfd28b
	0xc124ce
	0xc9fb37
	0xc7c90c
	0xc9ee17
	0xc7c706
	0xc4da30
	0xc4ed54
	0x10757b4
	0x107098a
	0xe2c392
	0xe1c4c8
	0xe2324d
	0xe0c478
	0xe0c63c
	0xdf67ca
	0x765bfcc9
	0x77e382ae
	0x77e3827e

[102/356] Processing: Plantelo - https://www.plantelo.in/
requests error for h

[159/356] Processing: Ampd Energy - https://www.ampd.energy/
[160/356] Processing: yumeat - https://www.yumeat.com/
[161/356] Processing: FemtoFarad - www.femtofarad.com
requests error for https://www.femtofarad.com : HTTPSConnectionPool(host='www.femtofarad.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002B16F057280>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Selenium error for https://www.femtofarad.com : Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=142.0.7444.176)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0xe04103
	0xe04144
	0xc0e71d
	0xc0bc77
	0xbfeec8
	0xc00884
	0xbff396
	0xbfec82
	0xbfe98d
	0xbfc7fe
	0xbfd28b
	0xc124ce
	0xc9fb37
	0xc7c90c
	0xc9ee17
	0xc7c706
	0xc4da30
	0xc4ed54
	0x10757b4
	0x107098a
	0xe2c392
	0xe1c4c8
	0xe2324d
	0xe0c478
	0xe0c63c
	0xdf67ca
	0x765bfcc9
	0x77e382ae
	0x77e3827e

[162/356] Proces

[238/356] Processing: BackMarket - https://www.backmarket.com/en-us
[239/356] Processing: Protera - https://www.protera.com/
[240/356] Processing: Stora Enso - https://www.storaenso.com/en/
[241/356] Processing: Memodo - https://www.memodo-shop.com/
[242/356] Processing: Climate X - https://www.climate-x.com/
[243/356] Processing: Scatec Solar - https://scatec.com/portfolio/our-portfolio/solar/
[244/356] Processing: Centrica Energy - https://www.centrica.com/
[245/356] Processing: Enpal - https://www.enpal.com/
[246/356] Processing: ProVeg International - https://proveg.com/news/page/2/
[247/356] Processing: Renewable Energy Systems - https://www.res-group.com/us/
[248/356] Processing: CDP Global - https://www.cdp.net/en
[249/356] Processing: GreenFlux - https://www.greenflux.com/
requests error for https://www.greenflux.com/ : HTTPSConnectionPool(host='www.greenflux.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection obj

[344/356] Processing: BorgWarner - https://www.borgwarner.com/home
[345/356] Processing: Fluence - https://fluenceenergy.com/
[346/356] Processing: Bidgely - https://www.bidgely.com/
[347/356] Processing: XENERGY - https://x-energy.com/
[348/356] Processing: Key Capture Energy - https://keycaptureenergy.com/
[349/356] Processing: Sky Climber Renewables - https://skyclimber-re.com/
[350/356] Processing: World Wildlife Fund - https://www.worldwildlife.org/
[351/356] Processing: Enpowered - https://www.pmddtc.state.gov/ddtc_public
[352/356] Processing: Power Factors - https://www.powerfactors.com/
[353/356] Processing: Ecology Project International - https://www.ecologyproject.org/
[354/356] Processing: Enverus - https://www.enverus.com/
[355/356] Processing: FlexGen - https://www.flexgen.com/
[356/356] Processing: The Aspen Institute - https://www.aspeninstitute.org/


In [10]:
df.to_excel(OUTPUT_XLSX, index=False)
print("Saved final output to:", OUTPUT_XLSX)

# Close Selenium driver
driver.quit()


Saved final output to: GrowthForImpact_Assignment_Final_SarveshGoswami.xlsx
