<a href="https://colab.research.google.com/github/sriku2412/schulich_data_science_2/blob/main/webscrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd

def fetch_data(base_url: str, total_pages: int, endpoint: str, headers: dict = None) -> list:
    """Download paginated JSON results and collect every entry under "hits".

    For each page number the request URL is built as
    ``f"{base_url}{page_number}{endpoint}"``.

    Args:
        base_url: URL prefix that ends right where the page number belongs.
        total_pages: Highest page number to request. Pages ``0`` through
            ``total_pages`` (inclusive) are fetched.
        endpoint: Text appended to the URL after the page number.
        headers: Optional HTTP headers sent with every request. ``None``
            (the default) sends no extra headers.

    Returns:
        A flat list with the items of the ``"hits"`` list of every page that
        answered with status 200. Pages with any other status are reported
        on stdout and skipped.
    """
    # A fresh dict per call: a mutable ``{}`` default would be one shared
    # object reused (and potentially mutated) across all calls.
    request_headers = {} if headers is None else headers

    data_list = []
    for page_number in range(total_pages + 1):
        api_url = f"{base_url}{page_number}{endpoint}"
        response = requests.get(api_url, headers=request_headers)
        if response.status_code == 200:
            data = response.json()
            # A page without a "hits" key simply contributes nothing.
            data_list.extend(data.get("hits", []))
        else:
            print(f"Error: Unable to fetch data for page {page_number} (Status Code: {response.status_code})")
    return data_list

def save_to_csv(data: list, headers: list, filename: str) -> None:
    """Write a header row followed by ``data`` to a UTF-8 CSV file.

    Args:
        data: Rows to write. Each row may be a sequence of cell values
            (written as-is) or a dict, in which case the cells are the
            dict's values looked up by each name in ``headers``.
        headers: Column names; written as the first row and used to select
            and order the values of dict rows.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        for row in data:
            if isinstance(row, dict):
                # csv.writer iterates a dict by its *keys*, so passing a dict
                # directly would write key names instead of values. Project
                # the dict onto the header order; absent keys become empty
                # cells and keys not listed in headers are dropped.
                row = [row.get(column, "") for column in headers]
            writer.writerow(row)

def _fetch_job_description(url: str) -> str:
    """Return the HTML of the ``data-testid="careerPage"`` div at ``url``, or ''."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    career_page_div = soup.find('div', {'data-testid': 'careerPage'})
    return str(career_page_div) if career_page_div is not None else ''


def main():
    """Scrape company and job listings and store them as CSV files.

    Steps:
      1. Fetch the paginated company search results and write
         ``companies.csv``.
      2. Fetch the paginated job search results and write ``jobs.csv``.
      3. Reload ``jobs.csv``, build each job's page URL, drop duplicate rows,
         keep only rows whose ``has_description`` is true, download the
         career-page HTML of every remaining job and write
         ``jobs_info.csv``.
    """
    base_url = "https://www1.communitech.ca/api/search/companies?networkId=628&hitsPerPage=12&page="
    total_company_pages = 140
    company_data = fetch_data(base_url, total_company_pages, "")
    company_headers = ["name", "description", "locations", "logo_url", "topics", "industry_tags", "stage", "head_count", "active_jobs_count"]
    save_to_csv(company_data, company_headers, 'companies.csv')

    base_url = "https://www1.communitech.ca/api/search/jobs?networkId=628&hitsPerPage=20&page="
    total_job_pages = 298
    job_data = fetch_data(base_url, total_job_pages, "&filters=&query=")
    job_headers = ["created_at", "locations", "organization_id", "organization_name", "organization_logo_url", "organization_slug", "organization_topics", "organization_industry_tags", "organization_stage", "organization_head_count", "source", "slug", "title", "url", "featured", "has_description"]
    save_to_csv(job_data, job_headers, 'jobs.csv')

    jobs = pd.read_csv("jobs.csv")
    jobs.columns = job_headers
    # Replace the API-provided url with the job's page on the site.
    jobs['url'] = 'https://www1.communitech.ca/companies/' + jobs['organization_slug'] + '/jobs/' + jobs['slug'] + '#content'
    jobs = jobs.drop_duplicates()
    jobs = jobs[jobs['has_description'] == True]  # noqa: E712 - element-wise comparison
    jobs = jobs[['organization_name', 'title', 'locations', 'organization_topics', 'organization_industry_tags', 'url', 'has_description']]
    # Dropping and filtering rows leaves gaps in the index. Renumber it so the
    # frame is a clean, independent table before a new column is attached.
    jobs = jobs.reset_index(drop=True)

    # Assign by position: one description per remaining row, in row order.
    # (Pairing enumerate() counters with label-based .at on the gapped index
    # would attach descriptions to the wrong rows and append spurious ones.)
    jobs['description'] = [_fetch_job_description(url) for url in jobs['url']]

    jobs.to_csv('jobs_info.csv', index=False)

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()