In [1]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
from bs4 import BeautifulSoup
import json
import requests
import os

In [3]:
class ProgramScraper:
    """Scrape selected text fields from program detail pages.

    Each page URL is built from ``base_url``, a template containing an
    ``{id}`` placeholder that is filled in with the program ID.
    """

    def __init__(self, base_url, timeout=30):
        """Store the URL template and the per-request timeout.

        Args:
            base_url: URL template with an ``{id}`` placeholder.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.base_url = base_url
        self.timeout = timeout

    def scrape_multiple_programs(self, program_ids):
        """Scrape every program in ``program_ids``, in order.

        Args:
            program_ids: Iterable of program IDs.

        Returns:
            A list with one entry per input ID, in the same order. An entry
            is the dict returned by ``scrape_program_details`` or ``None``
            when that program could not be fetched.
        """
        results = []
        for program_id in program_ids:
            print(f"Scraping details for Program ID: {program_id}")
            results.append(self.scrape_program_details(program_id))
        return results

    @staticmethod
    def _extract_field(soup, label):
        """Return the text of the ``<dd>`` following the ``<dt>`` titled ``label``.

        Returns an empty string when either the ``<dt>`` or a following
        ``<dd>`` is missing.
        """
        heading = soup.find("dt", class_="c-description-list__content", string=label)
        if heading is None:
            return ""
        value = heading.find_next("dd")
        # A heading without a following <dd> must not crash the whole run.
        return value.get_text(strip=True) if value is not None else ""

    def scrape_program_details(self, program_id):
        """Fetch one program page and extract its description fields.

        Args:
            program_id: Value substituted for ``{id}`` in the URL template.

        Returns:
            A dict with keys ``"id"``, ``"Description/content"`` and
            ``"Course Organisation"``, or ``None`` if the request failed
            (connection error, timeout, or an HTTP error status).
        """
        url = self.base_url.format(id=program_id)

        try:
            # Without a timeout a single stalled connection blocks forever.
            response = requests.get(url, timeout=self.timeout)
            # Treat 4xx/5xx as failures instead of parsing an error page
            # into a record full of empty strings.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching data from API: {e}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        return {
            "id": program_id,
            "Description/content": self._extract_field(soup, "Description/content"),
            "Course Organisation": self._extract_field(soup, "Course organisation"),
        }

    def save_to_json(self, data, file_path):
        """Write ``data`` to ``file_path`` as indented UTF-8 JSON.

        Missing parent directories are created first.

        Args:
            data: JSON-serialisable object.
            file_path: Destination path; may be a bare filename.
        """
        directory = os.path.dirname(file_path)
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4, ensure_ascii=False)
        print(f"Data saved to {file_path}")

In [4]:
if __name__ == "__main__":
    # URL template for a program detail page; "{id}" is replaced per program.
    BASE_URL = "https://www2.daad.de/deutschland/studienangebote/international-programmes/en/detail/{id}/"

    # Program IDs to scrape.
    PROGRAM_IDS = [
        8305, 4439, 4870, 5616, 4455, 4591, 7606, 8324, 9856, 3629,
        4249, 4238, 5551, 8935, 3736, 8900, 5245, 7670, 7744, 3981,
        4700, 3624, 9042, 7618, 9673, 8936, 4490, 4842, 4239, 8440,
        3827, 4092, 9725, 4686, 4660, 3727, 5202, 3903, 9040, 6236,
        6296, 8953, 8353, 7124, 5576, 4521, 4384, 7708, 3696, 7658,
        6529, 7636, 7660, 3960, 4556, 9077, 9691, 3739, 8429, 4350,
        5253, 9858, 6937, 4001, 3877, 7641, 9046, 4581, 5252, 4667,
        4886, 5453, 4874, 4427, 3614, 4653, 7784, 4258, 9737, 3667,
        8382, 9198, 4343, 7054, 8977, 8518, 3686, 4728, 9770, 9643,
        4879, 9776, 4553, 9704, 4775, 8902, 9001, 8417, 9228, 4513,
        4225, 4429, 4247, 6140, 9773, 7852, 3632, 7657, 4122, 9841,
        9709, 9679, 7711, 5561, 6924, 9863, 9646, 3803, 9072, 3728,
        4655, 9003, 9223, 3737, 8944, 3923, 7040, 3841, 4634, 3799,
        7795, 6119, 9043, 5634, 3862, 9838, 4600, 6262, 4407, 9595,
        3724, 6981,
    ]

    # Destination of the scraped dataset.
    OUTPUT_PATH = "../datasets/program_details.json"

    # Initialize the scraper
    scraper = ProgramScraper(BASE_URL)

    # Scrape details for all program IDs (one result per ID, in order)
    scrape_results = scraper.scrape_multiple_programs(PROGRAM_IDS)

    # scrape_program_details returns None when a request fails; keep those
    # out of the dataset so the JSON holds only real records, and report them.
    scraped_data = [record for record in scrape_results if record is not None]
    failed_ids = [
        program_id
        for program_id, record in zip(PROGRAM_IDS, scrape_results)
        if record is None
    ]
    if failed_ids:
        print(f"Failed to scrape {len(failed_ids)} program(s): {failed_ids}")

    # Save the scraped data to a JSON file
    scraper.save_to_json(scraped_data, OUTPUT_PATH)

Scraping details for Program ID: 8305
Scraping details for Program ID: 4439
Scraping details for Program ID: 4870
Scraping details for Program ID: 5616
Scraping details for Program ID: 4455
Scraping details for Program ID: 4591
Scraping details for Program ID: 7606
Scraping details for Program ID: 8324
Scraping details for Program ID: 9856
Scraping details for Program ID: 3629
Scraping details for Program ID: 4249
Scraping details for Program ID: 4238
Scraping details for Program ID: 5551
Scraping details for Program ID: 8935
Scraping details for Program ID: 3736
Scraping details for Program ID: 8900
Scraping details for Program ID: 5245
Scraping details for Program ID: 7670
Scraping details for Program ID: 7744
Scraping details for Program ID: 3981
Scraping details for Program ID: 4700
Scraping details for Program ID: 3624
Scraping details for Program ID: 9042
Scraping details for Program ID: 7618
Scraping details for Program ID: 9673
Scraping details for Program ID: 8936
Scraping det