In [2]:
import requests
from bs4 import BeautifulSoup
import json


In [3]:
# This cell scrapes the Erasmus Mundus catalogue for program links from pages 6 to 11.

# Catalogue listing URL; get_program_links appends "?page=<n>" to it to select a page.
base_url = "https://www.eacea.ec.europa.eu/scholarships/erasmus-mundus-catalogue_en"

# Function to get the links from a specific page
def get_program_links(page_number, timeout=10):
    """Return the external program links found on one catalogue page.

    Args:
        page_number: Value placed in the ``page`` query parameter of ``base_url``.
        timeout: Seconds to wait for the catalogue server before giving up.

    Returns:
        A list of ``href`` strings that contain "http" and do not contain
        "europa". An empty list is returned when the page cannot be retrieved
        (network error, timeout, or a non-200 status code).
    """
    url = f"{base_url}?page={page_number}"

    # Without a timeout, requests can block forever on an unresponsive server;
    # a network error is reported the same way as a bad status code so that one
    # failed page does not abort the whole collection loop.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page {page_number}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {page_number}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep absolute links only, and drop links back to europa.eu sites.
    return [
        a['href']
        for a in soup.find_all('a', href=True)
        if "http" in a['href'] and "europa" not in a['href']
    ]

# Gather links from catalogue pages 6 to 11 (page indexes 5 to 10)
program_links = []
for page_index in range(5, 11):
    program_links.extend(get_program_links(page_index))

# Show every collected link, one per line
print("All Program Links from Pages 6 to 11:")
for program_url in program_links:
    print(program_url)


All Program Links from Pages 6 to 11:
http://www.master-goals.eu/
https://www.coasthazar.eu
https://www.ntnu.edu/studies/mscomem
https://cartographymaster.eu/
https://www.master-promise.eu/
https://www.tpti.eu/en/
https://emildai.eu
https://islandsmaster.eu/
http://www.quanteem.eu
https://emjm-sbbe.eu
http://www.agrifoodmaster.eu/
https://www.eu-conexus.eu/en/marine-biotechnology/
https://masterchemoinfo.u-strasbg.fr/
https://transnationalgermanstudies.eu/en/programme/
http://www.master-quarmen.eu/
http://www.mbuild.eu/
https://www.master-mass.eu/
https://www.amir-em-master.com/
https://www.emmir.org/
http://sustainabledrugdiscovery.eu
https://mastergeotech.info/
https://cyberus.univ-ubs.fr/en/index.html
https://master-dream.ec-nantes.fr/
https://www.emmie.uliege.be/cms/c_7950389/en/emmie
https://www.imete.eu/
https://msc-frp.org/
http://www.cyber-t.eu/
https://www.master-egei.eu/
https://www.master-ediss.eu
https://choreomundus.org/
https://www.emmcchir.org/
https://www.emcl.eu/
http:

In [4]:
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import json

# Browser-like headers sent with every request to a program site.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# Phrases that mark a heading as a likely program title (used by extract_titles).
title_prefixes = (
    "Erasmus Mundus Master",
    "International MSc",
    "International Master",
)

# Scrape results keyed "p1", "p2", ...; `i` is the number used for the next key.
programs_json = dict()
i = 1

def extract_titles(soup, prefixes=None):
    """Collect candidate program titles from a parsed page.

    The page ``<title>`` (when non-empty) comes first, followed by every
    ``h1``/``h2``/``h3`` heading whose text contains one of ``prefixes``
    (case-insensitive substring match, anywhere in the heading). Non-breaking
    spaces are replaced by plain spaces and the literal ``'Home - '`` is removed.

    Args:
        soup: Parsed document exposing ``find(name)`` and ``find_all(name)``,
            whose results have a ``.text`` attribute (e.g. a BeautifulSoup object).
        prefixes: Phrases to look for in headings. Defaults to the module-level
            ``title_prefixes``.

    Returns:
        A list of unique titles in discovery order: page title first, then
        matching h1 headings, then h2, then h3.
    """
    if prefixes is None:
        prefixes = title_prefixes
    lowered_prefixes = [prefix.lower() for prefix in prefixes]

    def _clean(text):
        # Same normalisation for the page title and for headings.
        return text.replace('\xa0', ' ').replace('Home - ', '')

    titles = []
    title_tag = soup.find("title")
    if title_tag and title_tag.text.strip():
        titles.append(_clean(title_tag.text.strip()))

    for tag in ["h1", "h2", "h3"]:
        for heading in soup.find_all(tag):
            heading_text = heading.text.strip()
            lowered_heading = heading_text.lower()
            # any() adds the heading once even if several phrases match it.
            if any(prefix in lowered_heading for prefix in lowered_prefixes):
                titles.append(_clean(heading_text))

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would return the titles in arbitrary order.
    return list(dict.fromkeys(titles))

def _is_same_site(url, domain):
    """Return True when `url` is an http(s) URL hosted on `domain` or one of its subdomains."""
    parts = urlparse(url)
    host = parts.netloc.replace('www.', '')
    if not domain or parts.scheme not in ("http", "https"):
        return False
    return host == domain or host.endswith("." + domain)


# Crawl each program site: the landing page plus every same-site page linked
# from it. Each successful program is written to its own JSON file and stored
# in programs_json under "p<i>".
for link in program_links:
    # Build the record locally and register it only once the crawl succeeded,
    # so failed or non-HTML links never leave a partial record (one without
    # "titles") behind in programs_json.
    program = {
        "url": link,
        "pages": []
    }

    try:
        response = requests.get(link, headers=headers, timeout=10)
        response.raise_for_status()

        if "text/html" not in response.headers.get("Content-Type", ""):
            print(f"Skipping non-HTML link: {link}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        program["titles"] = extract_titles(soup)

        canonical_tag = soup.find("link", rel="canonical")
        program_base = urljoin(link, canonical_tag['href']) if canonical_tag and canonical_tag.get('href') else link

        # Add base page. Reuse the document already downloaded when the
        # canonical URL is the link itself; otherwise fetch it (with a timeout).
        if program_base == link:
            base_html = soup.prettify()
        else:
            base_response = requests.get(program_base, headers=headers, timeout=10)
            base_html = BeautifulSoup(base_response.content, "html.parser").prettify()
        program["pages"].append({
            "url": program_base,
            "html": base_html
        })

        # Add subpages
        domain = urlparse(program_base).netloc.replace('www.', '')
        # URLs already handled (stored, skipped, or failed); checked in O(1)
        # and prevents re-requesting a URL that has already failed.
        seen_urls = {program_base}

        for sub_link in soup.find_all("a", href=True):
            # The fragment is never sent to the server, so "#section" variants
            # of a page are the same document; drop it before de-duplicating.
            full_sub_link_url = urljoin(link, sub_link['href']).split('#', 1)[0]
            if full_sub_link_url in seen_urls:
                continue
            # Compare the host rather than searching the whole URL, so external
            # links that merely embed the domain (e.g. share buttons) are skipped.
            if not _is_same_site(full_sub_link_url, domain):
                continue
            seen_urls.add(full_sub_link_url)

            try:
                sub_response = requests.get(full_sub_link_url, headers=headers, timeout=10)
                sub_response.raise_for_status()

                if "text/html" in sub_response.headers.get("Content-Type", ""):
                    html = BeautifulSoup(sub_response.content, "html.parser").prettify()
                    program["pages"].append({
                        "url": full_sub_link_url,
                        "html": html
                    })
                else:
                    print(f"Skipped non-HTML subpage: {full_sub_link_url}")
            except Exception as sub_e:
                # Best effort: a broken subpage must not discard the whole program.
                print(f"Error fetching subpage {full_sub_link_url}: {sub_e}")
                continue

        # Generate domain-based filename
        parsed_domain = urlparse(link).netloc.replace("www.", "").replace(".", "_")
        filename = f"program_{parsed_domain}.json"

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(program, f, indent=4, ensure_ascii=False)

        print(f"Saved: {filename}")

    except (requests.RequestException, ValueError) as e:
        print(f"Skipping {link} due to error: {e}")
        continue

    # Only fully scraped programs are registered and consume a "p<i>" key.
    programs_json[f"p{i}"] = program
    i += 1


Saved: program_master-goals_eu.json
Saved: program_coasthazar_eu.json
Skipped non-HTML subpage: https://www.ntnu.edu/documents/1310180250/0/Comem%2B_Brochure+202324.pdf/a3c2894e-1701-d06e-03cc-5cc8eaffaf85?t=1697447978748
Error fetching subpage https://www.ntnu.edu/c/portal/login?p_l_id=1311043821: 403 Client Error: Forbidden for url: https://www.ntnu.edu/c/portal/login?p_l_id=1311043821
Saved: program_ntnu_edu.json
Error fetching subpage https://cartographymaster.eu/thesis: 404 Client Error: Not Found for url: https://cartographymaster.eu/thesis
Skipped non-HTML subpage: http://cartographymaster.eu/wp-content/documents/CARTOGRAPHY_Student_Handbook.pdf
Saved: program_cartographymaster_eu.json
Saved: program_master-promise_eu.json
Error fetching subpage https://www.tpti.eu/en/presentation/partners/universities.html: 404 Client Error: Not Found for url: https://www.tpti.eu/en/presentation/partners/universities.html
Error fetching subpage https://www.tpti.eu/en/presentation/partners/cultu

In [5]:
# Build one row per program: [list_of_titles, page_url_1, page_url_2, ...]
all_urls = []

for program_key, program_data in programs_json.items():
    # A record left behind by a link that failed before its titles were
    # extracted has no "titles" key; use an empty list instead of raising KeyError.
    program_row = [program_data.get("titles", [])]
    # Append the URL of every page stored for this program
    program_row.extend(page["url"] for page in program_data["pages"])
    all_urls.append(program_row)
    

In [6]:
# Show the list of titles (first element of each row) for every program
for program_row in all_urls:
    program_titles = program_row[0]
    print(program_titles)

['Master GOALS - GOvernance & Administration of Leisure and Sports International Master']
['Erasmus Mundus | CoastHazar']
['Coastal and Marine Engineering and Management (MSCOMEMPLUS) - NTNU']
['Welcome to the Cartography M.Sc. programme']
['EMJM PROMISE – Erasmus Mundus Joint Master in Sustainable Mineral and Metal Processing Engineering']
['Master Techniques, Heritage, Territories of Industry - Erasmus Mundus+']
['EMILDAI']
['ISLANDS MSc – Islands and Sustainability']
['AN ERASMUS MUNDUS MASTER IN QUANTUM TECHNOLOGIES AND ENGINEERING', 'QUANTEEM, An engineering erasmus mundus master.']
['EMJM-SBBE – Sustainable Biomass and Bioproducts Engineering']
['DANUBE AGRIFOOD MASTER – JOINT MASTER PROGRAM - Home']
['Joint Master Programme in Marine Biotechnology - EU-CONEXUS']
['Erasmus Mundus Joint Master - ChEMoinformatics+ Unique in Europe']
['Programme - Transnational German Studies']
['Erasmus Mundus Quarmen Master', "International Master's programme in Quantum Science and Technology"]
[]

In [7]:
# all_urls