In [21]:
import requests
from bs4 import BeautifulSoup
import json


In [22]:
# Catalog URL: address of the Erasmus Mundus catalogue listing.
# get_program_links requests it with a `page` query parameter to select one results page.
base_url = "https://www.eacea.ec.europa.eu/scholarships/erasmus-mundus-catalogue_en"

# Function to get the links from a specific page
def get_program_links(page_number, timeout=10):
    """Return the external programme links found on one catalogue results page.

    Parameters
    ----------
    page_number : int
        Value sent as the ``page`` query parameter of ``base_url``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    list of str
        The ``href`` of every ``<a>`` tag that is an absolute http(s) URL and
        does not contain "europa". An empty list is returned, after printing a
        message, when the page cannot be retrieved.
    """
    try:
        # params= lets requests build (and escape) the ?page=<n> query string
        response = requests.get(base_url, params={"page": page_number}, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like a bad status code
        print(f"Failed to retrieve page {page_number}: {exc}")
        return []

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve page {page_number}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    hrefs = (a['href'].strip() for a in soup.find_all('a', href=True))
    # Test the scheme prefix rather than `"http" in href`, which would also
    # accept relative links that merely embed a URL in their path or query.
    return [
        href
        for href in hrefs
        if href.lower().startswith(("http://", "https://")) and "europa" not in href
    ]

# Fetch the two catalogue pages labelled "Page 10" and "Page 11" below.
# They are requested as page=9 and page=10 - presumably the catalogue's page
# parameter is zero-based; TODO confirm against the site's pagination.
page_10_links, page_11_links = (get_program_links(page_index) for page_index in (9, 10))

# Print each page's links under its own heading, one URL per line
for heading, page_links in (
    ("Links from Page 10:", page_10_links),
    ("\nLinks from Page 11:", page_11_links),
):
    print(heading)
    for link in page_links:
        print(link)


Links from Page 10:
http://se4gd.eu/
http://www.master-seas40.unina.it/
https://secclo.eu/
https://sinrem.eu/
https://ssi-master.eu/
http://www.emjmdsteps.eu/
https://master-strains.eu/
https://www.master-sucat.eu/
http://www.emm-nano.org/
https://mundusjournalism.com/
https://www.tise-master.eu/
http://emissmaster.omu.edu.tr/
http://www.emhrpp.com/
https://master-waves.eu/
https://we-team.education/
https://www.eusmat.net/international-studies/master/amase/
http://www.nas.boku.ac.at/nuwi/emabg/
https://www.eimas.uni-bayreuth.de/en/
https://www.analyticalchemistry.eu/
https://www.docnomads.eu/

Links from Page 11:
https://cosi-master.eu/
https://www.master-cne.eu/
https://www.cle.unibo.it/
https://www.jointdegree.eu/de/circle-erasmus-mundus-international-masters-programme-on-circular-economy/
https://erasmusmundus-ceeres.eu/
https://master-bioref.eu
https://master-biopham.eu/
https://www.bioceb.eu/
https://bimaplus.org/
http://www.beinprecisionmedicine.org/
https://bdma.ulb.ac.be/bdma/

In [23]:
# Single work list for the crawl: the page-10 links followed by the page-11 links
program_links = [*page_10_links, *page_11_links]

In [67]:
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Running number of the next program to store; it forms the "p<i>" keys of
# programs_json and the "program_p<i>.json" file names written by the crawl.
i = 1
# Scraped data of every program, keyed by "p<i>"
programs_json = dict()

# Browser-like User-Agent header sent with the requests to the program sites
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    )
}


# Phrases that mark a heading as a program title (used by extract_titles)
title_prefixes = (
    "Erasmus Mundus Master",
    "International MSc",
    "International Master",
)

def extract_titles(soup, prefixes=None):
    """Collect candidate program titles from a parsed page.

    The text of the ``<title>`` tag is always included when it is non-empty.
    ``<h1>``, ``<h2>`` and ``<h3>`` headings are included when they contain
    one of ``prefixes`` anywhere in their text (case-insensitive).

    Parameters
    ----------
    soup : object
        Parsed document offering ``find(name)`` and ``find_all(name)`` whose
        results expose a ``.text`` attribute (e.g. a BeautifulSoup object).
    prefixes : iterable of str, optional
        Phrases that mark a heading as a title. Defaults to the module-level
        ``title_prefixes``.

    Returns
    -------
    list of str
        Unique titles in first-seen order (``<title>`` first, then matching
        h1, h2 and h3 headings), with non-breaking spaces replaced by regular
        spaces and the text "Home - " removed.
    """
    if prefixes is None:
        prefixes = title_prefixes
    needles = [prefix.lower() for prefix in prefixes]

    def clean(text):
        # Same normalisation for every title source
        return text.strip().replace('\xa0', ' ').replace('Home - ', '')

    titles = []

    # Extract <title> tag text
    title_tag = soup.find("title")
    if title_tag is not None and title_tag.text.strip():
        titles.append(clean(title_tag.text))

    # Extract <h1>, <h2>, <h3> headings that contain one of the phrases
    for tag in ("h1", "h2", "h3"):
        for heading in soup.find_all(tag):
            heading_text = heading.text.strip()
            lowered = heading_text.lower()
            if any(needle in lowered for needle in needles):
                titles.append(clean(heading_text))

    # dict.fromkeys drops duplicates but, unlike set(), keeps insertion order,
    # so the result is identical from run to run.
    return list(dict.fromkeys(titles))

# Seconds allowed per HTTP request; applied to every request so that a single
# stalled server cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def _site_host(url):
    """Return the URL's host in lower case, without a leading "www."."""
    host = urlparse(url).netloc.lower()
    return host[4:] if host.startswith("www.") else host


def _fetch_pretty_html(url):
    """Download `url` and return its HTML re-indented by BeautifulSoup.

    Raises requests.RequestException on connection errors, timeouts and HTTP
    error statuses, so error pages are never stored as program content.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser").prettify()


def _scrape_program(link):
    """Download one program site and return its record.

    The record is a dict with the keys "url" (the catalogue link), "pages"
    (a list of {"url", "html"} dicts: the canonical landing page followed by
    every same-site page linked from it) and "titles" (from extract_titles).

    Raises requests.RequestException when the landing page or its canonical
    URL cannot be downloaded, and ValueError when a URL cannot be parsed.
    Sub-pages that fail are reported and left out instead.
    """
    page = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()  # Raise error for HTTP issues (e.g., 404, 500)
    soup = BeautifulSoup(page.content, "html.parser")

    canonical_tag = soup.find("link", rel="canonical")
    if canonical_tag and canonical_tag.get('href'):
        program_base = urljoin(link, canonical_tag['href'])
    else:
        program_base = link  # Fallback to the original link if no canonical tag

    pages = [{"url": program_base, "html": _fetch_pretty_html(program_base)}]

    base_host = _site_host(program_base)  # computed once, not per sub-link
    # URLs already handled; a set keeps the duplicate check O(1)
    seen_urls = {program_base, program_base.split("#", 1)[0]}

    for anchor in soup.find_all("a", href=True):
        try:
            # Drop the fragment: "page#section" is the same document as "page"
            sub_url = urljoin(link, anchor['href']).split("#", 1)[0]
            if sub_url in seen_urls:
                continue
            seen_urls.add(sub_url)
            if urlparse(sub_url).scheme not in ("http", "https"):
                continue  # mailto:, javascript:, tel:, ...
            sub_host = _site_host(sub_url)
        except ValueError:
            continue  # malformed href; ignore this link only

        # Compare hosts instead of searching the whole URL, so links to other
        # sites that merely embed this site's address (share buttons,
        # redirects) are not crawled. Sub-domains of the site are kept.
        same_site = bool(base_host) and (
            sub_host == base_host or sub_host.endswith("." + base_host)
        )
        if not same_site:
            continue

        try:
            pages.append({"url": sub_url, "html": _fetch_pretty_html(sub_url)})
        except requests.RequestException as e:
            # One unreachable sub-page must not discard the whole program
            print(f"  Skipping page {sub_url} due to error: {e}")

    return {"url": link, "pages": pages, "titles": extract_titles(soup)}


for link in program_links:
    try:
        program = _scrape_program(link)
    except (requests.RequestException, ValueError) as e:
        print(f"Skipping {link} due to error: {e}")
        continue  # i is not advanced, so stored programs stay consecutively numbered

    # Register the program only once it has been scraped completely, so
    # programs_json never holds a half-filled entry without "titles".
    programs_json[f"p{i}"] = program

    filename = f"program_p{i}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(program, f, indent=4, ensure_ascii=False)

    print(f"Saved: {filename}")  # Indicate progress

    i += 1


Saved: program_p1.json
Saved: program_p2.json
Saved: program_p3.json
Saved: program_p4.json
Saved: program_p5.json
Skipping http://www.emjmdsteps.eu/ due to error: HTTPSConnectionPool(host='www.emjmdsteps.eu', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: UNSAFE_LEGACY_RENEGOTIATION_DISABLED] unsafe legacy renegotiation disabled (_ssl.c:1020)')))
Saved: program_p6.json
Saved: program_p7.json
Saved: program_p8.json
Saved: program_p9.json
Saved: program_p10.json
Saved: program_p11.json
Saved: program_p12.json
Saved: program_p13.json
Saved: program_p14.json


  k = self.parse_starttag(i)


Saved: program_p15.json
Saved: program_p16.json
Saved: program_p17.json
Saved: program_p18.json
Saved: program_p19.json
Saved: program_p20.json
Saved: program_p21.json
Saved: program_p22.json
Skipping https://www.jointdegree.eu/de/circle-erasmus-mundus-international-masters-programme-on-circular-economy/ due to error: 404 Client Error: Not Found for url: https://www.jointdegree.eu/de/circle-erasmus-mundus-international-masters-programme-on-circular-economy/
Saved: program_p23.json
Saved: program_p24.json
Saved: program_p25.json
Saved: program_p26.json
Saved: program_p27.json
Saved: program_p28.json
Saved: program_p29.json
Saved: program_p30.json
Saved: program_p31.json
Saved: program_p32.json


In [68]:
# Build one row per program in programs_json: [titles, page_url_1, page_url_2, ...]
# .get() tolerates an entry that has no "titles" key (e.g. one left incomplete
# by a failed download) instead of raising KeyError.
all_urls = [
    [program_data.get("titles", []), *(page["url"] for page in program_data["pages"])]
    for program_data in programs_json.values()
]
    

In [71]:
# Every row of all_urls starts with the program's list of titles,
# followed by its page URLs; show only the titles.
for i in all_urls:
    titles_of_program = i[0]
    print(titles_of_program)

['SE4GD – Software engineering with a purpose']
['Master Seas 4.0 – Università degli Studi di Napoli Federico II']
[]
['SINReM – International Master of Science', 'International MSc in Sustainable and Innovative Natural Resource Management']
['Joint International Master in Smart Systems Integrated Solutions', 'Joint International Master in Smart Systems Integrated Solutions (SSIs) - SSIs', 'Joint International Master in Smart Systems Integrated Solutions (SSIs)']
['Master Erasmus Mundus STRAINS Université de Lille']
['Erasmus Mundus Master in Sustainable Catalysis', 'Master SuCat – Université de Poitiers']
['EMM Nano | Erasmus Mundus', 'Erasmus Mundus Master Nanoscience and Nanotechnology']
["Erasmus Mundus Master's in Journalism, Media and Globalisation"]
['TISE']
['The Erasmus Mundus Master in Soil Science (emiSS)']
['Erasmus Mundus - Home']
['ERASMUS MUNDUS MASTER WAVES', 'Master Waves | Home']
['WE-TEAM']
['AMASE - European School of Materials | EUSMAT | Saarland University']
['Eur

In [73]:
# all_urls