In [1]:
# Importing useful libraries
import requests
from bs4 import BeautifulSoup
import time
import os
import asyncio
import aiohttp
import pandas as pd
import asyncio
from aiohttp import ClientSession, ClientResponseError

## 1.1. Get the list of master's degree courses

We start with the list of courses to include in your corpus of documents. In particular, we focus on web scraping the MSc Degrees. Next, we want you to collect the URL associated with each course in the previously collected list. The list is long and split into many pages. Therefore, we ask you to retrieve only the URLs of the courses listed in the first 400 pages (each page has 15 courses, so you will end up with 6000 unique master's degree URLs).

The output of this step is a .txt file whose single line corresponds to the master's URL.

---
Let's use BeautifulSoup and Requests to scrape the links related to universities present on the first 400 pages of the following website:
'https://www.findamasters.com/masters-degrees/msc-degrees'.

Note that it's possible to navigate to different pages by modifying the link in the final part, appending "/?PG=" + the page number.

We save one URL per line in a text file called 'course_links.txt'. In this case there are no exceptions to handle.

In [166]:
# Build course_links.txt: one course URL per line, collected from the first
# 400 result pages of the MSc listing.

LISTING_URL = 'https://www.findamasters.com/masters-degrees/msc-degrees'
N_LISTING_PAGES = 400
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

with open('course_links.txt', 'w') as file:
    for i in range(1, N_LISTING_PAGES + 1):
        # Page 1 is the bare listing URL; every later page appends "/?PG=<n>".
        page_url = LISTING_URL if i == 1 else f'{LISTING_URL}/?PG={i}'
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx: an error page contains no course links, so
        # ignoring the status would silently leave holes in the URL list.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        course_link = soup.find_all('a', class_='courseLink')
        for link in course_link:
            href = link.get('href')
            if href:  # skip anchors without a target instead of writing "...None"
                # Lines are stored without a scheme ("www.findamasters.com/...").
                file.write(f"www.findamasters.com{href}\n")

        # Pause between requests to avoid sending too many requests to the website.
        time.sleep(1)

## 1.2. Crawl master's degree pages
Once you get all the URLs in the first 400 pages of the list, you:

1. Download the HTML corresponding to each of the collected URLs.
2. After you collect a single page, immediately save its HTML in a file. In this way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
3. Organize the downloaded HTML pages into folders. Each folder will contain the HTML of the courses on page 1, page 2, ... of the list of master's programs.

**Tip**: Due to the large number of pages you should download, you can use some methods that can help you shorten the time. If you employed a particular process or approach, kindly describe it.



---
The proposed solution for this task involves creating 400 folders, one for each listing page scraped in the previous exercise. Each folder stores the HTML of the 15 course pages linked from that listing page.

The script is organized into three asynchronous functions:
- *get_info(url, session, folder, page_number)*, which performs an asynchronous HTTP GET request to a URL (one line of the txt file created in the exercise above), writes the HTML page into the given folder and returns it. It handles the "Error 429: Too many requests" response: when it occurs, the function waits 1 second (with a non-blocking asyncio.sleep) and then sends the request again.
- *process_batch(urls, session, folder)*, which takes as input a batch of 15 URLs and creates a list of asynchronous tasks, each corresponding to fetching HTML content from a URL in the given list using the *get_info* function defined before. It uses asyncio.gather to execute all tasks concurrently and returns the results.
- *main(urls, batch_size, starting_folder)* which creates the path where all the html are put. It iterates through batches of URLs, creating a sub-folder for each batch and calling process_batch to asynchronously download and save HTML content for each URL in the batch.


The Python script is designed for asynchronous tasks using the aiohttp library to fetch HTML content from a list of URLs concurrently. 

In this particular case, downloading several pages at the same time is not very effective because the 6000 URLs are all served by the same server, so we need to wait (asyncio.sleep) whenever the server answers "Too many requests". As a consequence, the code takes several hours to complete. To mitigate this, we introduced the *starting_folder* parameter in the main function. This allows us to resume the download process from where it left off, avoiding the need to recreate files and folders from scratch each time.

In [None]:
async def get_info(url, session, folder, page_number, retry_delay=1, max_retries=None):
    """Download one page and save it as ``page_<page_number>.html`` inside *folder*.

    Args:
        url: Address without scheme; ``"https://"`` is prepended before the request.
        session: Object whose ``request(method=..., url=...)`` result can be used
            with ``async with`` (an ``aiohttp.ClientSession`` in this notebook).
        folder: Existing directory that receives the HTML file.
        page_number: Position of the page inside its batch; converted with ``int``.
        retry_delay: Seconds to wait after an HTTP 429 before asking again.
        max_retries: Maximum number of retries after HTTP 429. ``None`` (the
            default) keeps retrying until the server accepts the request.

    Returns:
        The HTML text that was written to disk.

    Raises:
        ClientResponseError: For any error status other than 429, or for 429
            once ``max_retries`` retries have been used.
    """
    file_path = os.path.join(folder, f"page_{int(page_number)}.html")
    retries = 0

    # Retry in a loop rather than recursively: a long period of throttling
    # would otherwise add one stack frame per attempt.
    while True:
        try:
            # ``async with`` releases the connection even when
            # raise_for_status() raises, so retries do not pile up open responses.
            async with session.request(method="GET", url="https://" + url) as resp:
                resp.raise_for_status()
                html = await resp.text(encoding='utf-8')
        except ClientResponseError as e:
            if e.status != 429:
                raise  # Re-raise other ClientResponseError
            if max_retries is not None and retries >= max_retries:
                raise
            retries += 1
            # Error 429: too many requests.
            print("Received 429 error. Too many requests. Waiting for a while...")
            await asyncio.sleep(retry_delay)
            continue

        # Write the html page in the right folder
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html)

        return html
        
async def process_batch(urls, session, folder):
    """Fetch every URL of one batch concurrently and return the results in input order.

    Each URL is handed to ``get_info`` together with its 1-based position in
    the batch, which ``get_info`` receives as ``page_number``.
    """
    pending = []
    for position, link in enumerate(urls, start=1):
        pending.append(get_info(link, session, folder, page_number=position))

    pages = await asyncio.gather(*pending)
    return pages

async def main(urls, batch_size=15, starting_folder=0, main_directory="master_programs_html"):
    """Download all URLs batch by batch, one sub-folder per batch.

    Batch number ``k`` (1-based) holds ``urls[(k-1)*batch_size : k*batch_size]``
    and is stored in ``<main_directory>/folder_<k>``.

    Args:
        urls: Sequence of URLs, in the order they should be grouped.
        batch_size: Number of URLs per folder.
        starting_folder: Number of batches to skip, used to resume an
            interrupted run; the first folder written is ``starting_folder + 1``.
        main_directory: Root directory for the ``folder_<k>`` sub-folders.

    Returns:
        A flat list with the result of every download performed in this call,
        in URL order (empty when there was nothing to download).
    """
    os.makedirs(main_directory, exist_ok=True)

    downloaded = []
    async with ClientSession() as session:
        batch_starts = range(starting_folder * batch_size, len(urls), batch_size)
        for count_folder, i in enumerate(batch_starts, start=starting_folder + 1):
            # selecting the URLs of this batch
            batch_urls = urls[i:i + batch_size]
            # Creating a sub-folder for every batch
            folder_name = os.path.join(main_directory, f"folder_{count_folder}")
            os.makedirs(folder_name, exist_ok=True)
            # Downloading and writing the HTML files of the batch; the results
            # are collected so the caller gets something it can iterate over.
            downloaded.extend(await process_batch(batch_urls, session, folder_name))

    return downloaded

if __name__ == "__main__":
    with open('course_links.txt', 'r') as file:
        urls = [line.strip() for line in file] # creating a list with the 6000 URLs from the lines of course_links.txt
        
    result = await main(urls, starting_folder = 305) # starting_folder represents the folder we are starting from

    for text in result:
        pass # text contains your html (text) response
    print("Download and organization of HTML pages completed.")


## 1.3 Parse downloaded pages
At this point, you should have all the HTML documents about the master's degree of interest, and you can start to extract specific information. The list of the information we desire for each course and their format is as follows:

- Course Name (to save as courseName): string;
- University (to save as universityName): string;
- Faculty (to save as facultyName): string
- Full or Part Time (to save as isItFullTime): string;
- Short Description (to save as description): string;
- Start Date (to save as startDate): string;
- Fees (to save as fees): string;
- Modality (to save as modality):string;
- Duration (to save as duration):string;
- City (to save as city): string;
- Country (to save as country): string;
- Presence or online modality (to save as administration): string;
- Link to the page (to save as url): string.

For each master's degree, you create a course_i.tsv file of this structure:

        courseName \t universityName \t  ... \t url
If an information is missing, you just leave it as an empty string.

---
First things first let's create the empty dataframe with the variables described above.

In [22]:
# Output schema: one column per attribute extracted from a course page,
# in the order the fields are written to the .tsv files.
columns = [
    "courseName", "universityName", "facultyName",
    "isItFullTime", "description", "startDate",
    "fees", "modality", "duration",
    "city", "country", "administration",
]

# Start from an empty frame that only carries the schema above
df = pd.DataFrame(columns=columns)

# Display the (still empty) DataFrame
df


Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration


Now we need to populate the dataframe, opening every html page from every folder created in the exercise 1.2.

After this operation we have to find specific elements in the HTML page and handle missing ones: for example, if the *find* function returns no matching element, the corresponding variable is set to "".

For every row of the dataframe we also create a .tsv file named after the master's degree, containing the corresponding variable values.

In [None]:
# Populate the dataframe and write one .tsv file per course.
path = "master_programs_html"
N_FOLDERS = 400        # folder_1 ... folder_400
PAGES_PER_FOLDER = 15  # page_1.html ... page_15.html in every folder

# exist_ok: re-running the cell must not fail because the folder already exists
os.makedirs("files_tsv", exist_ok=True)

# Characters replaced by "_" when a course name is used as a file name
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in "|&<>,-.'/"})


def _text_or_empty(tag):
    """Return the stripped text of a BeautifulSoup tag, or "" when the tag was not found."""
    return tag.get_text(strip=True) if tag is not None else ""


def _section_text(soup, section_class):
    """Return the text of the content <div> inside the given section <div>, or ""."""
    section = soup.find("div", {"class": section_class})
    if section is None:
        return ""
    return _text_or_empty(section.find("div", {"class": "course-sections__content"}))


# The download step stores line k (0-based) of course_links.txt as
# folder_{k // 15 + 1}/page_{k % 15 + 1}.html, so the page URL can be recovered
# from the position of the file. Without the link file the url stays "".
if os.path.exists('course_links.txt'):
    with open('course_links.txt', 'r') as links_file:
        course_urls = [line.strip() for line in links_file]
else:
    course_urls = []

records = []
count = 0
for folder in range(1, N_FOLDERS + 1): # loop for every folder
    for file in range(1, PAGES_PER_FOLDER + 1): # loop for every file
        file_path = os.path.join(path, f"folder_{folder}", f"page_{file}.html")
        with open(file_path, 'r', encoding='utf-8', errors='replace') as fl:
            soup = BeautifulSoup(fl, 'html.parser')

        # Every field falls back to "" when its element is missing from the page.
        courseName = _text_or_empty(soup.find("h1", {"class": "course-header__course-title"}))
        universityName = _text_or_empty(soup.find("a", {"class": "course-header__institution"}))
        facultyName = _text_or_empty(soup.find("a", {"class": "course-header__department"}))
        isItFullTime = _text_or_empty(soup.find("span", {"class": "key-info__study-type"}))
        description = _section_text(soup, "course-sections__description")
        startDate = _text_or_empty(soup.find("span", {"class": "key-info__start-date"}))
        fees = _section_text(soup, "course-sections__fees")
        modality = _text_or_empty(soup.find("span", {"class": "key-info__qualification"}))
        duration = _text_or_empty(soup.find("span", {"class": "key-info__duration"}))
        city = _text_or_empty(soup.find("a", {"class": "course-data__city"}))
        country = _text_or_empty(soup.find("a", {"class": "course-data__country"}))

        # "online", "on campus", both joined by " & ", or "" when neither is present
        online = _text_or_empty(soup.find("a", {"class": "course-data__online"}))
        on_campus = _text_or_empty(soup.find("a", {"class": "course-data__on-campus"}))
        administration = " & ".join(part for part in (online, on_campus) if part)

        link_index = (folder - 1) * PAGES_PER_FOLDER + (file - 1)
        if link_index < len(course_urls) and course_urls[link_index]:
            # the link file stores URLs without scheme
            url = "https://" + course_urls[link_index]
        else:
            url = ""

        new_row_data = {"courseName": courseName,
                        "universityName": universityName,
                        "facultyName": facultyName,
                        "isItFullTime": isItFullTime,
                        "description": description,
                        "startDate": startDate,
                        "fees": fees,
                        "modality": modality,
                        "duration": duration,
                        "city": city,
                        "country": country,
                        "administration": administration,
                        "url": url,
                        }
        records.append(new_row_data)

        # NOTE(review): files are named after the course, so two courses with
        # the same name overwrite each other's .tsv; confirm whether a unique
        # name (e.g. course_<i>.tsv) is wanted before changing it.
        file_stem = courseName.translate(_UNSAFE_FILENAME_CHARS)
        if file_stem.endswith(" "): # also safe when the name is empty
            file_stem = file_stem[:-1]
        tsv_path = os.path.join('files_tsv', f'{file_stem}.tsv')
        pd.DataFrame([new_row_data]).to_csv(tsv_path, sep='\t', index=False)
        count += 1
        print(f"Created {file_stem}.tsv, {count}/6000")

# Append all rows at once: concatenating inside the loop copies the whole
# frame for every new row (quadratic time).
df = pd.concat([df, pd.DataFrame(records)], ignore_index=True)

