In [1]:
from bs4 import BeautifulSoup
import requests
import csv 
import os


In [2]:
def get_page_contents(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    page = requests.get(url, headers=headers)

    if page.status_code == 200:
        return page.text

    return None

In [3]:
def get_masters_CCA(page_contents):
    soup = BeautifulSoup(page_contents, 'html.parser')
    structured_content = {}
    
    # Find the main div containing the sections
    parcours_compta_div = soup.find('div', id='elementor-tab-content-2011', class_='elementor-tab-content elementor-clearfix')
    
    if parcours_compta_div:
        # Extract headings and their corresponding content
        headings = parcours_compta_div.find_all('h3')
        for heading in headings:
            key_text = heading.get_text(strip=True)
            values = []
            
            # Special case for "Matières par semestre"
            if key_text == 'Matières par semestre':
                semester_content = {}
                # Locate each semester section
                semesters = parcours_compta_div.find_all('div', class_='elementor-accordion-item')
                for semester in semesters:
                    semester_heading = semester.find('a').get_text(strip=True)  # e.g., "Semestre 1"
                    semester_content_list = semester.find('p')  # The paragraph contains the semester details
                    if semester_heading and semester_content_list:
                        materials = semester_content_list.get_text(strip=True).splitlines()  # Split into a list of lines
                        semester_content[semester_heading] = materials
                structured_content[key_text] = semester_content
            elif(key_text == 'Contenu'):
                values = []
                for sibling in heading.find_next_siblings(['ul'], limit=1):
                    values.extend([li.get_text(strip=True) for li in sibling.find_all('li')])
                structured_content[key_text] = values
            else:
                # For other sections like "Objectifs," "Contenu," etc.
                for sibling in heading.find_next_siblings(['p', 'ul'], limit=3):  # Limit to avoid unrelated content
                    if sibling.name == 'p':
                        values.append(sibling.get_text(strip=True))
                    elif sibling.name == 'ul':
                        values.extend([li.get_text(strip=True) for li in sibling.find_all('li')])
                structured_content[key_text] = values
    
    return structured_content


In [4]:
if __name__ == '__main__':
    url = 'https://www.esb.tn/programmes/masters-professionnels/master-professionnel-de-comptabilite-controle-audit/'
    page_contents = get_page_contents(url)
    result=[]   #keep result when appending all of the programs for now we will try to use it for csv test file

    if page_contents:
        result.append(get_masters_CCA(page_contents))
        #print(get_masters_CCA(page_contents))
    else:
        print('Failed to get page contents.')

In [5]:
field_names = [
    'Objectifs', 'Contenu', 'Compétences', 'Métiers', 'Secteurs d’activité', 'Partenariats professionnels',
    'Semestre 1', 'Semestre 2', 'Semestre 3', 'Semestre 4'
]

# Flatten the data
flattened_result = {}
for dict in result:
    for key, value in dict.items():
        if key == 'Matières par semestre':
            # Extract each semester as a separate field
            for semester, courses in value.items():
                flattened_result[semester] = ', '.join(courses)
        elif isinstance(value, list):
            # Join list items with commas
            flattened_result[key] = ', '.join(value)
        else:
            flattened_result[key] = value
with open('./masters.csv', 'a', newline='', encoding='utf-8') as csvfile:
    # Check if the file is empty
    file_is_empty = os.stat('./masters.csv').st_size == 0

    # Create a DictWriter object
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    
    # Write the header only if the file is empty
    if file_is_empty:
        writer.writeheader()
    
    # Write the flattened result as a row
    writer.writerow(flattened_result)

In [6]:
import json
import os

# Write to JSON
file_path = './masters.json'

# Check if the file exists
file_exists = os.path.isfile(file_path)

# Read existing data if the file exists
if file_exists:
    with open(file_path, 'r', encoding='utf-8') as jsonfile:
        prev_data = json.load(jsonfile)
else:
    prev_data = []

# Append the new flattened result
prev_data.append(flattened_result)
print(prev_data)
# Write the updated data back to the JSON file
with open(file_path, 'w', encoding='utf-8') as jsonfile:
    json.dump(prev_data, jsonfile, ensure_ascii=False, indent=4)

[{'Objectifs': 'Le Master Professionnel en Business Analytics est co-construit avec l’entreprise « Business & Decision Tunisie ». Il est au croisement de la Data Science, de la Business Intelligence et du Management. Le programme vise à former des experts capables de développer des solutions décisionnelles analytiques.\nLe Master Professionnel en Business Analytics est disponible en double diplôme avec le Master Intelligence Artificielle & Business transformation avec PSTB.', 'Contenu': 'DATA SCIENCE ET INTELLIGENCE ARTIFICIELLE : Machine Learning, Deep Learning, Data Mining, Time Series, Statistical Analysis, Data Visualization.\nINFORMATIQUE : Big Data, Business Intelligence, Data Bases, Programmation\nMANAGEMENT : Business Process Management, E-business, Management Stratégique, Management de l’innovation.\nSOFT SKILLS\xa0 : Design Thinking, Séminaires, Développement Personnel, Langues.\nPROJET PROFESSIONNEL : Stages, PFE, Projets Intégrés, Visites d’entreprises, etc.', 'Compétences'