In [416]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [490]:
def extract(url, max_certificates=5):
    """
        Scrape a Coursera certificates page.

        Downloads `url`, locates the <ul id="professional-certificates"> list
        inside the page's <main> element and builds two dataframes from it:
        one row per certificate (via get_certificates) and one row per course
        of the first `max_certificates` certificates (via get_courses).

        Parameters:
            url: address of the certificates page to scrape.
            max_certificates: how many certificates, counted from the top of
                the list, have their courses scraped. Defaults to 5; pass None
                to scrape the courses of every certificate.

        Returns:
            A (certificates_df, courses_df) tuple of pandas dataframes.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            ValueError: if the <main> element or the certificates list is
                missing from the downloaded page.
    """

    # Fail fast on HTTP errors instead of parsing an error page; the timeout
    # keeps a stalled connection from blocking the notebook indefinitely.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (get_courses uses 'html.parser' as well).
    soup = BeautifulSoup(r.text, "html.parser")

    main_frame = soup.find("main")
    if main_frame is None:
        raise ValueError("No <main> element found at " + url)

    certificates_list = main_frame.find("ul", {"id": ['professional-certificates']})
    if certificates_list is None:
        raise ValueError("No 'professional-certificates' list found at " + url)
    certificates = certificates_list.find_all("li")

    certificates_df = get_certificates(certificates)
    if max_certificates is None:
        selected_certificates = certificates_df
    else:
        selected_certificates = certificates_df.head(max_certificates)
    courses_df = get_courses(selected_certificates)
    return certificates_df, courses_df
    

In [491]:
def get_certificates(certificates_frame):
    """
        Build a dataframe with one row per scraped certificate.

        Parameters:
            certificates_frame: iterable of parsed certificate elements. Each
                element must offer find_all("p") returning at least three
                paragraphs and find("a") returning a link with an "href"
                attribute.

        Returns:
            A pandas dataframe with the columns "Name", "Offered by",
            "Duration" and "Course Link" (empty, but with those columns, when
            no certificates are given).
    """

    columns = ["Name", "Offered by", "Duration", "Course Link"]

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every row.
    records = []
    for certificate in certificates_frame:
        details = certificate.find_all("p")
        records.append({
            "Name": details[0].string,
            # The first 11 characters of the second paragraph are dropped -
            # presumably the "Offered by " prefix; TODO confirm against the page.
            "Offered by": details[1].string[11:],
            "Duration": details[2].string,
            "Course Link": certificate.find("a").attrs["href"],
        })

    return pd.DataFrame(records, columns=columns)

In [476]:
def get_courses(extracted_certificates):
    """
        Scrape the courses of every certificate in `extracted_certificates`.

        Each certificate page is opened in headless Chrome (image loading
        disabled), an optional button on the page is clicked when present, and
        the rendered HTML is handed to courses_details. A single browser
        session is reused for all pages and is always shut down, even when
        scraping fails part-way through.

        Parameters:
            extracted_certificates: dataframe with at least the columns
                "Name" and "Course Link" (a path relative to coursera.org).

        Returns:
            A pandas dataframe with the columns "Name", "Rating",
            "Number of ratings", "Description", "Course Link" and
            "Certification", holding the courses of all certificates.
    """
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # NOTE: the original author planned an additional specialization column.
    columns = ["Name", "Rating", "Number of ratings", "Description", "Course Link", "Certification"]

    # Nothing to scrape: do not start a browser at all.
    if extracted_certificates.shape[0] == 0:
        return pd.DataFrame(columns=columns)

    base_url = "https://www.coursera.org"

    # Block images via ChromeOptions object
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument('--headless')

    frames = []
    driver = webdriver.Chrome(options=chrome_options)
    try:
        for _, certificate in extracted_certificates.iterrows():
            driver.get(base_url + certificate["Course Link"])

            # Best effort: click the button if the page has one - presumably
            # it expands the course list (TODO confirm). Pages without it, or
            # where it cannot be clicked, are parsed as they are.
            try:
                button = driver.find_element(By.CLASS_NAME, '_3hmsj').find_element(By.CLASS_NAME, "_l3lqkfz")
                button.click()
            except WebDriverException:
                pass

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            frames.append(courses_details(soup, certificate["Name"]))
    finally:
        # Always release the browser process, also when a page fails.
        driver.quit()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # Empty frames are left out so they cannot influence the result dtypes.
    non_empty_frames = [frame for frame in frames if not frame.empty]
    if not non_empty_frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(non_empty_frames, ignore_index=True)

In [473]:
def courses_details(certification, certification_name):
    """
        Collect the courses listed on one parsed certificate page.

        Every <div class="CourseItem"> in `certification` becomes one row.
        Courses without a "rating-text" span get None for both "Rating" and
        "Number of ratings" instead of aborting the whole scrape.

        Parameters:
            certification: parsed page (BeautifulSoup object) of a certificate.
            certification_name: value stored in the "Certification" column of
                every row.

        Returns:
            A pandas dataframe with the columns "Name", "Rating",
            "Number of ratings", "Description", "Course Link" and
            "Certification".
    """
    columns = ["Name", "Rating", "Number of ratings", "Description", "Course Link", "Certification"]

    records = []
    for course in certification.find_all("div", {"class": ["CourseItem"]}):
        rating = None
        number_of_ratings = None

        rating_span = course.find("span", {"class": ['rating-text']})
        if rating_span is not None:
            # The first text node inside the span is the rating value.
            rating_texts = rating_span.find_all(string=True)
            if rating_texts:
                rating = rating_texts[0]

            # The ratings count is the text of the first tag nested in the
            # element that follows the span. Text nodes (and None) have no
            # find_all, in which case the count stays None.
            search_sibling = getattr(rating_span.next_sibling, "find_all", None)
            sibling_tags = search_sibling() if callable(search_sibling) else []
            if sibling_tags:
                number_of_ratings = sibling_tags[0].text

        records.append({
            "Name": course.find("h3").string,
            "Rating": rating,
            "Number of ratings": number_of_ratings,
            "Description": course.find("p").string,
            "Course Link": course.find("a", {"data-e2e": "course-link"}).attrs["href"],
            "Certification": certification_name,
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=columns)

In [529]:
def transform_certificates(extracted_certificates):
    """
        Derive the final certificates table from the scraped one.

        The "Duration" text is split on single spaces; its first token becomes
        "Duration (Months)" and the token right before the word "hours"
        becomes "Required effort (Hours)". Both stay strings. Rows whose
        duration is missing, or has no "hours" token, get None in the
        affected column. "Duration" itself is dropped and "Course Link" is
        turned into an absolute coursera.org URL.

        The input dataframe is left untouched, so the function can be re-run
        on the same extracted data.

        Parameters:
            extracted_certificates: dataframe with at least the columns
                "Duration" and "Course Link".

        Returns:
            A new dataframe with the transformed columns.
    """
    base_url = "https://www.coursera.org"

    def _months(tokens):
        # Missing durations come out of .str.split as NaN, not as a list.
        if isinstance(tokens, list) and tokens:
            return tokens[0]
        return None

    def _hours(tokens):
        if isinstance(tokens, list) and "hours" in tokens:
            position = tokens.index("hours")
            # "hours" as the very first token has no number in front of it.
            if position > 0:
                return tokens[position - 1]
        return None

    # Extracting Months and required hours from duration
    duration_tokens = extracted_certificates["Duration"].str.split(" ")

    # Dropping the Duration column (drop returns a new frame, so the
    # caller's data is not modified)
    transformed = extracted_certificates.drop(columns=["Duration"])
    transformed["Duration (Months)"] = duration_tokens.apply(_months)
    transformed["Required effort (Hours)"] = duration_tokens.apply(_hours)

    # Changing the course links
    transformed["Course Link"] = base_url + transformed["Course Link"]

    return transformed

In [503]:
def transform_courses(extracted_courses):
    """
        Reduce "Number of ratings" to its first space-separated token.

        Missing values stay missing instead of cancelling the transformation
        for every row. A frame without the column, or with a column that does
        not hold text, is returned unchanged. The input dataframe itself is
        never modified.

        Parameters:
            extracted_courses: dataframe of scraped courses.

        Returns:
            A new dataframe with the shortened "Number of ratings" column.
    """
    column = "Number of ratings"
    transformed = extracted_courses.copy()

    if column not in transformed.columns:
        return transformed

    try:
        # The .str accessors skip missing values, so one course without
        # ratings no longer prevents the other rows from being transformed.
        transformed[column] = transformed[column].str.split(" ").str[0]
    except AttributeError:
        # .str is only available on text-like columns; keep such a column
        # as it is, matching the best-effort intent of this step.
        pass

    return transformed

In [498]:
def transform(extracted_certificates, extracted_courses):
    """
        Run the transform phase: certificates first, then courses, each
        through its own transform function. The two results are handed back
        as a (certificates, courses) pair.
    """
    return (
        transform_certificates(extracted_certificates),
        transform_courses(extracted_courses),
    )

In [470]:
def load(data, target):
    """
        Write `data` to `target` in CSV form through its to_csv method,
        leaving the index out of the output.
    """
    data.to_csv(path_or_buf=target, index=False)

In [471]:
from datetime import datetime

def log(message, log_file="logs.txt"):
    """
        Append `message`, followed by " at <timestamp>", as one line to a
        log file.

        Parameters:
            message: text of the log entry.
            log_file: path of the file to append to; defaults to "logs.txt"
                in the current working directory.
    """
    # %b is the portable spelling of the abbreviated month name; %h is a
    # platform extension with the same meaning that not every C library
    # accepts.
    timestamp_format = "%Y-%b-%d-%H:%M:%S"
    timestamp = datetime.now().strftime(timestamp_format)

    # Explicit encoding so the file reads the same on every platform.
    with open(log_file, "a", encoding="utf-8") as file:
        file.write(message + " at " + timestamp + "\n")

# ETL Process

In [533]:
# Page listing the professional certificates to scrape.
CERTIFICATES_URL = 'https://www.coursera.org/certificates/advance-your-career#professional-certificates'

log("ETL Process started")

before = datetime.now()
log("Extract phase started")
extracted_certificates, extracted_courses = extract(CERTIFICATES_URL)
log("Extract phase ended")
after = datetime.now()

# Wall-clock duration of the extract phase.
print(after-before)

log("Transform phase started")
transformed_certificates, transformed_courses = transform(extracted_certificates, extracted_courses)
log("Transform phase ended")

log("Load phase started")
load(transformed_certificates, "professional_certificates.csv")
load(transformed_courses, "courses.csv")
log("Load phase ended")

# Close the entry opened by "ETL Process started" so a complete run can be
# told apart from an aborted one in the log file.
log("ETL Process ended")

0:01:30.082502
