In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

# Configure Chrome options to load content as soon as possible
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run in headless mode - Commented out to see the browser in action
chrome_options.add_argument('--disable-gpu')  # Disable GPU rendering
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')  # Prevent detection as automation

# Initialize the Selenium WebDriver with Chrome options
driver = webdriver.Chrome(options=chrome_options)  
# driver.get("https://www.coursera.org/search?query=machine%20learning&language=English&productDifficultyLevel=Intermediate&productDifficultyLevel=Mixed&productDifficultyLevel=Advanced&sortBy=BEST_MATCH")
driver.get("https://www.coursera.org/courses?query=artificial%20intelligence&page=1")
# driver.get("https://www.coursera.org/search?query=machine%20learning&language=English&productDifficultyLevel=Advanced&productDifficultyLevel=Intermediate&productDifficultyLevel=Mixed&topic=Math%20and%20Logic&sortBy=BEST_MATCH")

# Gradual scroll to the bottom to load content progressively
scroll_pause_time = .2  # Time to wait for content to load after each scroll
scroll_increment = 500  # Amount of pixels to scroll down each time

last_height = driver.execute_script("return document.body.scrollHeight")
current_position = 0

# Scroll gradually to the bottom of the page
while True:
    # Scroll by the increment
    current_position += scroll_increment
    driver.execute_script(f"window.scrollTo(0, {current_position});")
    time.sleep(scroll_pause_time)  # Wait for new content to load

    # Get the new height after scrolling
    new_height = driver.execute_script("return document.body.scrollHeight")

    # Stop if we reached the bottom
    if current_position >= new_height:
        break

    last_height = new_height

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Use the CSS selector to find all products
# products = soup.select("#search-main > div.css-q9p10k > div > div > div > div > div.cds-9.css-1xdhyk6.cds-11.cds-grid-item.cds-56.cds-81 > ul:nth-child(2) > li > div > div > div > div > div > div.cds-ProductCard-content")
products = soup.select("#main > div.css-q9p10k > div > div > div:nth-child(2) > div > div.cds-9.css-1xdhyk6.cds-11.cds-grid-item.cds-56.cds-81 > div > div > ul > li:nth-child(2) > div > div")

# Create an empty list to store product information
data = []

# Extract specific text elements within each product card
if products:
    for product in products:
        product_info = {}

        # Extract the partner info (dynamic ID handling)
        partner_info = product.select_one("[id^='cds-react-aria-'][id$='-product-card-partner-info'] p")
        product_info['Partner_Info'] = partner_info.get_text(strip=True) if partner_info else 'N/A'

        # Extract the product card title (dynamic ID handling)
        product_title = product.select_one("[id^='cds-react-aria-'][id$='-product-card-title']")
        product_info['Product_Title'] = product_title.get_text(strip=True) if product_title else 'N/A'

        # Extract the product card body
        product_body = product.select_one("div.cds-ProductCard-body")
        product_info['Product_Body'] = product_body.get_text(strip=True) if product_body else 'N/A'

        # Extract the rating meter (dynamic ID handling)
        rating_meter = product.select_one("[id^='cds-react-aria-'][id$='-meter'] span")
        product_info['Rating_Meter'] = rating_meter.get_text(strip=True) if rating_meter else 'N/A'

        # Extract other specific information (dynamic ID handling)
        additional_info = product.select_one("[id^='cds-react-aria-'] > div.css-vac8rf")
        if additional_info:
            additional_info_text = additional_info.get_text(strip=True)
            # Multiply values with 'K' by 1000
            additional_info_text = re.sub(r'(\d+(\.\d+)?)K', lambda x: str(int(float(x.group(1)) * 1000)), additional_info_text)
            product_info['Additional_Info'] = additional_info_text
        else:
            product_info['Additional_Info'] = 'N/A'

        # Extract metadata from product footer and split into separate columns
        product_footer = product.select_one("div.cds-ProductCard-footer div.cds-CommonCard-metadata > p")
        if product_footer:
            footer_text = product_footer.get_text(strip=True)
            footer_parts = [part.strip() for part in footer_text.split('\u00b7')]
            product_info['Difficulty_Level'] = footer_parts[0] if len(footer_parts) > 0 else 'N/A'
            product_info['Course_Type'] = footer_parts[1] if len(footer_parts) > 1 else 'N/A'
            duration_text = footer_parts[2] if len(footer_parts) > 2 else 'N/A'
            # Convert duration to weeks
            if 'Months' in duration_text:
                duration_range = re.findall(r'\d+', duration_text)
                if len(duration_range) == 2:
                    duration_in_weeks = f"{int(duration_range[0]) * 4} - {int(duration_range[1]) * 4} Weeks"
                elif len(duration_range) == 1:
                    duration_in_weeks = f"{int(duration_range[0]) * 4} Weeks"
                else:
                    duration_in_weeks = 'N/A'
            elif 'Weeks' in duration_text:
                duration_in_weeks = duration_text
            else:
                duration_in_weeks = 'N/A'
            product_info['Duration'] = duration_in_weeks
        else:
            product_info['Difficulty_Level'] = 'N/A'
            product_info['Course_Type'] = 'N/A'
            product_info['Duration'] = 'N/A'

        data.append(product_info)

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)

    # Print the DataFrame
    print(df)

    # Write the DataFrame to a CSV file
    df.to_csv('coursera_courses.csv', index=False)
else:
    print("No products found. Please check the page or the CSS selector.")

# Close the browser
driver.quit()

No products found. Please check the page or the CSS selector.
