# Web Scraping using Selenium and BeautifulSoup

https://colab.research.google.com/drive/1B-cStyriTOmnb8g1QD8eqmmaQATxIoPo

In [1]:
#Importing all important libraries for automation and scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv

In [2]:
# Function to scrape content using BeautifulSoup
def scrape_content(driver, page_link):
    """Extract the fields of one article from the page currently loaded in ``driver``.

    Parameters
    ----------
    driver : object exposing ``page_source``
        Typically a Selenium WebDriver; its current page source is parsed
        with BeautifulSoup's ``html.parser``.
    page_link : str
        URL of the page being scraped; returned unchanged as the last field.

    Returns
    -------
    list
        ``[topic, year, level, introduction, learning_outcomes, pdf_link, page_link]``.
        A field whose anchoring element is not found is ``"Doesn't Exist"``.
    """
    # Local import keeps this cell self-contained (stdlib only).
    from urllib.parse import urljoin

    missing = "Doesn't Exist"
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Article title. A page without the heading no longer raises AttributeError.
    title_tag = soup.find('h1', class_='article-title')
    topic = title_tag.text.strip() if title_tag is not None else missing

    # Curriculum year.
    year_span = soup.find('span', class_='content-utility-curriculum')
    year_text = year_span.get_text(strip=True) if year_span is not None else missing

    # Level: the text lives in a nested "content-utility-topic" span, which may
    # itself be absent even when the outer span exists.
    level_span = soup.find('span', class_='content-utility-level')
    level_topic = None
    if level_span is not None:
        level_topic = level_span.find('span', class_='content-utility-topic')
    level_text = level_topic.text.strip() if level_topic is not None else missing

    # Introduction / Overview paragraphs. ``string=`` is the current spelling of
    # the deprecated ``text=`` filter.
    intro_heading = soup.find('h2', class_='article-section', string=['Introduction', 'Overview'])
    if intro_heading is not None:
        pieces = []
        # NOTE(review): find_next_siblings('p') yields every later sibling <p>,
        # so paragraphs of following sections may be included - confirm intent.
        for p in intro_heading.find_next_siblings('p'):
            # Stop once a paragraph belongs to an "example" figure.
            if p.find_parents('figure', class_='example'):
                break
            pieces.append(p.get_text(strip=True))
        # Each paragraph is followed by a single space, as before.
        paragraphs = ''.join(piece + ' ' for piece in pieces)
    else:
        paragraphs = missing

    # Learning outcomes: prefer <li> items, fall back to <p> when there are none.
    outcomes_heading = soup.find('h2', class_='article-section', string='Learning Outcomes')
    outcomes_body = outcomes_heading.find_next_sibling() if outcomes_heading is not None else None
    if outcomes_body is not None:
        bullet_points = [li.get_text(strip=True) for li in outcomes_body.find_all('li')]
        # A list comprehension never returns None, so test for emptiness instead.
        if not bullet_points:
            bullet_points = [p.get_text(strip=True) for p in outcomes_body.find_all('p')]
        bullet = '\n'.join(bullet_points)
    else:
        bullet = missing

    # Full PDF link. urljoin leaves absolute hrefs intact and resolves relative ones.
    link_tag = soup.find('a', class_='locked-content')
    href = link_tag.get('href') if link_tag is not None else None
    full_link = urljoin("https://www.cfainstitute.org", href) if href else missing

    # One CSV row, in a fixed column order.
    return [topic, year_text, level_text, paragraphs, bullet, full_link, page_link]

In [3]:
# Function to click CoveoLink and return to main page
def process_coveo_link(driver, link, csv_writer):
    """Open ``link`` in a new tab, scrape it, write the row, and return to the starting tab.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session; its current window is restored before returning.
    link : str
        URL of the result page to open.
    csv_writer : object with ``writerow``
        Receives the list returned by ``scrape_content``.

    Raises
    ------
    RuntimeError
        If the browser did not open a new tab, so that the starting window is
        never scraped or closed by mistake.
    """
    origin_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)

    # Pass the URL as a script argument instead of splicing it into the
    # JavaScript source, so quotes in the URL cannot break or inject script.
    driver.execute_script("window.open(arguments[0], '_blank');", link)

    opened = [handle for handle in driver.window_handles if handle not in handles_before]
    if not opened:
        raise RuntimeError(f"No new tab was opened for {link!r}")

    driver.switch_to.window(opened[0])
    try:
        # Scrape the new tab and store the row.
        csv_writer.writerow(scrape_content(driver, link))
    finally:
        # Always close the tab and go back, even if scraping failed.
        driver.close()
        driver.switch_to.window(origin_handle)


In [4]:
# Function to change pagination using CoveoPager
def change_pagination(driver, page_number):
    """Click the pager link whose visible text is ``page_number``.

    Parameters
    ----------
    driver : selenium WebDriver
        Session showing a results page that contains the pager element.
    page_number : int or str
        Page label to click; converted with ``str`` before matching link text.
    """
    # "CoveoPager" is looked up as a class name, matching how this notebook
    # locates "CoveoResultLink"; the find_element_by_* helpers were removed in
    # Selenium 4, so use find_element with a By locator.
    pager = driver.find_element(By.CLASS_NAME, 'CoveoPager')
    pager.find_element(By.LINK_TEXT, str(page_number)).click()


In [5]:
def main():
    """Scrape every result link from three result pages into ``content.csv``.

    Starts a Chrome session, loads the results listing at offsets 0, 100 and
    200 (100 results per page), opens each ``CoveoResultLink`` via
    ``process_coveo_link`` and writes one CSV row per link. The browser is
    always shut down, even if scraping fails part-way through.
    """
    driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
    try:
        time.sleep(5)

        # Open the output once in "w" mode: this starts from an empty file and
        # keeps a single writer for the whole run instead of reopening per link.
        with open("content.csv", "w", newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)

            for offset in [0, 100, 200]:
                main_frame = f'https://www.cfainstitute.org/membership/professional-development/refresher-readings#first={offset}&sort=@refreadingcurriculumyeardescending&numberOfResults=100'
                driver.get(main_frame)
                # Only the URL fragment differs between offsets.
                # NOTE(review): presumably the fixed pause gives the listing
                # time to refresh before the presence check below - confirm.
                time.sleep(5)

                # Wait for CoveoLinks to load
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "CoveoResultLink")))

                # Find all CoveoLinks that actually carry an href
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                links = soup.find_all('a', class_='CoveoResultLink', href=True)

                # Extract the 'href' attribute from each link
                coveo_links = [link['href'] for link in links]
                print(len(coveo_links))     # To check no. of links extracted from the page

                for link in coveo_links:
                    process_coveo_link(driver, link, csv_writer)
                    # Flush so rows already scraped are on disk if a later link fails.
                    csvfile.flush()

                    # Wait for some time to simulate human-like behavior
                    time.sleep(2)
    finally:
        # Release the browser process regardless of how the run ended.
        driver.quit()

In [6]:


 
# Entry point: call main() only when this code runs as the top-level script
# (or notebook cell), not when it is imported as a module.


if __name__ == "__main__":
    main()

100
Time-Series Analysis
Credit Analysis Models
Introduction to Alternative Investments
Credit Default Swaps
Valuation of Contingent Claims
Introduction to Commodities and Commodity Derivatives
Understanding Income Statements
Pricing and Valuation of Forward Commitments
Private Equity Investments
Valuation and Analysis of Bonds with Embedded Options
Private Company Valuation
Market-Based Valuation: Price and Enterprise Value Multiples
Machine Learning
Equity Valuation: Applications and Processes
Employee Compensation: Post-Employment and Share-Based
Economic Growth
Multinational Operations
Intercorporate Investments
Evaluating Quality of Financial Reports
Discounted Dividend Valuation
Residual Income Valuation
Free Cash Flow Valuation
Currency Exchange Rates: Understanding Equilibrium Value
Economics of Regulation
Integration of Financial Statement Analysis Techniques
The Term Structure and Interest Rate Dynamics
Environmental, Social, and Governance (ESG) Considerations in Investment 

Corporate Governance: Conflicts, Mechanisms, Risks, and Benefits
Fixed-Income Cash Flows and Types
Private Capital, Real Estate, Infrastructure, Natural Resources, and Hedge Funds
Extensions of Multiple Regression
Pricing and Valuation of Forward Contracts and for an Underlying with Varying Maturities​
Option Replication Using Put-Call Parity​
