# Get the Data

### Import dependencies

In [79]:
from bs4 import BeautifulSoup
import requests
import re
import pdb

### Utility Functions

In [2]:
def parse_tag(page, name, class_name):
    """Return the text of the first matching tag, or ``None`` if there is none.

    The lookup is ``page.find(name, class_=class_name)``; ``class_name`` is
    forwarded unchanged, so callers may pass ``None`` for it.

    Args:
        page: Object exposing a BeautifulSoup-style ``find`` method.
        name: Tag name to look for (e.g. ``'h2'``).
        class_name: Value passed as the ``class_`` filter.

    Returns:
        ``get_text()`` of the tag found, or ``None`` when ``find`` comes
        back with a falsy result.
    """
    found = page.find(name, class_=class_name)
    if not found:
        return None
    return found.get_text()

In [3]:
def get_first_match(pattern, string):
    """Return the first ``re.findall`` result for ``pattern`` in ``string``.

    Args:
        pattern: Regular expression handed to ``re.findall``.
        string: Text to search; may be ``None`` or empty.

    Returns:
        The first element of ``re.findall(pattern, string)`` (a string, or a
        tuple of strings when the pattern has several groups), or ``None``
        when ``string`` is falsy or nothing matches.
    """
    if not string:
        return None
    return next(iter(re.findall(pattern, string)), None)

In [76]:
def crawl_course_page(url, output_dict):
    """Scrape one course page and store its details in ``output_dict``.

    The page at ``url`` is downloaded with ``requests`` and parsed with
    BeautifulSoup's ``lxml`` parser. The following pieces are extracted:

    * title: text of the ``h2`` tag with class ``course-title``
    * ``about``: text of the ``p`` tag with class ``course-description``
    * ``created_by``: taken from the ``div`` with class ``creator-names``
    * ``instructors``: one dict (``name``, ``position``, ``department``)
      per ``div`` with class ``instructor-info``
    * ``basic_info``: header -> value pairs from the rows of the ``table``
      with class ``basic-info-table``

    Elements that are absent from the page produce ``None`` values (or an
    empty list/dict) instead of raising.

    Args:
        url: URL of the course page to fetch.
        output_dict: Dict updated in place. The course title is used as the
            key (it is ``None`` when the title tag is missing), so a later
            page with the same title overwrites the earlier entry.

    Returns:
        None. The result is written into ``output_dict``.
    """
    course_page = BeautifulSoup(requests.get(url).text, "lxml")

    course_title = parse_tag(course_page, 'h2', 'course-title')
    about = parse_tag(course_page, 'p', 'course-description')
    # Capture what follows the first two "<any char><NBSP>" pairs in the
    # creator text.
    created_by = get_first_match(
        '.\xa0.\xa0(.+)', parse_tag(course_page, 'div', 'creator-names'))

    instructors = [
        {
            'name': parse_tag(element, 'a', None),
            # Greedy prefix: keeps only the text after the last ", ".
            'position': get_first_match(
                r'.+, (.+)', parse_tag(element, 'span', None)),
            'department': parse_tag(element, 'div', 'instructor-bio'),
        }
        for element in course_page.find_all('div', class_='instructor-info')
    ]

    info_table = course_page.find('table', class_='basic-info-table')
    info_rows = info_table.find_all('tr') if info_table else []
    basic_info = {}
    for row in info_rows:
        header = parse_tag(row, 'span', 'td-title')
        header_val = parse_tag(row, 'td', 'td-data')
        # Rows lacking either the title cell or the data cell are ignored.
        if not (header and header_val):
            continue
        if header == 'User Ratings':
            # Keep just the first "d.d" number found in the ratings cell.
            header = 'Average User Ratings'
            header_val = get_first_match(r'([0-9]\.[0-9])', header_val)
        basic_info[header] = header_val

    output_dict[course_title] = {
        'about': about,
        'created_by': created_by,
        'instructors': instructors,
        'basic_info': basic_info,
    }

In [77]:
def crawl_page(base_url, search_endpoint, search_term, page, output_dict):
    """Crawl every individual course linked from one search-results page.

    The results page is fetched from
    ``base_url + search_endpoint + search_term + page``. Each ``a`` tag with
    class ``rc-OfferingCard`` is inspected; links containing the word
    ``specialization`` are skipped, and for the rest the ``/learn/...``
    part of the href is appended to ``base_url`` and passed to
    ``crawl_course_page``.

    Args:
        base_url: Site root, e.g. ``"https://www.coursera.org"``.
        search_endpoint: Path and query prefix of the search page.
        search_term: Search query appended to the endpoint.
        page: Extra query-string suffix selecting the results page
            (may be an empty string).
        output_dict: Dict that ``crawl_course_page`` fills in place.

    Returns:
        None.
    """
    search_url = base_url + search_endpoint + search_term + page
    page_results = BeautifulSoup(requests.get(search_url).text, "lxml")
    for course in page_results.find_all('a', class_='rc-OfferingCard'):
        course_info_link = course.get('href')
        # ``get`` returns None for an anchor without an href; such a card
        # cannot be followed, so skip it instead of failing in the regex.
        if not course_info_link or 'specialization' in course_info_link:
            continue
        course_endpoint = get_first_match(r'(/learn/\S+)', course_info_link)
        if course_endpoint:
            crawl_course_page(base_url + course_endpoint, output_dict)

In [78]:
def crawl_courses(base_url, search_endpoint, search_term, num_pages, output_dict):
    """Crawl the first ``num_pages`` search-results pages via ``crawl_page``.

    Pages are addressed by a start offset that advances in steps of 20: the
    first page gets no suffix, later ones get ``'&start=<offset>'``.

    Args:
        base_url: Site root forwarded to ``crawl_page``.
        search_endpoint: Search path/query prefix forwarded to ``crawl_page``.
        search_term: Search query forwarded to ``crawl_page``.
        num_pages: Number of result pages to visit.
        output_dict: Dict forwarded to ``crawl_page`` and filled in place.

    Returns:
        None.
    """
    for offset in range(0, 20 * num_pages, 20):
        page_suffix = '&start=' + str(offset) if offset != 0 else ''
        crawl_page(base_url, search_endpoint, search_term, page_suffix, output_dict)

### Crawl the Course Pages (1 page)

In [73]:
# Search configuration: the search URL is built as base + endpoint + search_term.
base = 'https://www.coursera.org'
endpoint = '/courses?languages=en&query='
search_term = 'Software+Engineering'

In [74]:
# Scraped courses, keyed by course title; only the first results page is crawled.
courses = {}
crawl_courses(base, endpoint, search_term, num_pages=1, output_dict=courses)