# Coursera Scraper

Scrape course details (type, title, rating, link, skills, and related occupations) from Coursera's course search pages.

In [9]:
# Import libraries
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver  # to autonomically open the website
import time
import re
from bs4 import BeautifulSoup as soup
import csv

In [64]:
# Build the Coursera search URLs for the range of result pages to crawl.
# Coursera can only show 1000 courses per search, so the crawl is separated
# by difficulty level and each level gets its own list of page URLs.
_COURSERA_SEARCH_URL = (
    'https://www.coursera.org/courses?query='
    '&indices%5Bprod_all_products%5D%5BrefinementList%5D%5Blanguage%5D%5B0%5D=English'
    '&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BproductDifficultyLevel%5D%5B0%5D={level}'
    '&indices%5Bprod_all_products%5D%5BrefinementList%5D%5BentityTypeDescription%5D%5B0%5D=Courses'
    '&indices%5Bprod_all_products%5D%5Bpage%5D={page}'
    '&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BclickAnalytics%5D=true'
    '&indices%5Bprod_all_products%5D%5Bconfigure%5D%5BhitsPerPage%5D=10'
    '&configure%5BclickAnalytics%5D=true'
)


def _coursera_urls(level, last_page):
    """
    Return the search URLs for pages 1..last_page of one difficulty level.

    level     - value of the productDifficultyLevel filter, e.g. 'Beginner'.
    last_page - number of the last result page to include (inclusive).
    """
    # The template contains no literal braces (brackets are percent-encoded),
    # so str.format only touches the two placeholders.
    return [
        _COURSERA_SEARCH_URL.format(level=level, page=page)
        for page in range(1, last_page + 1)
    ]


urls_beginner = _coursera_urls('Beginner', 100)
urls_mixed = _coursera_urls('Mixed', 91)
urls_intermediate = _coursera_urls('Intermediate', 63)
urls_advanced = _coursera_urls('Advanced', 11)

In [54]:
def scrap_coursera(csvfile='coursera.csv', urls=None):
    """
    Scrape Coursera listing pages and save one CSV row per course.

    csvfile - path of the CSV file to write (an existing file is replaced).
    urls    - iterable of listing-page URLs to crawl. None (the default)
              crawls nothing, so only the header row is written.

    For every course found on a listing page, the course's own page is
    fetched to collect its skills and related occupations. A course that
    lacks an expected element (link, title or rating) raises AttributeError
    internally; the message is printed and the course is skipped.
    """
    if urls is None:
        urls = []

    # newline='' is what the csv module expects of the file object; without
    # it extra blank rows appear on platforms that translate line endings.
    with open(csvfile, 'w', newline='', encoding='utf-8') as coursera:
        course_writer = csv.writer(coursera, delimiter=',')
        course_writer.writerow([
            'course_type', 'course_title', 'course_rating', 'course_link',
            'course_skill', 'course_job'
        ])  # Generate the header of csvfile.

        # Loop through all the listing pages.
        for url in urls:
            listing_page = get_site_file(url)
            if listing_page is None:
                # Nothing was fetched for this listing page; move on to the
                # next one instead of failing on None.find_all.
                continue

            # Each search hit is rendered as one <li> item.
            courses = listing_page.find_all(
                'li', {'class': 'ais-InfiniteHits-item'})
            for course in courses:
                try:
                    course_tag = course.a.get('href')
                    course_type = get_course_type(course_tag)
                    course_title = course.h2.get_text()
                    course_rating = course.find('span', {
                        'class': 'ratings-text'
                    }).get_text()
                    course_link = 'https://www.coursera.org%s' % course_tag
                    # Follow course_link to get more course information.
                    # Kept in its own variable so the listing page is not
                    # shadowed by the course page.
                    course_page = get_site_file(url=course_link)
                    course_skill = get_course_skill(page_content=course_page)
                    course_job = get_course_job(course_page)
                    course_job = retry_course_job(times=3,
                                                  course_job=course_job)
                    # Write into the file
                    course_writer.writerow([
                        course_type, course_title, course_rating, course_link,
                        course_skill, course_job
                    ])
                except AttributeError as e:
                    print(e)

In [55]:
def get_site_file(url,
                  driver_path=r'/home/xinda/insight/notebooks/geckodriver'):
    """
    Load url in a fresh Firefox session and return the parsed page.

    url         - base url to access desired web file
    driver_path - path of the geckodriver executable used to start Firefox.

    Returns the BeautifulSoup document built from the browser's page source,
    or None when the browser could not be started or the page could not be
    loaded (the error is printed).
    """
    # Selenium reports its failures as WebDriverException; imported locally
    # so this function does not rely on an extra top-of-file import.
    from selenium.common.exceptions import WebDriverException

    try:
        driver = webdriver.Firefox(executable_path=driver_path)
    except WebDriverException as error:
        print(error)
        return None

    try:
        driver.get(url)
        time.sleep(2)  # 2 sec was set to wait the response from Coursera.
        return soup(driver.page_source, 'html.parser')
    except WebDriverException as error:
        print(error)
        return None
    finally:
        # Always end the session, even when loading failed, so a browser is
        # not left running for every page that errors out.
        driver.quit()


def get_course_type(course_tag):
    """
    Classify a course link by the start of its path.

    Links starting with "/l" are 'Course', links starting with "/s" are
    'specializations', and anything else is 'professional-certificates'.
    """
    known_prefixes = (
        (r"/l", 'Course'),
        (r"/s", 'specializations'),
    )
    for prefix, label in known_prefixes:
        if course_tag.startswith(prefix):
            return label
    # Neither prefix matched.
    return 'professional-certificates'


def get_course_job(page_content):
    """
    Collect the text of every <li class="occupation-name"> element.

    page_content - parsed course page offering find_all().
    Returns a list of strings (empty when no such element exists).
    """
    occupation_items = page_content.find_all('li',
                                             {'class': 'occupation-name'})
    return [item.get_text() for item in occupation_items]


def get_course_skill(page_content):
    """
    Collect the text of every skill pill <span> on a course page.

    page_content - parsed course page offering find_all().
    Returns a list of strings (empty when no matching span exists).
    """
    skill_spans = page_content.find_all(
        'span', {'class': 'Pill_56iw91 m-r-1s m-b-1s'})
    return [span.get_text() for span in skill_spans]


def retry_course_job(times=3, course_job=None, page_content=None):
    """
    Read the occupations again while course_job is still empty.

    times        - maximum number of additional attempts.
    course_job   - occupations collected so far; None counts as empty.
    page_content - parsed course page to read again. When it is omitted
                   there is nothing to re-read, so course_job is returned
                   unchanged.

    Returns the (possibly refreshed) list of occupations.
    """
    if course_job is None:
        course_job = []
    if page_content is None:
        # No page was supplied; a retry is impossible.
        return course_job

    for _ in range(times):
        if course_job:
            # Occupations found; further attempts are unnecessary.
            break
        course_job = get_course_job(page_content)
    return course_job

In [65]:
# Crawl every difficulty level into its own CSV file, one level at a time.
for output_name, level_urls in (
        ('coursera_biginner.csv', urls_beginner),
        ('coursera_intermediate.csv', urls_intermediate),
        ('coursera_mixed.csv', urls_mixed),
        ('coursera_advanced.csv', urls_advanced),
):
    scrap_coursera(csvfile=output_name, urls=level_urls)

'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute

In [None]:
# Check the csvfile completeness: load the intermediate-level result file
# and print the DataFrame summary for inspection.
import pandas as pd
df = pd.read_csv('coursera_intermediate.csv')
df.info()