In [3]:
import requests
import lxml.html as lx
import time
import re
import pandas as pd

## Functions to gather the course links using web scraping

In [146]:
# Root of the Coursera site; the '/courses' search path and the relative course
# hrefs found on result cards are appended to this string.
coursera_base_url = 'https://www.coursera.org'

# Next numeric course id to hand out. generate_course_id() reads and increments
# this module-level counter, so re-running this cell resets the sequence.
coursera_course_id = 100000

In [330]:
def generate_course_id():
    """Hand out the next numeric course id.

    Returns the current value of the module-level ``coursera_course_id`` counter
    and then advances that counter by one, so consecutive calls yield
    consecutive integers.
    """
    global coursera_course_id

    issued_id = coursera_course_id
    coursera_course_id = issued_id + 1
    return issued_id



def get_course_links( url, params ):
    """Fetch a single search-results page and extract its course links.

    The request goes to ``url + '/courses'`` with ``params`` as the query
    parameters (via ``get_http_response``), and the final request URL is printed.

    Returns the list built by ``get_url_and_reviews`` for every title anchor on
    the page, or an empty list if parsing or extraction raises (the exception is
    printed after the "No courses found: " message).
    """
    response = get_http_response( url + '/courses', params )
    print(response.request.url)

    title_link_xpath = "//li//a[contains(@class, 'cds-CommonCard-titleLink')]"

    try:
        page = lx.fromstring(response.text)
        return get_url_and_reviews(page.xpath(title_link_xpath))
    except Exception as e:
        print("No courses found: ")
        print(e)
        return []
    
    
    
def get_url_and_reviews( locators ):
    """Turn course title anchors into ``{'url', 'rating'}`` dicts.

    For each anchor element in ``locators``:
      * ``'url'`` is ``coursera_base_url`` followed by the anchor's ``href``;
      * ``'rating'`` is the list of text nodes found under the ratings div in
        the footer that follows the anchor's product-card header (empty when
        the card has no such div).

    Returns one dict per anchor, in input order.
    """
    # Walk up from the anchor to the card header, across to the sibling footer,
    # then down into the ratings container.
    ratings_xpath = (
        "ancestor::div[@class='cds-ProductCard-header']"
        "/following-sibling::div[@class='cds-ProductCard-footer']"
        "/div[@class='cds-CommonCard-ratings']//text()"
    )

    return [
        {
            'url': coursera_base_url + anchor.get("href"),
            'rating': anchor.xpath(ratings_xpath),
        }
        for anchor in locators
    ]

    

def get_all_course_links( url, query='web development' ):
    """Collect course links from every page of a Coursera course search.

    Parameters
    ----------
    url : str
        Site root; ``'/courses'`` is appended for the search request.
    query : str, optional
        Search phrase sent as the ``query`` parameter. Defaults to
        ``'web development'``, the value that was previously hard-coded.

    Returns
    -------
    list of dict
        ``{'url', 'rating'}`` entries (see ``get_url_and_reviews``) for the first
        page followed by those of pages 2..N as returned by ``get_course_links``.
        If the first page cannot be parsed the list is empty; if only the page
        count cannot be read, the first-page links are still returned.
    """
    params = {
        'query': query,
        'productTypeDescription': 'Courses',
        'page': 1
    }

    res = get_http_response( url + '/courses', params )

    all_course_links = []

    try:
        html = lx.fromstring(res.text)

        title_anchors = html.xpath("//li//a[contains(@class, 'cds-CommonCard-titleLink')]")
        all_course_links += get_url_and_reviews(title_anchors)
    except Exception as e:
        # Without a parsed first page there is no way to discover the page count,
        # so report the problem and return what we have (nothing) instead of
        # falling through to a NameError on an unbound num_pages.
        print(e)
        return all_course_links

    try:
        # The second-to-last pagination link carries the last page number
        # (the last link is presumably the "next" control -- TODO confirm).
        num_pages = int(html.xpath("(//div[contains(@class, 'pagination-controls-container')]//a)[last()-1]/text()")[0])
    except (IndexError, ValueError) as e:
        # Pagination missing or unreadable: keep the first-page links and
        # treat the result set as a single page.
        print(e)
        num_pages = 1

    for page_number in range(2, num_pages + 1):
        params['page'] = page_number
        all_course_links += get_course_links(url, params)

    return all_course_links


        

In [None]:
# Query parameters for one search-results page: the 'web development' course
# search, restricted to the 'Courses' product type, page 1.
params = dict(
    query='web development',
    productTypeDescription='Courses',
    page=1,
)

In [183]:
# Fetch and parse a single results page using the module-level `params`.
links = get_course_links( coursera_base_url, params )

In [332]:
# Display the scraped entries: a list of {'url', 'rating'} dicts.
links

[{'url': 'https://www.coursera.org/learn/html-css-javascript-for-web-developers',
  'rating': ['4.7', '(16K reviews)']},
 {'url': 'https://www.coursera.org/learn/introduction-to-web-development-with-html-css-javacript',
  'rating': ['4.6', '(1.9K reviews)']},
 {'url': 'https://www.coursera.org/learn/web-development',
  'rating': ['4.7', '(3.6K reviews)']},
 {'url': 'https://www.coursera.org/learn/introduction-to-front-end-development',
  'rating': ['4.8', '(10K reviews)']},
 {'url': 'https://www.coursera.org/learn/python-for-applied-data-science-ai',
  'rating': ['4.6', '(35K reviews)']},
 {'url': 'https://www.coursera.org/learn/programming-with-javascript',
  'rating': ['4.7', '(3.7K reviews)']},
 {'url': 'https://www.coursera.org/learn/getting-started-with-front-end-and-web-development',
  'rating': ['4.5', '(88 reviews)']},
 {'url': 'https://www.coursera.org/learn/developing-frontend-apps-with-react',
  'rating': ['4.3', '(363 reviews)']},
 {'url': 'https://www.coursera.org/learn/ht

In [333]:
# Rebind `links` to the entries gathered from every page of the search
# (the URL of each page after the first is printed as it is fetched).
links = get_all_course_links(coursera_base_url)

https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=2
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=3
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=4
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=5
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=6
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=7
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=8
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=9
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=10
https://www.coursera.org/courses?query=web+development&productTypeDescription=Courses&page=11
https://www.coursera.org/courses?query=web+development&productTypeDe

## Functions to gather all the details for each course from its URL and convert them into a consistent format (i.e. mapping levels to an index and durations to hours)

In [384]:

def get_coursera_course_info( course ):
    """Scrape one Coursera course page into a flat dict of normalized fields.

    Parameters
    ----------
    course : dict
        An entry as produced by ``get_url_and_reviews``: ``'url'`` is the course
        page URL and ``'rating'`` is the list of rating text nodes taken from the
        search card (e.g. ``['4.7', '(16K reviews)']``), possibly empty.

    Returns
    -------
    dict
        Keys, in insertion order:
        ``course_title`` (str, ``''`` if the page has no ``<h1>``),
        ``course_id`` (``'ce_'`` + next id from ``generate_course_id``),
        ``course_url``,
        ``course_instructor`` (str, ``''`` if not found),
        ``course_rating`` (float or None),
        ``course_no_of_reviews`` (int or None),
        ``course_duration`` (float, 0.0 if unknown),
        ``course_level`` (0 unknown, 1 beginner, 2 intermediate, 3 advanced),
        ``course_no_of_enrolled`` (int, 0 if missing or unparseable),
        ``course_details`` (str).

    Missing or malformed page fragments are reported with ``print`` and replaced
    by the defaults above rather than raising, so one odd page does not abort a
    long scraping run.
    """
    res = get_http_response( course['url'] )
    html = lx.fromstring(res.text)

    details = {}

    try:
        details['course_title'] = html.xpath("//h1/text()")[0]
    except IndexError as e:
        print(course['url'] + ': --> ')
        print(e)
        details['course_title'] = ''

    details['course_id'] = 'ce_' + str(generate_course_id())
    details['course_url'] = course['url']

    try:
        details['course_instructor'] = html.xpath("//div/a[starts-with(@href, '/instructor')]/span/text()")[0]
    except IndexError as e:
        print(course['url'] + ': --> ')
        print(e)
        details['course_instructor'] = ''

    rating, review_count = _parse_card_rating(course.get('rating') or [])
    details['course_rating'] = rating
    details['course_no_of_reviews'] = review_count

    # NOTE(review): this exact class string looks auto-generated and is likely to
    # change between site builds -- confirm it still matches before a re-run.
    summary_texts = html.xpath("//section//div[@class = 'cds-119 cds-Typography-base css-h1jogs cds-121']/text()")
    details['course_duration'] = _parse_duration_hours(summary_texts)
    details['course_level'] = _parse_level(summary_texts)

    # Default first so the key keeps its position even when parsing fails.
    details['course_no_of_enrolled'] = 0
    enrolled_texts = html.xpath("//div/p/span/strong/span/text()")
    if enrolled_texts:
        try:
            details['course_no_of_enrolled'] = int(enrolled_texts[0].replace(',', ''))
        except ValueError as e:
            # The XPath is generic and sometimes matches unrelated text.
            print(course['url'] + ': --> ')
            print(e)

    description_parts = []
    about_sections = html.xpath("//div[@id='about']")
    if about_sections:
        description_parts = about_sections[0].xpath(
            "descendant::div//ul//p//text()|descendant::div//ul//a/text()"
        )
    description_parts += html.xpath("//div[@id='modules']//span/h3/text()")

    # '\xa0' is a non-breaking space; normalize it to a plain space.
    details['course_details'] = " ".join(description_parts).replace('\xa0', ' ')

    return details


def _parse_card_rating( rating_texts ):
    """Return ``(rating, review_count)`` parsed from search-card rating texts.

    ``rating_texts[0]`` is the numeric rating and ``rating_texts[1]`` the review
    blurb such as ``'(16K reviews)'``. Either value is None when it is absent
    or cannot be parsed.
    """
    if not rating_texts:
        return None, None

    try:
        rating = float(rating_texts[0])
    except (TypeError, ValueError):
        rating = None

    review_count = None
    if len(rating_texts) > 1:
        # Number with an optional K/M suffix directly attached, e.g. '1.9K' or '88'.
        found = re.search(r"(\d+(?:\.\d+)?)([KkMm]?)\b", rating_texts[1].replace(',', ''))
        if found:
            scale = {'': 1, 'k': 1000, 'm': 1000000}[found.group(2).lower()]
            # round() rather than int(): float products like 1.1 * 1000 can land
            # a hair off the exact integer, and int() would truncate.
            review_count = round(float(found.group(1)) * scale)

    return rating, review_count


def _parse_duration_hours( texts ):
    """Return the first '<N> hour(s)' figure found in ``texts`` as a float, else 0.0."""
    for text in texts:
        found = re.search(r"(\d+)\s+hours?\b", text, flags=re.IGNORECASE)
        if found:
            return float(found.group(1))
    return 0.0


def _parse_level( texts ):
    """Map the first '<name> level' phrase in ``texts`` to 1/2/3, or 0 if none is found."""
    level_codes = {'beginner': 1, 'intermediate': 2, 'advanced': 3}
    for text in texts:
        found = re.search(r"\b(beginner|intermediate|advanced) level", text, flags=re.IGNORECASE)
        if found:
            # Stop at the first recognized level so later text cannot overwrite it.
            return level_codes[found.group(1).lower()]
    return 0
    
    

In [387]:
# Spot-check the scraper on a single entry of `links` (index 91).
# NOTE(review): this call also consumes one id from generate_course_id().
res = get_coursera_course_info(links[91])

In [386]:
# Scrape every course page in the order of `links`; one info dict per entry.
all_course_data = list(map(get_coursera_course_info, links))

https://www.coursera.org/learn/software-architecture-for-big-data-applications: --> 
invalid literal for int() with base 10: ' You may not earn credit for more than one version of a cross-listed course.'
https://www.coursera.org/learn/data-governance-databricks: --> 
invalid literal for int() with base 10: ' '
https://www.coursera.org/learn/programming-with-javascript-es: --> 
invalid literal for int() with base 10: 'None!'
https://www.coursera.org/learn/introduction-to-version-control-es: --> 
invalid literal for int() with base 10: 'None!'
https://www.coursera.org/learn/introduction-to-back-end-development-es: --> 
invalid literal for int() with base 10: 'None!'
https://www.coursera.org/learn/html-and-css-in-depth-es: --> 
invalid literal for int() with base 10: 'None!'
https://www.coursera.org/learn/react-basics-es: --> 
invalid literal for int() with base 10: 'None!'
https://www.coursera.org/learn/meta-programming-fundamentals-kotlin-es: --> 
invalid literal for int() with base 10:

In [388]:
# One row per scraped course, built from the list of info dicts in all_course_data.
coursera_df = pd.DataFrame(all_course_data)

In [389]:
# Preview the first rows of the scraped table.
coursera_df.head()

Unnamed: 0,course_title,course_id,course_url,course_instructor,course_rating,course_no_of_reviews,course_duration,course_level,course_no_of_enrolled,course_details
0,"HTML, CSS, and Javascript for Web Developers",ce_101040,https://www.coursera.org/learn/html-css-javasc...,Yaakov Chaikin,4.7,16000.0,40.0,0,1088947,HTML JavaScript Css Frameworks Cascading Style...
1,"Introduction to Web Development with HTML, CSS...",ce_101041,https://www.coursera.org/learn/introduction-to...,Upkar Lidder,4.6,1900.0,12.0,1,133525,Describe the Web Application Development Ecosy...
2,Introduction to Web Development,ce_101042,https://www.coursera.org/learn/web-development,William Mead,4.7,3600.0,10.0,1,235762,Css Code HTML Web Development Cascading Style ...
3,Introduction to Front-End Development,ce_101043,https://www.coursera.org/learn/introduction-to...,Taught by Meta Staff,4.8,10000.0,18.0,1,302618,"Distinguish between front-end, back-end, and f..."
4,"Python for Data Science, AI & Development",ce_101044,https://www.coursera.org/learn/python-for-appl...,Joseph Santarcangelo,4.6,35000.0,26.0,1,776135,Learn Python - the most popular programming la...


In [390]:
# Persist the scraped table as CSV text in the working directory.
# NOTE(review): the path has no '.csv' extension and the row index is written
# as the first column (to_csv default) -- confirm both are intended.
coursera_df.to_csv('coursera_course_all_info')