In [1]:
from __future__ import print_function, division
%pylab notebook
import sys
import bs4 as bs
import time
import pandas as pd
import dryscrape
import time
import json
from tqdm import tqdm

if 'linux' in sys.platform:
    # Start an Xvfb virtual display in case no X server is running.
    # Xvfb must be installed on the machine, otherwise this call won't work.
    dryscrape.start_xvfb()

Populating the interactive namespace from numpy and matplotlib


In [2]:
def get_page_session(url, wait=30):
    """Open ``url`` in a new dryscrape session and save a screenshot of it.

    Parameters
    ----------
    url : str
        Page to visit. The text after the last ``/`` is used as the
        screenshot file name.
    wait : int or float, optional
        Seconds to sleep after ``visit`` before the screenshot is taken.
        Previously this argument was accepted but ignored in favour of a
        hard-coded 30 second sleep; the default is now 30 so that calls
        which do not pass ``wait`` behave exactly as before.

    Returns
    -------
    The dryscrape session with the page loaded. The caller is responsible
    for calling ``reset()`` on it when done.
    """
    sess = dryscrape.Session()
    # NOTE(review): the very tall viewport is presumably there so the whole
    # page ends up in the rendered screenshot -- confirm.
    sess.set_viewport_size(width=1024, height=20000)
    sess.visit(url)
    time.sleep(wait)

    # Last path segment of the URL doubles as the screenshot name.
    name = url.rsplit('/', 1)[-1]
    sess.render('data/coursera/specializations/' + name + '.png')

    return sess

def click_buttons(sess, xpath):
    """Click every element matched by ``xpath``, stopping at the first failure.

    Parameters
    ----------
    sess : object
        Session-like object exposing ``xpath(query)``, which yields elements
        that have a ``click()`` method.
    xpath : str
        XPath expression selecting the elements to click.

    Clicking is best-effort: as soon as one ``click()`` raises an ordinary
    exception the remaining elements are left untouched and the function
    returns normally. ``KeyboardInterrupt`` and ``SystemExit`` are *not*
    swallowed, so a long-running scrape can still be interrupted.
    """
    for button in sess.xpath(xpath):
        try:
            button.click()
        except Exception:
            # A failed click ends the run; remaining buttons are skipped.
            break

def get_specialization_df(url):
    """Scrape one specialization page into a DataFrame with one row per course.

    Parameters
    ----------
    url : str
        URL of the specialization page.

    Returns
    -------
    pandas.DataFrame
        Columns ``course_num``, ``title``, ``about`` and ``syllabus``.
        ``course_num`` is the position of the course card among all matched
        ``rc-SingleCourse`` elements, so it can have gaps when cards with a
        repeated title are skipped. ``syllabus`` is an empty string when the
        card has no ``rc-Syllabus`` element. The frame is empty when no
        ``rc-SingleCourse`` element is found on the page.

    Raises
    ------
    AttributeError
        If a course card lacks a ``course-title`` or ``course-about`` div.
    """
    sess = get_page_session(url)
    try:
        # Expand all syllabus details so their text is part of the page body.
        click_buttons(sess, "//div[contains(@class, 'course-show-syllabus-text')]")

        time.sleep(5)
        soup = bs.BeautifulSoup(sess.body(), "lxml")
        courses = soup.find_all('div', attrs={'class': 'rc-SingleCourse'})
        data = list()
        titles = set()
        for idx, c in enumerate(courses):
            title = c.find('div', attrs={'class': 'course-title'}).getText(separator=u' ')
            # Only the first card for a given title is kept.
            if title in titles:
                continue
            titles.add(title)
            print(title)
            about = c.find('div', attrs={'class': 'course-about'}).getText(separator=u' ')
            # The syllabus is optional: test for the element explicitly
            # instead of hiding every possible error behind a bare except.
            syllabus_node = c.find('div', attrs={'class': 'rc-Syllabus'})
            if syllabus_node is None:
                syllabus = ''
            else:
                syllabus = syllabus_node.getText(separator=u' ')
            data.append((idx, title, about, syllabus))
    finally:
        # Reset the session even when scraping fails part-way through.
        sess.reset()
    return pd.DataFrame(columns=['course_num', 'title', 'about', 'syllabus'], data=data)

In [3]:
def get_specialization_links(url):
    """Collect the specialization URLs reachable from a browse page.

    The page is loaded, its N-th "see all" button is clicked and the
    resulting page body is stored; then the page is loaded again in a new
    session to click the next button. This repeats until a freshly loaded
    page has no button at the current index.

    Each stored body is then parsed for its ``application/ld+json`` script,
    and every ``itemListElement`` entry whose ``url`` starts with
    ``https://www.coursera.org/specializations/`` is collected.

    Parameters
    ----------
    url : str
        URL of the browse page.

    Returns
    -------
    set
        The distinct specialization URLs found.
    """
    button_xpath = "//button[contains(@class, 'primary see-all-button')]"
    spec_pages = list()
    current_idx = 0
    sess = get_page_session(url)
    try:
        while True:
            expand_buttons = list(sess.xpath(button_xpath))
            if current_idx >= len(expand_buttons):
                # No button left at this index: every listing was visited.
                break
            expand_buttons[current_idx].click()
            time.sleep(2)
            spec_pages.append(sess.body())
            current_idx += 1
            # Reset the used session before replacing it, as
            # get_specialization_df does, instead of abandoning it.
            sess.reset()
            sess = get_page_session(url)
    finally:
        sess.reset()

    spec_urls = set()
    for page_html in spec_pages:
        page_soup = bs.BeautifulSoup(page_html, "lxml")
        js_obj = json.loads(page_soup.find('script', attrs={'type': 'application/ld+json'}).getText())
        for item in js_obj['itemListElement']:
            # Separate name so the ``url`` argument is not overwritten.
            item_url = item['url']
            if item_url.startswith('https://www.coursera.org/specializations/'):
                spec_urls.add(item_url)
    return spec_urls

In [4]:
# Gather every specialization URL listed on the data-science browse page,
# then print them one per line under a "LINKS" header.
browse_url = 'https://www.coursera.org/browse/data-science'
spec_links = get_specialization_links(browse_url)

link_listing = "\n".join(spec_links)
print("LINKS\n" + link_listing)

LINKS
https://www.coursera.org/specializations/statistics
https://www.coursera.org/specializations/excel-mysql
https://www.coursera.org/specializations/robotics
https://www.coursera.org/specializations/social-science
https://www.coursera.org/specializations/bigdata
https://www.coursera.org/specializations/business-analytics
https://www.coursera.org/specializations/machine-learning
https://www.coursera.org/specializations/jhu-data-science


In [None]:
# Scrape each specialization, store its table as a pickle and a CSV, and
# accumulate the distinct course titles seen across all specializations.
courses = set()
for spec_url in tqdm(spec_links):
    sys.stdout.flush()
    # Text after the last '/' names the specialization and its output files.
    name = spec_url.rpartition('/')[2]
    print('process', name)
    print('-' * 80)
    df = get_specialization_df(spec_url)
    print(df.head(2))
    out_prefix = 'data/coursera/specializations/' + name
    df.to_pickle(out_prefix + '.df')
    df.to_csv(out_prefix + '.csv', encoding='utf-8', index=False)
    courses.update(df['title'])
    print('=' * 80)

  0%|          | 0/8 [00:00<?, ?it/s]

process statistics
--------------------------------------------------------------------------------


 12%|█▎        | 1/8 [00:46<05:25, 46.45s/it]

Empty DataFrame
Columns: [course_num, title, about, syllabus]
Index: []
process excel-mysql
--------------------------------------------------------------------------------


 25%|██▌       | 2/8 [01:24<04:22, 43.81s/it]

Empty DataFrame
Columns: [course_num, title, about, syllabus]
Index: []
process robotics
--------------------------------------------------------------------------------


 38%|███▊      | 3/8 [02:08<03:40, 44.10s/it]

Robotics: Aerial Robotics
Robotics: Computational Motion Planning
Robotics: Mobility
Robotics: Perception
Robotics: Estimation and Learning
Robotics: Capstone
   course_num                                    title  \
0           0                Robotics: Aerial Robotics   
1           1  Robotics: Computational Motion Planning   

                                               about  \
0  How can we create agile micro aerial vehicles ...   
1  Robotic systems typically include three compon...   

                                            syllabus  
0  WEEK 1 Introduction to Aerial Robotics Welcome...  
1  WEEK 1 Introduction and Graph-based Plan Metho...  
process social-science
--------------------------------------------------------------------------------


 50%|█████     | 4/8 [02:49<02:52, 43.02s/it]

Quantitative Methods
Qualitative Research Methods
Basic Statistics
Inferential Statistics
Methods and Statistics in Social Science - Final Research Project
   course_num                         title  \
0           0          Quantitative Methods   
1           1  Qualitative Research Methods   

                                               about  \
0  Discover the principles of solid scientific me...   
1  In this course you will be introduced to the b...   

                                            syllabus  
0  WEEK 1 Before we get started... In this first ...  
1  WEEK 1 Philosophy of Qualitative Research Welc...  
process bigdata
--------------------------------------------------------------------------------


 62%|██████▎   | 5/8 [03:33<02:09, 43.29s/it]

Introduction to Big Data
Big Data Modeling and Management Systems
Big Data Integration and Processing
Machine Learning With Big Data
Graph Analytics for Big Data
Big Data - Capstone Project
   course_num                                     title  \
0           0                  Introduction to Big Data   
1           1  Big Data Modeling and Management Systems   

                                               about  \
0  Interested in increasing your knowledge of the...   
1  Once you’ve identified a big data issue to ana...   

                                            syllabus  
0  WEEK 1 Welcome  Welcome to the Big Data Specia...  
1  WEEK 1 Introduction to Big Data Modeling and M...  
process business-analytics
--------------------------------------------------------------------------------


 75%|███████▌  | 6/8 [04:12<01:24, 42.09s/it]

Customer Analytics
Operations Analytics
People Analytics
Accounting Analytics
Business Analytics Capstone
   course_num                 title  \
0           0    Customer Analytics   
1           1  Operations Analytics   

                                               about  \
0  Data about our browsing and buying patterns ar...   
1  This course is designed to impact the way you ...   

                                            syllabus  
0  WEEK 1 Introduction to Customer Analytics What...  
1  WEEK 1 Introduction, Descriptive and Predictiv...  
process machine-learning
--------------------------------------------------------------------------------


In [None]:
# Summary: number of specialization links found and of distinct course titles.
n_specializations = len(spec_links)
n_courses = len(courses)
print('num specializations:', n_specializations)
print('overall courses:', n_courses)