In [1]:
# Native imports
import sys
import re
import os
import os.path
import json
from pprint import pprint
import urllib.request
import urllib.parse

# 3rd-party scraping/parsing imports
import requests
from bs4 import BeautifulSoup
import dateutil.parser as dt
import demjson

# 3rd-party data science imports
import pandas as pd
import seaborn as sns

# Plot configuration: raise matplotlib's default figure size to 8x8 (inches)
# so that any figure drawn later in the notebook renders bigger.
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 8)

In [2]:
def fetch_html_page(url, timeout=30):
    """Download ``url`` and return the response body as stripped text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        The decoded response body with leading and trailing whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of handing the error page's
    # HTML to the parser, where the failure would surface much later.
    response.raise_for_status()
    return response.text.strip()

In [3]:
# Degree-program listing pages on guide.berkeley.edu that this notebook scrapes.
UNDERGRAD_PROGRAMS_URL = 'http://guide.berkeley.edu/undergraduate/degree-programs/'
# NOTE(review): not referenced by any cell visible here — presumably used by a
# graduate-programs section further down; confirm or remove.
GRAD_PROGRAMS_URL = 'http://guide.berkeley.edu/graduate/degree-programs/'

## Undergraduate Programs

In [4]:
# Download the undergraduate degree-programs page and parse it with Python's
# built-in 'html.parser' backend; later cells read everything from `soup`.
html = fetch_html_page(UNDERGRAD_PROGRAMS_URL)
soup = BeautifulSoup(html, 'html.parser')

In [5]:
def fetch_all_programs(soup):
    """Extract the program records embedded in the page's ``allProgData`` script.

    Scans the ``<script>`` tags for the first one whose leading text node
    mentions ``allProgData``, decodes the object literal assigned there and
    returns its values with the ``id`` and ``url`` fields removed.

    Args:
        soup: Parsed page exposing ``find_all('script')`` (a BeautifulSoup tree).

    Returns:
        A list of program dicts, or ``None`` when no matching script exists.
    """
    for element in soup.find_all('script'):
        if not element.contents or 'allProgData' not in element.contents[0]:
            continue

        # Split on the FIRST ' = ' only, so the same character sequence
        # appearing inside the payload cannot truncate the literal.
        json_str = element.contents[0].split(' = ', 1)[1]
        # Remove the trailing statement terminator only when it is actually
        # there; blindly slicing off the last character would eat the closing
        # brace of a script that has no ';'.
        json_str = json_str.strip().rstrip(';').rstrip()

        # demjson is used rather than json, presumably because the payload is
        # a JavaScript object literal and not strict JSON — TODO confirm.
        raw_program_data = demjson.decode(json_str)
        processed_program_data = list(raw_program_data.values())
        for program in processed_program_data:
            # pop() tolerates records that lack one of these fields.
            program.pop('id', None)
            program.pop('url', None)

        return processed_program_data

    # No script mentioned allProgData.
    return None
        
def fetch_filter_info(soup, debug=False):
    """Collect the filter labels of every ``div.filter-group`` on the page.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup tree).
        debug: When true, print each category and its filters while scanning.

    Returns:
        Dict mapping the text of the node right before each filter group to a
        dict of ``{label 'for' attribute: stripped label text}``.
    """
    categories = {}

    for group in soup.find_all('div', class_='filter-group'):
        # The category title is taken from the node immediately preceding the group.
        category_name = group.previous_sibling.get_text()
        if debug:
            print('Filter Category:', category_name)

        # Start a fresh mapping for this category and keep a direct handle on it.
        labels_by_key = categories[category_name] = {}

        for label in group.find_all('label'):
            internal_name, display_name = label.attrs['for'], label.get_text().strip()
            if debug:
                print(f'  {display_name} ({internal_name})')
            labels_by_key[internal_name] = display_name

    return categories

def synthesize_program_dataset_undergrad(programs, filters, keep_major_minor_only=True):
    """Group program names by the filters of the first filter category.

    Args:
        programs: Iterable of program dicts; each needs a ``'name'`` entry and
            a ``'filters'`` container that supports ``in`` on filter keys.
        filters: Mapping of category -> ``{internal filter key: display name}``.
            Only the first category (in insertion order) is consulted.
        keep_major_minor_only: When true, keep only the filters whose
            lower-cased display name is ``'majors'`` or ``'minors'``.

    Returns:
        Dict mapping each lower-cased display name to the list of matching
        program names, in the order the programs were given. Empty when
        ``filters`` has no categories or nothing matches.
    """
    # An empty mapping has no first category; treat that as "no filters"
    # instead of raising IndexError.
    selected_filter_cat = next(iter(filters.values()), {})

    # Lower-casing and the majors/minors restriction do not depend on the
    # program, so resolve them once rather than once per program.
    wanted_filters = []
    for filter_key, filter_name in selected_filter_cat.items():
        filter_name = filter_name.lower()
        if keep_major_minor_only and filter_name not in ('majors', 'minors'):
            continue
        wanted_filters.append((filter_key, filter_name))

    program_dataset = {}
    for program in programs:
        for filter_key, filter_name in wanted_filters:
            if filter_key in program['filters']:
                program_dataset.setdefault(filter_name, []).append(program['name'])

    return program_dataset

# Run the extraction pipeline on the parsed undergraduate page (`soup`):
# embedded program records, then the page's filter labels, then the grouping.
all_programs = fetch_all_programs(soup)
all_filters = fetch_filter_info(soup)
# keep_major_minor_only defaults to True, so the result holds at most the
# 'majors' and 'minors' keys.
final_dataset = synthesize_program_dataset_undergrad(all_programs, all_filters)

In [6]:
# Wrap every program name in a small document with a sequential integer `_id`
# (presumably the shape expected by a MongoDB import, going by the names).
majors_list_mongo = [
    {'_id': position, 'major': program_name}
    for position, program_name in enumerate(final_dataset['majors'])
]
minors_list_mongo = [
    {'_id': position, 'minor': program_name}
    for position, program_name in enumerate(final_dataset['minors'])
]

In [9]:
# Create the output folder first: open(..., 'w') does not create missing parent
# directories, so on a fresh checkout the export would otherwise fail with
# FileNotFoundError and break Restart & Run All.
os.makedirs('exported', exist_ok=True)

# Explicit encoding keeps the files identical regardless of the platform's
# default locale encoding.
with open('exported/majors.json', 'w', encoding='utf-8') as fp:
    json.dump(majors_list_mongo, fp)

with open('exported/minors.json', 'w', encoding='utf-8') as fp:
    json.dump(minors_list_mongo, fp)