# Scraping Programs to Create the `Description of Program` variable - USA

In [1]:
from scraper import Program

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Empty accumulator for the per-program metadata produced by the scraping cells.

# pgm_descr = pd.DataFrame(columns=['url', 'header', 'content'])
OTHER_DATA_COLUMNS = [
    'url',
    'header_tag',
    'header_names',
    'links_useful_area',
    'date_published',
    'date_last_modified',
]
pgm_other = pd.DataFrame(columns=OTHER_DATA_COLUMNS)

In [3]:
# Read the PPL workbook and keep only the two columns used for scraping:
# the page URL and the path of its useful area.
ppl = pd.read_excel("../data/PPL/PPL_USA.xlsx")
ppl_sub = ppl.loc[:, ['URL', 'Useful area']]
ppl_sub.head()

Unnamed: 0,URL,Useful area
0,https://www.seas.harvard.edu/applied-computati...,#node-11181 > div.grid-container
1,https://ed.stanford.edu/academics/masters/ed-d...,#content-wrapper
2,https://statistics.stanford.edu/academic-progr...,#content-wrapper
3,https://mitsloan.mit.edu/master-of-business-an...,#main-content
4,https://sps.columbia.edu/academics/masters/app...,#block-columbia-sps-content > article > div


In [4]:
# Optional: resume from the intermediate files saved by an earlier run (uncomment to use)
#pgm_descr = pd.read_csv('../data/intermediate/program_description_usa.csv')
#pgm_other = pd.read_csv('../data/intermediate/program_other_data_usa.csv')

In [5]:
# Scrape the non-content metadata for every program listed in `ppl_sub`.
#
# For each (URL, useful-area path) pair a `Program` is built with static=True and
# `extract_df(content=False)` is called; the resulting frames are appended to
# `pgm_other`. Rows whose scrape raises are reported and their positions are
# recorded in `problem_rows` so they can be handled by hand afterwards.

problem_rows = []
scraped_frames = []

# enumerate/zip iterates positionally, so `row` is the row position even if the
# frame's index is not the default 0..n-1.
for row, (url, uap) in enumerate(zip(ppl_sub['URL'], ppl_sub['Useful area'])):

    try:
        p = Program(
            url=url,
            driver_path='../chromedriver_win32/chromedriver.exe',
            useful_area_path=uap,
            static=True
        )

        scraped_frames.append(p.extract_df(content=False))

    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) still stops the loop.
        print(f'Data entry in row {row} needs to be checked')
        print(f'    {type(exc).__name__}: {exc}')
        problem_rows.append(row)

# Concatenate once, instead of re-copying the growing frame on every iteration.
pgm_other = pd.concat([pgm_other, *scraped_frames])

Data entry in row 15 needs to be checked


In [6]:
# Fixing the issue with row 15 and adding it in: the page is fetched by hand with
# `requests`, its content region is parsed, and the parsed element is handed to
# `Program.extract_df` through `pass_parse`.

row15_url = 'https://professional.uchicago.edu/find-your-fit/masters/master-science-analytics'

# NOTE(review): verify=False turns off TLS certificate verification; presumably a
# workaround for this site - confirm it is still required.
response = requests.get(
    row15_url,
    headers={'User-agent': 'DSE Scraper'},
    timeout=(10, 10),  # (connect, read) timeouts in seconds
    verify=False
)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')
content_regions = soup.find_all(class_='region-content')
if not content_regions:
    raise ValueError(f"No element with class 'region-content' found at {row15_url}")
soup_refined = content_regions[0]

p = Program(
    url=row15_url,
    driver_path='../chromedriver_win32/chromedriver.exe',
    useful_area_path='#mm-0 > div.responsive-menu-page-wrapper > section > div.region.region-content',
    static=True
)

t = p.extract_df(content=False, pass_parse=soup_refined)
pgm_other = pd.concat([pgm_other, t])

In [7]:
# The concatenations did not renumber rows, so each appended frame kept its own
# row labels; replace them with a fresh 0..n-1 index (old labels dropped) and
# preview the first rows.
pgm_other = pgm_other.reset_index(drop=True)
pgm_other.head()

Unnamed: 0,url,header_tag,header_names,links_useful_area,date_published,date_last_modified,tags,tag_freq,emphasized
0,https://www.seas.harvard.edu/applied-computati...,"[h1, h2]","[[Master's in Data Science\n], [Main navigatio...","[/computer-science, https://statistics.fas.har...",2021-01-01,2021-01-01,"[div, div, h1, span, div, div, div, p, span, a...","{'div': 7, 'h1': 1, 'span': 2, 'p': 11, 'a': 3...","[wednesday, october 13, master's in data scien..."
1,https://ed.stanford.edu/academics/masters/ed-d...,"[h2, h1, h3, h4]","[[You are here, Education data science at Stan...","[/, /academics/masters/data-science/program, /...",2020-06-05,2021-01-14,"[div, h2, ul, li, div, a, i, span, li, div, di...","{'div': 210, 'h2': 4, 'ul': 1, 'li': 2, 'a': 2...","[education data science (ms), carl wieman, tho..."
2,https://statistics.stanford.edu/academic-progr...,"[h1, h3, h4]","[[ M.S. in Statistics: Data Science ], [2019-2...",[https://statistics.stanford.edu/academics/ms-...,2019-01-01,2020-01-01,"[div, h1, div, div, div, div, article, span, d...","{'div': 11, 'h1': 1, 'article': 1, 'span': 2, ...","[practical component (3 units) optional, minim..."
3,https://mitsloan.mit.edu/master-of-business-an...,"[h1, h2, h5, h3, h4]","[[Master of Business Analytics], [Curriculum, ...","[https://www.facebook.com/MITSloanAdmissions/,...",2019-12-20,2019-12-20,"[div, div, div, div, div, div, section, figure...","{'div': 246, 'section': 16, 'figure': 43, 'spa...","[master of business analytics, patrick jaillet..."
4,https://sps.columbia.edu/academics/masters/app...,"[h1, h2, h3]","[[Applied Analytics], [Using analytics to make...","[https://apply.sps.columbia.edu/apply/, /acade...",2021-05-21,2021-05-21,"[div, div, section, div, article, span, pictur...","{'div': 81, 'section': 1, 'article': 8, 'span'...","[m.s. application requirements, featured facul..."


In [8]:
# Print column dtypes and non-null counts to spot columns with missing values.
pgm_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   url                 81 non-null     object
 1   header_tag          81 non-null     object
 2   header_names        81 non-null     object
 3   links_useful_area   81 non-null     object
 4   date_published      77 non-null     object
 5   date_last_modified  77 non-null     object
 6   tags                81 non-null     object
 7   tag_freq            81 non-null     object
 8   emphasized          81 non-null     object
dtypes: object(9)
memory usage: 5.8+ KB


In [9]:
# fix missing dates
# Fill only the two date columns, so that an unexpected gap in any other column
# stays visible instead of being overwritten with the placeholder.
pgm_other = pgm_other.fillna({
    'date_published': 'Not inferred',
    'date_last_modified': 'Not inferred',
})

In [11]:
# Write the intermediate results to disk; the row index is left out of the file.
#pgm_descr.to_csv('../data/intermediate/program_description_usa.csv', index=False)
other_data_csv = '../data/intermediate/program_other_data_usa.csv'
pgm_other.to_csv(path_or_buf=other_data_csv, index=False)