# Automating Departments

In [1]:
%run ../src/DepartmentScraper.py
%run ../src/CourseScraper.py

In [15]:
import pandas as pd
from multiprocessing import Pool
from os import listdir

In [5]:
# Presumably the worker count for the imported multiprocessing Pool; no cell
# visible in this notebook reads it.
num_cores = 4
# Path prefix under which each department's '<slug>.json' file is written.
base_f = '../data/courses/'

# Department table: the cells below read its 'college', 'department' and
# 'href' columns.
departments = pd.read_csv('../data/departments.csv')
departments.head()

Unnamed: 0,college,department,href
0,College of Liberal Arts and Sciences,College of Liberal Arts and Sciences Non-depar...,https://myui.uiowa.edu/my-ui/courses/by-depart...
1,College of Liberal Arts and Sciences,African American Studies,https://myui.uiowa.edu/my-ui/courses/by-depart...
2,College of Liberal Arts and Sciences,American Indian and Native Studies Program,https://myui.uiowa.edu/my-ui/courses/by-depart...
3,College of Liberal Arts and Sciences,American Studies,https://myui.uiowa.edu/my-ui/courses/by-depart...
4,College of Liberal Arts and Sciences,Anthropology,https://myui.uiowa.edu/my-ui/courses/by-depart...


In [4]:
# Colleges whose departments are scraped by the cells below.
list_colleges = [
    'College of Liberal Arts and Sciences',
    'Tippie College of Business',
    'College of Engineering',
    'University College',
    'College of Education',
]


def to_scrape(x):
    """Return True when the college name `x` is listed in `list_colleges`."""
    return x in list_colleges


# Rows of `departments` whose college is one of the selected colleges
# (the vectorised form of applying `to_scrape` to every row).
colleges_to_scrape = departments[departments['college'].isin(list_colleges)]

---

**Example**

In [None]:
department_scraper = DepartmentScraper()

# Try the scraper on one department only: the row labelled 1 in `departments`.
department_scraper.start_scrape(departments.loc[1, 'href'])

In [None]:
# Hand the department scraper's class_pages to CourseScraper.
# NOTE(review): class_pages is presumably filled by start_scrape() -- confirm
# in ../src/DepartmentScraper.py.
course_scraper = CourseScraper(department_scraper.class_pages)

In [None]:
# Quick look at the example result: column names first, then the leading rows.
df = course_scraper.courses_df

print (df.columns)
df.head()

In [None]:
# Save the example frame with orient='records', lines=True, i.e. one JSON
# object per row, one row per line.
f_out = '../data/raw-courses.json'
df.to_json(f_out, orient='records', lines=True)

**Full run: scrape every department of the selected colleges**

In [None]:
# Scrape every department of every selected college and write one JSON Lines
# file per department under base_f.
for college in list_colleges:

    print ("Starting {} Departments.".format(college))
    college_deps = departments[departments['college'] == college]

    # to_records() yields one record per row. Fields are read by name so the
    # loop does not depend on the position of the columns in the record.
    for dep in college_deps.to_records():
        try:
            # Turn the department name into a file-name slug. "/" has to be
            # removed too (as process_department does); otherwise it acts as a
            # path separator and dep_file points into a sub-directory.
            dep_name = (dep['department'].lower()
                        .replace('&', '')
                        .replace("'", "")
                        .replace("/", "")
                        .replace(' ', '-'))
            dep_file = base_f + dep_name + '.json'
            dep_link = dep['href']

            dep_scraper = DepartmentScraper()
            print ("\tScraping {} classes".format(dep_name))
            dep_scraper.start_scrape(dep_link)

            print ("\tDone scraping classes. extracting now")
            dep_courses = CourseScraper(dep_scraper.class_pages)
            dep_courses.courses_df.to_json(dep_file, orient='records', lines=True)
        except Exception as e:
            # Best effort: report the failing department and carry on with
            # the next one instead of aborting the whole run.
            print (e, dep)

In [19]:
# Run process_department on every department row, one college at a time.
# process_department is defined in a later cell of this file, so that cell
# has to be executed before this one.
for college in list_colleges:
    print ("Starting {} Departments.".format(college))
    college_deps = departments.loc[departments['college'] == college]

    # The function already takes a row, so it is passed to apply directly.
    college_deps.apply(process_department, axis=1)

Starting College of Liberal Arts and Sciences Departments.
	Scraping college-of-liberal-arts-and-sciences-non-departmental-courses classes
'NoneType' object has no attribute 'text' college                    College of Liberal Arts and Sciences
department    College of Liberal Arts and Sciences Non-depar...
href          https://myui.uiowa.edu/my-ui/courses/by-depart...
Name: 0, dtype: object
	Skipping african-american-studies
	Skipping american-indian-and-native-studies-program
	Skipping american-studies
	Skipping anthropology
	Skipping arabic-language-and-literature
	Skipping school-of-art-and-art-history
	Skipping asian--slavic-languages-and-literatures
	Skipping american-sign-language
	Skipping aging-and-longevity-studies-program
	Skipping biology
	Skipping critical-cultural-competence-certificate-program
	Skipping cinematic-arts
	Skipping chemistry
	Skipping clas-non-departmental
	Skipping classics
	Skipping communication-studies
	Skipping computer-science
	Skipping creative-writi

'NoneType' object has no attribute 'text' college                                      University College
department                                  AGEP Summer Program
href          https://myui.uiowa.edu/my-ui/courses/by-depart...
Name: 167, dtype: object
	Skipping belin-blank-center-for-gifted-education
	Skipping career-center-programs
	Scraping center-for-diversity-and-enrichment classes
'NoneType' object has no attribute 'text' college                                      University College
department                  Center for Diversity and Enrichment
href          https://myui.uiowa.edu/my-ui/courses/by-depart...
Name: 170, dtype: object
	Skipping college-success-initiatives
	Scraping first-year-programs classes
'NoneType' object has no attribute 'text' college                                      University College
department                                  First-Year Programs
href          https://myui.uiowa.edu/my-ui/courses/by-depart...
Name: 172, dtype: object
	Scraping 

In [17]:
def process_department(dep):
    """Scrape one department and write its courses to a JSON Lines file.

    dep: row-like object that is read through dep['department'] and
    dep['href'].

    The output path is base_f + <slug> + '.json', where the slug is the
    lower-cased department name with "&", "'" and "/" removed and every
    space turned into "-". When a file with that name is already listed in
    base_f the department is skipped. Any exception raised along the way is
    printed together with the row instead of being propagated.

    Returns None in every case.
    """
    try:
        slug = dep['department'].lower()
        for old, new in (('&', ''), ("'", ""), ("/", ""), (' ', '-')):
            slug = slug.replace(old, new)
        out_path = base_f + slug + '.json'
        url = dep['href']

        # A file from an earlier run is already there: do not scrape again.
        if out_path.split('/')[-1] in listdir(base_f):
            print ("\tSkipping {}".format(slug))
            return

        scraper = DepartmentScraper()
        print ("\tScraping {} classes".format(slug))
        scraper.start_scrape(url)

        print ("\tDone scraping classes. extracting now")
        extracted = CourseScraper(scraper.class_pages)
        extracted.courses_df.to_json(out_path, orient='records', lines=True)
    except Exception as err:
        # Best effort: report the failing row and let the caller continue.
        print (err, dep)