# Hackathon

## Scraping a University Class

In [32]:
# get our department scraper to easily gather course urls
%run ../src/DepartmentScraper.py
%run ../src/departments.py

In [34]:
import pandas as pd

In [35]:
class CourseScraper:
    """Scrape course detail pages and collect the results in a DataFrame.

    Each link is expected to be a dict with the keys 'link' (page URL),
    'name' and 'type'; these are the keys scrape_course reads.
    """

    # Attributes kept for every course. A tuple (not a set) so that the key
    # order of each formatted course and the order of the missing-attribute
    # report are the same on every run.
    _ATTRS_KEEP = (
        'Course',
        'Minimum Fee Hours',
        'Corequisites',
        'Prerequisites',
        'Recommendations',
        'Requirements',
        'Restrictions',
        'Description',
        'Type',
    )

    def __init__(self, links=None):
        """Create the scraper and, if links are given, scrape them at once.

        Args:
            links: optional iterable of course link dicts. When None, nothing
                is scraped and courses_df stays an empty DataFrame.
        """
        # formatted course dicts, one per successfully scraped page
        self.courses = []
        # tabular view of self.courses, rebuilt by scrape_courses
        self.courses_df = pd.DataFrame()

        self.course_links = links
        if self.course_links is not None:
            self.scrape_courses(self.course_links)

    def format_course(self, course):
        """Return a copy of course restricted to the attributes we keep.

        Every kept attribute that is absent from course is reported on
        stdout as "<course name> <attribute>" and left out of the result.

        Args:
            course: dict of scraped attributes, as built by scrape_course.

        Returns:
            dict containing only the kept attributes present in course.
        """
        formatted_course = {}
        for key in self._ATTRS_KEEP:
            if key in course:
                formatted_course[key] = course[key]
            else:
                # .get() so that a missing 'Course' is reported like any
                # other attribute instead of raising KeyError
                print(course.get('Course'), key)

        return formatted_course

    def add_course(self, course):
        """Append one formatted course dict to self.courses."""
        self.courses.append(course)

    def scrape_courses(self, links):
        """Scrape every link, store the results and rebuild courses_df.

        Pages that could not be fetched (scrape_course returned None) are
        skipped; scrape_course has already reported the error.

        Args:
            links: iterable of course link dicts.
        """
        for course in links:
            course_details = self.scrape_course(course)
            if course_details is None:
                continue

            # keep only the attributes of interest
            self.add_course(self.format_course(course_details))

        # create dataframe
        self.courses_df = pd.DataFrame(self.courses)

    def scrape_course(self, course):
        """Fetch one course page and extract its labelled attributes.

        Args:
            course: dict with the keys 'link', 'name' and 'type'.

        Returns:
            dict mapping attribute label to its text, always including
            'Course' and 'Type', or None when the response status is not 200.
        """
        page = requests.get(course['link'])
        if page.status_code != 200:
            print("Error: {}".format(page.status_code))
            return None

        soup = BeautifulSoup(page.content, 'html.parser')

        # isolate the div with course information
        details_div = soup.find("div", {"class": "course-details"})

        # start from what the link itself already tells us
        details = {
            'Course': course['name'],
            'Type': course['type'],
        }

        # A page without the details div yields only the two entries above;
        # format_course then reports the attributes that are missing.
        divs = details_div.find_all("div") if details_div is not None else []
        for div in divs:
            # only the label divs carry 'aria-describedby'
            if 'aria-describedby' not in div.attrs:
                continue

            # the value lives in the column to the right of the label
            right_col = div.find_next_sibling("div")
            if right_col is None:
                continue

            details[div.text.strip()] = right_col.text.strip()

        return details

In [36]:
# Run the department scraper on the MATH entry of `departments`.
# NOTE(review): DepartmentScraper and departments are not defined in this
# notebook; presumably they come from the %run cells at the top and
# start_scrape fills math_scraper.class_pages (used below) - confirm.
math_scraper = DepartmentScraper()
math_scraper.start_scrape(departments['MATH'])

In [37]:
# Passing links to the constructor scrapes them immediately. Each line printed
# below is "<course> <attribute>" for an attribute format_course did not find.
math_classes = CourseScraper(math_scraper.class_pages)

MATH:1120:0AAA Minimum Fee Hours
MATH:1140:0AAA Minimum Fee Hours
MATH:1140:0AAA Description
MATH:1340:0AAA Minimum Fee Hours
MATH:1380:000A Minimum Fee Hours
MATH:1380:000B Minimum Fee Hours
MATH:1440:0AAA Minimum Fee Hours
MATH:1460:000A Minimum Fee Hours
MATH:1460:000B Minimum Fee Hours
MATH:1550:000A Minimum Fee Hours
MATH:1550:000B Minimum Fee Hours
MATH:1560:0AAA Minimum Fee Hours
MATH:1850:0AAA Minimum Fee Hours
MATH:1850:0CCC Minimum Fee Hours
MATH:1850:0DDD Minimum Fee Hours
MATH:1850:0FFF Minimum Fee Hours
MATH:1850:0HHH Minimum Fee Hours
MATH:1850:0JJJ Minimum Fee Hours
MATH:1850:0LLL Minimum Fee Hours
MATH:1860:0AAA Minimum Fee Hours
MATH:1860:0BBB Minimum Fee Hours
MATH:1860:0DDD Minimum Fee Hours
MATH:1860:0FFF Minimum Fee Hours
MATH:3720:0BBB Minimum Fee Hours
MATH:3770:0AAA Minimum Fee Hours
MATH:3770:0CCC Minimum Fee Hours
MATH:4010:0AAA Minimum Fee Hours
MATH:4020:0BBB Minimum Fee Hours
MATH:4020:0BBB Description
MATH:5000:0AAA Minimum Fee Hours
MATH:5200:0AAA Minimum

In [38]:
# Preview the first rows of the DataFrame built from the scraped courses.
math_classes.courses_df.head()

Unnamed: 0,Corequisites,Course,Description,Minimum Fee Hours,Prerequisites,Recommendations,Requirements,Restrictions,Type
0,,MATH:0100:0071,This course introduces students to basic algeb...,3,None\n \n \n...,,,Restricted for all students,
1,,MATH:0100:0331,This course introduces students to basic algeb...,3,None\n \n \n...,,,Restricted for all students,
2,,MATH:0100:0332,This course introduces students to basic algeb...,3,None\n \n \n...,,,Restricted for all students,
3,,MATH:1000:0001,How did women mathematicians and scientists he...,1,None\n \n \n...,,first- or second-semester standing,Restricted to new first-year undergraduates,
4,,MATH:1005:0071,This section is taught as a self-paced course ...,4,MATH:0100 with a minimum grade of C- or ALEKS ...,it is strongly recommended that students whose...,,,
