# Hackathon

## Scrape Departments 

Scrape each department's course listing to collect the URLs of its classes, which will be scraped further later on.

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Academic unit IDs used by the MyUI course search, keyed by a short
# department name. This is the single source of truth for the IDs.
class_id = {
    "psych": 558,
    "bio": 537,
    "swahili": 674
}

# Department listing URL; the placeholder receives the academic unit ID.
DEPARTMENT_URL_TEMPLATE = (
    "https://myui.uiowa.edu/my-ui/courses/by-department.page"
    "?q.academicUnitId={}&showResults=1"
)

# Listing URL per department, derived from class_id so the ID embedded in
# each URL can never disagree with the ID stored in class_id.
class_urls = {
    name: DEPARTMENT_URL_TEMPLATE.format(unit_id)
    for name, unit_id in class_id.items()
}

**Explore the structure of the page.**

1. We need to see how each class is displayed within its cell
2. We need to paginate

In [None]:
def get_page_rows(soup):
    """
    Return the table rows of the search results on a listing page.

    Finds the element with id ``search-result``, then the next ``<tbody>``
    after it, and returns every ``<tr>`` inside that body.

    soup: parsed page exposing the BeautifulSoup ``find`` API.

    Returns a list of row elements. The list is empty when the page has no
    ``search-result`` element or no ``<tbody>`` following it, instead of
    raising AttributeError on ``None``.
    """
    table = soup.find(id='search-result')
    if table is None:
        return []

    table_body = table.find_next('tbody')
    if table_body is None:
        return []

    return table_body.find_all('tr')

In [None]:
def get_row_info(row):
    """
    Extract the course name, link and type from one search-result row.

    row: a ``<tr>`` element exposing the BeautifulSoup ``find_all`` /
    ``find_next`` API. Its first ``<td>`` must contain an
    ``<a class="text-underline">`` whose text is the course name and whose
    ``href`` is the course link.

    Returns ``[course_name, course_link, course_type]``. ``course_type`` is
    the text of the next ``<em>`` after the title cell (falling back to the
    next ``<em>`` after the row), or ``None`` when no ``<em>`` is found.
    """
    # each td has a separate piece of info
    cols = row.find_all('td')

    # but the first cell is the most important
    row_title = cols[0]

    # <a>, has multiple info for us
    row_a = row_title.find("a", { "class" : "text-underline" })
    course_name = row_a.text
    course_link = row_a.attrs['href']

    # Type of class. Explicit None checks replace the former bare `except:`,
    # which also hid unrelated errors (including KeyboardInterrupt).
    type_tag = row_title.find_next("em")
    if type_tag is None:
        type_tag = row.find_next("em")
    course_type = type_tag.text if type_tag is not None else None

    return [course_name, course_link, course_type]

In [None]:
def scrape_department(url, timeout=30):
    """
    Download one department listing page and extract its course rows.

    url: address of the department listing page.
    timeout: seconds to wait for the server before ``requests`` gives up
        (without a timeout the request can block indefinitely).

    Returns a list with one ``get_row_info`` result per table row. When the
    server does not answer with status 200 the error is printed and an empty
    list is returned, so the error page is never parsed as if it were a
    listing.
    """
    # Request and check page
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        print ("Couldn't request page, Err: {}".format(page.status_code))
        return []

    # Create soup for our functions
    soup = BeautifulSoup(page.content, 'html.parser')

    # get the raw rows from page
    rows = get_page_rows(soup)

    # format each row to get the class url and type
    rows_info = [get_row_info(row) for row in rows]

    # TODO: pagination is not handled here; only the requested page is read

    return rows_info

In [None]:
# Try the function-based scraper on the psychology department listing
rows = scrape_department(class_urls['psych'])

In [114]:
class Scraper:
    """
    Collects course links from a MyUI (myui.uiowa.edu) department listing.

    Typical use is ``Scraper().start_scrape(academic_unit_id)``. Afterwards
    ``search_pages`` holds the URL of every result page of the department
    and ``class_pages`` holds one ``{'name', 'link', 'type'}`` dict per
    course row found on those pages.
    """

    # Seconds to wait for the server; without a timeout `requests` can block
    # forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """
        Create an empty scraper; no request is made until start_scrape().
        """
        self.academicUnitID = None  # department id used to build search URLs
        self.search_pages = []      # result-page URLs, in scrape order
        self.class_pages = []       # dicts describing each course row found

    def create_search_page(self, page_num):
        """
        Return the URL of result page `page_num` for the current department.
        """
        search = "https://myui.uiowa.edu/my-ui/courses/by-department.page?page={}&q.academicUnitId={}&showResults=1".format(page_num, self.academicUnitID)
        return search

    def enqueue_search_page(self, url):
        """
        Queue `url` for scraping unless it is already queued.

        Skipping duplicates keeps a page from being scraped twice, which
        would only produce duplicate entries in `class_pages`.
        """
        if url not in self.search_pages:
            self.search_pages.append(url)

    def get_search_pages(self, soup):
        """
        Queue every result page of the department, starting with page 1.

        `soup` is the parsed first result page. Further pages are discovered
        from the links inside its ``<ul class="pagination">`` element whose
        ``title`` starts with ``'Go to page '``; the link text is used as the
        page number. Without such an element only page 1 is queued.
        """
        # Every department has at least one page
        self.enqueue_search_page(self.create_search_page(1))

        # check if it has a pagination tag
        paging = soup.find("ul", {"class" : "pagination"})
        if paging is None:
            # single page, nothing more to queue
            return

        # These are the buttons
        for link in paging.find_all('a'):
            # only get buttons we need
            if link.attrs.get('title', '').startswith('Go to page '):
                # strip so surrounding whitespace never ends up in the URL
                page_num = link.text.strip()
                self.enqueue_search_page(self.create_search_page(page_num))

        return

    def get_page_rows(self, soup):
        """
        Return the ``<tr>`` rows of the search-result table in `soup`.

        Returns an empty list when the page has no element with id
        ``search-result`` or no ``<tbody>`` after it.
        """
        table = soup.find(id='search-result')
        if table is None:
            return []

        table_body = table.find_next('tbody')
        if table_body is None:
            return []

        return table_body.find_all('tr')

    def get_row_info(self, row):
        """
        Record the course described by one table row in `class_pages`.

        Appends ``{'name', 'link', 'type'}``. Rows without a ``<td>`` or
        without an ``<a class="text-underline">`` carrying an ``href`` are
        skipped. ``'type'`` is ``None`` when no ``<em>`` is found.
        """
        # each td has a separate piece of info
        cols = row.find_all('td')
        if not cols:
            return

        # but the first cell is the most important
        row_title = cols[0]

        # <a>, has multiple info for us
        row_a = row_title.find("a", { "class" : "text-underline" })
        if row_a is None or 'href' not in row_a.attrs:
            return

        # Type of class. Explicit None checks replace the former bare
        # `except:`, which also hid unrelated errors.
        type_tag = row_title.find_next("em")
        if type_tag is None:
            type_tag = row.find_next("em")

        self.class_pages.append({
            'name': row_a.text,
            # hrefs on the page are site-relative
            'link': 'https://myui.uiowa.edu' + row_a.attrs['href'],
            'type': type_tag.text if type_tag is not None else None
        })

        return

    def _fetch_soup(self, url):
        """Download `url` and return its parsed soup, or None on a non-200 reply."""
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if page.status_code != 200:
            print ("Couldn't request page, Err: {}".format(page.status_code))
            return None

        return BeautifulSoup(page.content, 'html.parser')

    def _collect_rows(self, soup):
        """Run get_row_info on every result row of an already parsed page."""
        for row in self.get_page_rows(soup):
            # this will format and add it
            self.get_row_info(row)

    def scrape_page(self, url):
        """
        Download one result page and record all of its course rows.

        A page that does not answer with status 200 is reported and skipped.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            return

        self._collect_rows(soup)

        return

    def start_scrape(self, academicUnitID):
        """
        Scrape every result page of the department `academicUnitID`.

        Results of any previous run on this instance are discarded first, so
        calling this again never re-scrapes old pages or duplicates rows.
        If the first page cannot be fetched, nothing is queued or recorded.
        """
        # set college and start from a clean slate
        self.academicUnitID = academicUnitID
        self.search_pages = []
        self.class_pages = []

        # Request and check the first page
        soup = self._fetch_soup(self.create_search_page(1))
        if soup is None:
            return

        # Instantiate scraping queue
        self.get_search_pages(soup)

        # Page 1 is already downloaded and parsed: reuse it instead of
        # requesting it a second time.
        self._collect_rows(soup)

        # get_search_pages always queues page 1 first; scrape the rest
        for page_url in self.search_pages[1:]:
            self.scrape_page(page_url)

        return
        
        

In [115]:
# Fresh scraper instance; nothing is requested until start_scrape() is called
scrape = Scraper()

In [116]:
# Scrape every result page of the psychology department (academic unit 558)
scrape.start_scrape(class_id['psych'])

In [117]:
# Result-page URLs that were queued for the psychology department
scrape.search_pages

['https://myui.uiowa.edu/my-ui/courses/by-department.page?page=1&q.academicUnitId=558&showResults=1',
 'https://myui.uiowa.edu/my-ui/courses/by-department.page?page=2&q.academicUnitId=558&showResults=1']

In [118]:
# One dict (name, link, type) per course row collected from those pages
scrape.class_pages

[{'link': 'https://myui.uiowa.edu/my-ui/courses/details.page?_ticket=SfX5E-nyonxsYgY9Kg_1wyeYtXpT9Iue&id=831031&ci=147710',
  'name': 'PSY:1001:0AAA',
  'type': 'Lecture'},
 {'link': 'https://myui.uiowa.edu/my-ui/courses/details.page?_ticket=ExMsVvBshZjqRM_fM6pCHlGWPxZB1Cdo&id=831032&ci=147710',
  'name': 'PSY:1001:0A01',
  'type': 'Discussion'},
 {'link': 'https://myui.uiowa.edu/my-ui/courses/details.page?_ticket=uo_ZsjXillVYNus4BQt1QhTTpbfGzDeP&id=831033&ci=147710',
  'name': 'PSY:1001:0A02',
  'type': 'Discussion'},
 {'link': 'https://myui.uiowa.edu/my-ui/courses/details.page?_ticket=veCJwyerdUtYNus4BQt1QieYtXpT9Iue&id=831034&ci=147710',
  'name': 'PSY:1001:0A03',
  'type': 'Discussion'},
 {'link': 'https://myui.uiowa.edu/my-ui/courses/details.page?_ticket=7bmzdNQFqw_F5uOryZHTKlGWPxZB1Cdo&id=831035&ci=147710',
  'name': 'PSY:1001:0A04',
  'type': 'Discussion'},
 {'link': 'https://myui.uiowa.edu/my-ui/courses/details.page?_ticket=ouFOu4a-7cjF5uOryZHTKvxrZxWWA5Tu&id=831036&ci=147710',

In [92]:
# Repeat the scrape for the Swahili department (academic unit 674),
# using a separate Scraper instance
swahili_scrape = Scraper()
swahili_scrape.start_scrape(674)

In [93]:
# Result-page URLs that were queued for the Swahili department
swahili_scrape.search_pages

['https://myui.uiowa.edu/my-ui/courses/by-department.page?page=1&q.academicUnitId=674&showResults=1']

In [94]:
# Course rows collected for the Swahili department
swahili_scrape.class_pages

[]