## Target Website URL: https://www.chicagoreader.com/
## Tutorial Reference: http://www.gregreda.com/2013/03/03/web-scraping-101-with-python/
#### NOTE: For instant help on a function (signature/docstring) in Jupyter, press Shift + Tab

# In [49]:
# Import necessary modules
from bs4 import BeautifulSoup
try:
    # Python 3.x: the urllib2 module was split into urllib.request and urllib.error
    from urllib.request import urlopen
except ImportError:
    # Python 2.x fallback. Only a missing module should trigger it; any other
    # error is a real problem and must not be silently swallowed.
    from urllib2 import urlopen


# In [50]:
# Module-level constants (Python's stand-in for C-style macros)

# Launch page of the site; relative links found on pages are appended to it
BASE_URL = "https://www.chicagoreader.com/"

# "Food & Drink" section page, used for test purposes only
PAGE_URL = "https://www.chicagoreader.com/chicago/best-of-chicago-2011-food-drink/BestOf?oid=4106228"

# In [51]:
def create_soup(section_url, parser="lxml"):
    '''
        Obj: Download a page and parse it, so the core scraping functions do not each repeat that work. It's an intrinsic (helper) function
        Input: section_url: The URL of the page (specific section / specific category within the section) we want to scrape
               parser: Name of the parser handed to BeautifulSoup [html.parser/lxml/html5lib]. Defaults to "lxml"
        Output: An instance of the BeautifulSoup class built from the page's markup
        Benefit: The parser is selected in exactly one place, so changing it later needs a change only here (or at the call site).
    '''
    from contextlib import closing           # Local import; closing() also works where urlopen() results are not context managers (Python 2.x)

    # closing() guarantees the HTTP response is released even if read() raises
    with closing(urlopen(section_url)) as response:
        html = response.read()               # The raw markup, i.e. the "View Page Source" content
    #print(html)
    return BeautifulSoup(html, parser)       # Arg1: Actual markup    Arg2: The parser to use

# In [52]:
def get_category_links(section_url):
    '''
        Obj  : To return the hyperlinks of the individual categories. It's an intrinsic (helper) function
        Input: section_url: URL of the specific section you are targeting. In this case you are focusing on "Food & Drink"
        Output: A list of absolute category URLs found on the section page.
                An empty list is returned when the page has no <dl class="boccat"> element;
                <dd> entries without a usable link are skipped.
    '''
    try:
        from urllib.parse import urljoin     # Python 3.x
    except ImportError:
        from urlparse import urljoin         # Python 2.x

    soup = create_soup(section_url)
    #print(soup)
    boccat = soup.find("dl", "boccat")       # First <dl> carrying the CSS class "boccat" (None when absent)
    if boccat is None:
        return []

    category_links = []
    for dd in boccat.find_all("dd"):         # find_all: list of Tag objects that match the given criteria
        anchor = dd.a                        # First <a> inside this <dd>, or None
        href = anchor.get("href") if anchor is not None else None
        if not href:
            continue                         # Nothing to link to; skip instead of crashing
        # urljoin avoids the doubled slash that BASE_URL + "/path" produced
        # and leaves already-absolute hrefs untouched
        category_links.append(urljoin(BASE_URL, href))
    #print(category_links)
    return category_links

#Test:
#get_category_links(PAGE_URL)

# In [53]:
def _tag_text(tag):
    '''
        Obj: Return the text of a BeautifulSoup tag without ever yielding None for an existing tag
        Input: A Tag object, or None when the lookup found nothing
        Output: None for a missing tag; otherwise tag.string, falling back to the
                concatenated text when .string is None (tag with several children)
    '''
    if tag is None:
        return None
    text = tag.string
    return text if text is not None else tag.get_text()


def get_catgeory_winner_runnerup(specific_category_url):
    '''
        Obj: Fetch the winner and the runner-up details. It's an intrinsic (helper) function
        Input: The specific category URL
        Output: A dictionary with the keys category / category_url / winners / runnerups.
                "category" is None when the page has no <h1 class="headline">;
                "winners" and "runnerups" are lists of strings (possibly empty).
    '''
    soup = create_soup(specific_category_url)
    category = _tag_text(soup.find("h1", "headline"))                 # Name of the category in consideration
    #print(category)
    winners = [_tag_text(h2) for h2 in soup.find_all("h2", "boc1")]    # Text of each winner heading
    runnerups = [_tag_text(h2) for h2 in soup.find_all("h2", "boc2")]  # Text of each runner-up heading
    #print(winners)
    #print(runnerups)
    return {"category": category,
            "category_url": specific_category_url,
            "winners": winners,
            "runnerups": runnerups}

#Test:    
#get_catgeory_winner_runnerup("https://www.chicagoreader.com//chicago/BestOf?category=1979894&year=2011")

# In [66]:
def persist_scraped_data(section_url=None, output_path="BestChicagoFoods_N_Drinks.csv"):
    '''
        Obj   : To persist the scraped data in a CSV file
        Input : section_url: Section page whose categories are scraped. Defaults to PAGE_URL when omitted
                output_path: Path of the CSV file to (over)write. Defaults to "BestChicagoFoods_N_Drinks.csv"
        Output: None [Writes a header row followed by one row per category; several
                winners/runner-ups are joined with "|", missing values become "-NA-"]
    '''
    import csv

    if section_url is None:
        section_url = PAGE_URL                # Resolved at call time so the default follows the module constant

    def _or_na(value):
        # Empty/None values are written as the "-NA-" placeholder
        return value if value else "-NA-"

    def _joined(names):
        # Drop None entries so str.join cannot raise TypeError on them
        return "|".join(name for name in (names or []) if name is not None)

    category_links = get_category_links(section_url)    # All categories within the section
    # newline="" is what the csv module asks for (it emits its own line endings);
    # utf-8 keeps non-ASCII names from failing on platforms with a narrower default encoding
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerow(["Category", "Category URL", "Winners", "Runner Ups"])
        for cat_link in category_links:
            result = get_catgeory_winner_runnerup(cat_link)
            #print(result)
            # Pull the values out explicitly so the column order is fixed
            csv_writer.writerow([
                _or_na(result['category']),
                _or_na(result['category_url']),
                _or_na(_joined(result['winners'])),
                _or_na(_joined(result['runnerups'])),
            ])
        

# In [67]:
# Script entry point: scrape and save only when executed directly, not on import
if __name__ == "__main__":
    persist_scraped_data()
    print("Web scraping results persisted!!")
    

# Output: Web scraping results persisted!!
