Retrieve the finalists for the Hugo Awards for Best Novel, Best Novella, Best Novelette, and Best Short Story for every year linked from the Hugo history page, and export them to `hugos.csv`.

Retrieved from http://www.thehugoawards.org/

In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd

In [2]:
# Collect the links to the per-year award pages from the Hugo history index.
url = "http://www.thehugoawards.org/hugo-history/"

# Download and parse the index page. Fail loudly on an HTTP error instead of
# parsing an error page and silently ending up with an empty list of years.
r = requests.get(url)
r.raise_for_status()
text = r.text
soup = BeautifulSoup(text, "html.parser")

# Target of every anchor on the page (None for anchors without an href)
links = [anchor.get('href') for anchor in soup.find_all('a')]

# Yearly pages live under the history URL and start with digits. A raw string
# keeps \d a regex digit class (not an invalid string escape), and the dots of
# the host name are escaped so they only match literal dots.
pattern = re.compile(r'http://www\.thehugoawards\.org/hugo-history/\d+')

# Keep the hrefs that match; anchors without an href are skipped explicitly
# rather than by swallowing the resulting TypeError with a bare except.
pages = [link for link in links if link and pattern.search(link)]

# Drop duplicates and sort the URLs in descending order; the digits directly
# follow the common prefix, so pages with higher leading digits come first.
pages = sorted(set(pages), reverse=True)

In [3]:
# Retrieve the finalist entries for each tracked award category from one year's page

def get_hugos(link, awards=('Best Novel', 'Best Novella', 'Best Novelette', 'Best Short Story')):
    """Scrape one yearly page for the finalists of each requested award.

    Parameters
    ----------
    link : str
        URL of the page to download.
    awards : iterable of str, optional
        Category headings to look for. Defaults to the four written-fiction
        categories this notebook tracks.

    Returns
    -------
    dict
        ``{year: entries}`` where ``year`` is the first four characters of the
        page ``<title>`` and ``entries`` holds one dict per award, in the order
        given: ``{award: [finalist text, ...]}``, or ``{}`` when the page has
        no ``<strong>`` heading with that exact text or no ``<ul>`` after it.

    Raises
    ------
    IndexError
        If the page has no ``<title>`` element.
    """
    # Get the html
    r = requests.get(link)
    pagesoup = BeautifulSoup(r.text, 'html.parser')

    # Get the year (leading four characters of the page title)
    year = pagesoup.find_all('title')[0].text[0:4]

    x1 = []
    for award in awards:
        # Locate the category heading, then the first list that follows it.
        # Missing pieces are checked explicitly instead of relying on a bare
        # except, so unrelated errors are no longer silently discarded.
        heading = pagesoup.find('strong', string=award)
        listing = heading.find_next('ul') if heading is not None else None
        if listing is None:
            # Category not present on this page: keep an empty placeholder
            x1.append({})
            continue
        x1.append({award: [item.get_text() for item in listing.find_all('li')]})

    return {year: x1}

In [4]:
def build_finalists(pages, delay=1):
    """Scrape every page in ``pages`` and collect the per-page results.

    Parameters
    ----------
    pages : list of str
        URLs to pass to ``get_hugos``, processed in order.
    delay : int or float, optional
        Seconds to sleep before each request. Defaults to 1, the pause this
        function has always used.

    Returns
    -------
    list
        One ``get_hugos`` result per page, in the same order as ``pages``.
    """
    finalists = []
    # Print one dot per page up front; the dots printed during the loop below
    # then show progress against that line.
    print('.' * len(pages))

    for link in pages:
        # Pause between requests so the site is not hit in a tight loop
        time.sleep(delay)
        finalists.append(get_hugos(link))
        # Flush so each dot appears immediately, even on buffered output
        print('.', end='', flush=True)

    return finalists

In [None]:
# Scrape every yearly page collected in `pages`. build_finalists pauses before
# each request, so this cell takes at least about one second per page.
finalists = build_finalists(pages)

.....................................................................
........................

In [6]:
# Turn every scraped finalist string into a
# [year, award, position, title, author, publisher] row.
pattern1 = re.compile(r' by ')
listoflists = []


def _split_publisher(item):
    """Split a trailing "(publisher)" or "[publisher]" off a finalist string.

    Returns ``(body, publisher)``. When the string does not end with a closing
    bracket, or has no matching " (" / " [" opener, the whole string is
    returned as the body with a placeholder publisher.
    """
    missing = 'no publishing information available'
    # endswith() is safe on an empty string, unlike item[-1]
    if item.endswith(')'):
        opener, closer = ' (', ')'
    elif item.endswith(']'):
        opener, closer = ' [', ']'
    else:
        return item, missing

    # Split on the last opener only, so brackets inside the title survive
    body, found, tail = item.rpartition(opener)
    if not found:
        # Closing bracket without an opener: treat as no publisher info
        return item, missing
    return body, tail.replace(closer, '')


def _parse_finalist(item):
    """Return ``(title, author, publisher)`` for one finalist string."""
    # "No Award" is recorded as a finalist with placeholder author/publisher
    if item == 'No Award':
        return 'No Award', 'None', 'None'

    body, pub = _split_publisher(item)

    # Entries are written either "Title by Author" or "Title, Author".
    # NOTE: when the separator occurs more than once, the first segment is
    # taken as the title and the last as the author; middle segments are
    # dropped.
    separator = ' by ' if pattern1.search(body) else ', '
    parts = body.rsplit(separator)
    return parts[0], parts[-1], pub


for d in finalists:
    for year, awards in d.items():
        for entries in awards:
            for award, itemlist in entries.items():
                # enumerate gives each entry its own 1-based position, even
                # when the same text appears twice in a list (list.index
                # would return the first occurrence for both).
                for pos, item in enumerate(itemlist, start=1):
                    title, auth, pub = _parse_finalist(item)
                    listoflists.append([year, award, pos, title, auth, pub])

In [7]:
# Construct a dataframe of the final results, one row per finalist
hugos = pd.DataFrame(listoflists, columns=['year', 'award', 'outcome', 'title', 'author', 'published'])
hugos = hugos.drop_duplicates()

# Normalise author names: lower-case, put a space after every period
# ("N.K. Jemisin" -> "N. K. Jemisin"), title-case, then strip the trailing
# space that the period rule leaves behind a name ending in a period.
# NOTE: str.title() also lower-cases interior capitals ("McGuire" -> "Mcguire").
hugos['author'] = [
    author.lower().replace('. ', '.').replace('.', '. ').title().strip()
    for author in hugos['author']
]

# Drop the comma left on titles written as "Title, by Author". endswith() is
# used because indexing title[-1] raises IndexError on an empty title.
hugos['title'] = [title[:-1] if title.endswith(',') else title for title in hugos['title']]

# Make the year column numeric
hugos['year'] = pd.to_numeric(hugos['year'])

# Export the dataframe to csv
filename = 'hugos.csv'
hugos.to_csv(filename, index=False)

In [8]:
# Preview the first rows instead of rendering the whole frame
hugos.head(10)

Unnamed: 0,year,award,placing,title,author,published
0,2017,Best Novel,1,The Obelisk Gate,N. K. Jemisin,Orbit Books
1,2017,Best Novel,2,All the Birds in the Sky,Charlie Jane Anders,Tor Books / Titan Books
2,2017,Best Novel,3,Ninefox Gambit,Yoon Ha Lee,Solaris Books
3,2017,Best Novel,4,A Closed and Common Orbit,Becky Chambers,Hodder & Stoughton / Harper Voyager US
4,2017,Best Novel,5,Too Like the Lightning,Ada Palmer,Tor Books
5,2017,Best Novel,6,Death’s End,Ken Liu,Tor Books / Head of Zeus
6,2017,Best Novella,1,Every Heart a Doorway,Seanan Mcguire,Tor.com publishing
7,2017,Best Novella,2,The Dream-Quest of Vellitt Boe,Kij Johnson,Tor.com publishing
8,2017,Best Novella,3,Penric and the Shaman,Lois Mcmaster Bujold,Spectrum Literary Agency
9,2017,Best Novella,4,The Ballad of Black Tom,Victor Lavalle,Tor.com publishing
