# Yeah, the Hugo Awards.

I think I've consistently liked Hugo Award-winning books, and I'm about to need another one. Will one of them be the book of my dreams? Which books that I've already read won a Hugo without my EVEN KNOWING IT!?!?!!!

Are there certain authors who show up more than others? 

What else can we figure out about the books that receive awards?

In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import re
import functools
import os
import time
import pickle
import numpy as np
import pandas as pd

In [2]:
# Ok, let's get links to the years that awards have been given out.
url = "http://www.thehugoawards.org/hugo-history/"

# Save the html from that page
r = requests.get(url)
text = r.text
soup = BeautifulSoup(text, "html.parser")

# Collect the href of every anchor on the page
# (anchors without an href attribute contribute None)
links = [anchor.get('href') for anchor in soup.find_all('a')]

# A year page is the history URL followed by digits.
# Raw string so that \d reaches the regex engine untouched; the dots are
# escaped so they only match literal dots in the host name.
pattern = re.compile(r'http://www\.thehugoawards\.org/hugo-history/\d+')

# Keep only the links that point at a year page. Non-string entries
# (missing hrefs) are skipped explicitly rather than by swallowing
# every exception.
pages = []
for link in links:
    if isinstance(link, str) and pattern.search(link):
        pages.append(link)

# Drop duplicates and sort the URLs in reverse lexicographic order
pages = sorted(set(pages), reverse=True)

Unfortunately, the Hugo Awards website doesn't format every page consistently, so extracting the finalist novels for each year isn't completely straightforward. Most of the pages follow a particular pattern; I make exceptions for the rest.

In [3]:
# Write a function to retrieve the finalist novels for a given year

def get_novels(link):
    """Fetch one year's page and return the lines that look like finalist novels.

    Parameters:
        link: URL of a single year's page.

    Returns:
        A one-entry dict ``{year: books}``. ``year`` is the first four
        characters of the page's <title> text (a string). ``books`` is the
        list of text lines, taken from the selected 'Best ' section of the
        page, that contain a title/author separator and a bracketed or
        parenthesised part and do not start with 'Novel'.

    Raises:
        IndexError: if the page has no <title> element, or has fewer
            'Best ' markers than the section index used for that year.
    """
    # Get the html
    r = requests.get(link)
    pagesoup = BeautifulSoup(r.text, 'html.parser')

    # Get the year
    year = pagesoup.find_all('title')[0].text[0:4]

    # The page text is cut at every occurrence of 'Best '. On most pages the
    # piece after the first occurrence is the one wanted (for 1957, when Hugos
    # only went to periodicals, that piece simply yields no novels). On the
    # 2004 and 2017 pages the locating term shows up earlier in the text, so a
    # later piece has to be taken instead.
    section_index = {'2004': 2, '2017': 3}.get(year, 1)
    novels = pagesoup.get_text().split('Best ')[section_index].split('\n')

    # A finalist line has a title/author separator (', ' or ' by ') ...
    has_separator = re.compile(r'(, )|( by )')
    # ... and a (...) or [...] part
    has_brackets = re.compile(r'(\(.*\))|(\[.*\])')

    # Collect the books into a list to return; lines starting with 'Novel'
    # are category headings rather than entries
    books = [
        novel
        for novel in novels
        if has_separator.search(novel)
        and has_brackets.search(novel)
        and not novel.startswith('Novel')
    ]

    return {year: books}

In [4]:
# Scraping is slow (one request per year, one second apart), so the result is
# cached in hugofinalists.p and reused whenever that file is present.
# NOTE: unpickling can run arbitrary code; this assumes hugofinalists.p was
# written by this notebook and not obtained from somewhere else.
cache_path = 'hugofinalists.p'

if not os.path.exists(cache_path):
    # No cache yet: visit every year page, pausing before each request
    finalists = []
    for link in pages:
        time.sleep(1)
        finalists.append(get_novels(link))

    # Store the freshly built list for next time
    with open(cache_path, 'wb') as f:
        pickle.dump(finalists, f)
else:
    # Cache found: load the list of finalists straight from it
    with open(cache_path, 'rb') as f:
        finalists = pickle.load(f)

In [5]:
# Split the novel information into bits: title, author, publishing information
pattern1 = re.compile(r', by ')
pattern2 = re.compile(r' by ')
pattern3 = re.compile(r', ')
pattern4 = re.compile(r' \[')
pattern5 = re.compile(r' \(')
listoflists = []

for d in finalists:
    for key, value in d.items():
        for item in value:
            # Title / author: split at the first separator, trying the most
            # specific separator first (', by ', then ' by ', then ', ').
            for separator in (pattern1, pattern2, pattern3):
                if separator.search(item):
                    title, rest = separator.split(item, maxsplit=1)
                    break
            else:
                # No separator at all: keep the whole line as the title
                # instead of reusing the pieces of the previous entry.
                title, rest = item, ''

            # Author / publishing information: split at the LAST ' [' if
            # there is one, otherwise at the last ' ('.
            if pattern4.search(rest):
                author, published = rest.rsplit(' [', maxsplit=1)
            elif pattern5.search(rest):
                author, published = rest.rsplit(' (', maxsplit=1)
            else:
                # No bracketed part after the author: record an empty
                # 'published' field rather than the previous entry's value.
                author, published = rest, ''

            # One row per finalist, with the closing brackets stripped
            published = published.replace(')', '').replace(']', '')
            listoflists.append([key, title, author, published])

In [6]:
# Construct a dataframe of the final results
hugos = pd.DataFrame(listoflists, columns=['year', 'title', 'author', 'published'])

# Some of the names can be cleaned up a little; make formatting (mostly) consistent:
# lower-case, put exactly one space after every period, then title-case
hugos['author'] = [
    author.lower().replace(r". ", r".").replace(r".", r". ").title()
    for author in hugos['author']
]

# Hand corrections, keyed by row label. They are written with .loc on the
# dataframe itself: assigning through hugos.author[i] is chained indexing,
# which pandas warns about and which does not update the dataframe when
# Copy-on-Write is enabled.
author_fixes = {
    20: 'Larry Correia',
    5: 'Cixin Liu, translated by Ken Liu',
    11: 'Cixin Liu, translated by Ken Liu',
    351: 'Edward E. Smith',
    357: 'Edward E. Smith',
}
for row, name in author_fixes.items():
    # .loc would append a new row for a label that does not exist, so skip those
    if row in hugos.index:
        hugos.loc[row, 'author'] = name

# Make the year column numeric
hugos['year'] = pd.to_numeric(hugos['year'])

In [7]:
# Export the dataframe to csv: writes hugos.csv (a relative path, so it lands
# in the current working directory); index=False leaves the row index out.
hugos.to_csv('hugos.csv', index=False)