In [195]:
from requests import get
from bs4 import BeautifulSoup
from pandas import DataFrame, merge

In [200]:
class Downloader:
    '''
    Download manager class for characters.
    It contains methods for collecting character links and data.

    Attributes:
        link:  URL of the character index page (set in __init__).
        links: list of character page URLs (set by get_links_to_characters).
        data:  DataFrame with the columns 'book' and 'character_name'
               (set by get_character_data).
    '''

    def __init__(self):
        self.link = 'http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters'

    def get_links_to_characters(self):
        '''
        Parse the links to the characters, save them in self.links and also return them.

        List items that do not contain an anchor with an href are skipped
        instead of raising.
        '''
        response = get(self.link)
        response.encoding = 'UTF-8'

        soup = BeautifulSoup(response.text, 'html')
        # Locate the section heading span, walk up/back to an enclosing
        # ancestor and collect every <li> below it. The exact chain is tied
        # to the page markup -- NOTE(review): re-check if the wiki layout changes.
        heading = soup.find('span', {'id': 'Unseen_University_and_the_Wizards'})
        container = heading.parent.previous.previous.parent.parent.parent
        list_items = container.find_all('li')

        self.links = []
        for item in list_items:
            # href=True restricts the match to anchors that actually carry a link.
            anchor = item.find('a', href=True)
            if anchor is not None:
                self.links.append('http://wiki.lspace.org' + anchor['href'])

        return self.links

    def get_character_data(self):
        '''
        Parse character pages.
        Creates a DataFrame linking each character with the books he is appearing in
        (one row per book/character pair), stores it in self.data and returns it.

        If get_links_to_characters has not been called yet, it is called first.
        '''
        links = getattr(self, 'links', None)
        if links is None:
            links = self.get_links_to_characters()

        # Collect plain records and build the frame once: DataFrame.append
        # copied the whole frame per row and no longer exists in pandas 2.x.
        records = []
        for link in links:
            characterPage = CharacterPage(link)
            name = characterPage.get_character_name()
            for book in characterPage.get_character_books():
                records.append({'book': book, 'character_name': name})

        self.data = DataFrame(records, columns=['book', 'character_name'])

        return self.data

In [204]:
class CharacterPage:
    '''
    Class containing methods for parsing a character page.

    The page is downloaded and parsed once in the constructor; the parsed
    document is kept in self.soup and reused by the get_* methods.
    '''

    def __init__(self, link):
        '''
        Download the page at `link` and parse it into self.soup.
        '''
        self.link = link
        response = get(link)
        response.encoding = 'UTF-8'

        self.soup = BeautifulSoup(response.text, 'html')

    def get_character_name(self):
        '''
        Parse the name of the character from the character page.
        The name is the text of the <h1 id="firstHeading"> element; it is
        stored in self.name and returned.
        '''
        self.name = self.soup.find('h1', {'id': 'firstHeading'}).text

        return self.name

    def get_character_books(self):
        '''
        Parse the books which the character is appearing in.

        Collects the text of every link inside the 'toccolours' table whose
        title attribute contains "Book:". The list is stored in self.books
        and returned. A page without such a table yields an empty list.
        '''
        self.books = []

        table = self.soup.find('table', {"class": 'toccolours'})
        if table is None:
            # No matching table on this page: report "no books" instead of
            # failing with AttributeError on None.
            return self.books

        for anchor in table.find_all('a'):
            title = anchor.get('title')
            if title and "Book:" in title:
                self.books.append(anchor.text)

        return self.books

In [205]:
# Exercise 1: collect the links to the individual character pages.
# The returned list is the same object that is stored in downloader.links.
downloader = Downloader()
character_links = downloader.get_links_to_characters()

# Exercise 2: visit every character page and build the book/character table.
df = downloader.get_character_data()

In [206]:
# Exercise 3
# a. How many characters are reported in the book Men at Arms?
# Count distinct character names rather than rows, so a repeated
# (book, character) pair cannot inflate the answer.
men_at_arms = df[df["book"] == "Men at Arms"]
count = men_at_arms["character_name"].nunique()
print('In book Men at Arms {} character(s) are reported.'.format(count))

# b. In how many books appear both Visit-The-Infidel-With-Explanatory-Pamphlets and Evadne Cake?
evadne = df[df["character_name"] == "Evadne Cake"]
visit = df[df["character_name"] == "Visit-The-Infidel-With-Explanatory-Pamphlets"]

# Intersect the two sets of book titles: each shared book is counted once,
# whereas an inner merge yields one row per matching pair of rows.
count = len(set(evadne["book"]) & set(visit["book"]))

print('In {} book(s) appear both Visit-The-Infidel-With-Explanatory-Pamphlets and Evadne Cake.'.format(count))

In book Men at Arms 14 character(s) are reported.
In 1 book(s) appear both Visit-The-Infidel-With-Explanatory-Pamphlets and Evadne Cake.
