In [44]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
## The downloader for Data Processing in Python midterm

In [90]:
class MidtermDownloader:
    '''
        Class for downloading the characters and books for midterm purpose.

        Constructing an instance runs the whole pipeline: the list page at
        `url` is fetched and parsed, the character links are extracted, every
        character page is fetched, and the results are stored as a DataFrame.

        Attributes set by the constructor:
            url        -- the list page URL passed in
            soup       -- parsed list page (result of getSoup)
            links      -- list of character page URLs (result of getLinks)
            characters -- list of {'Name': ..., 'Books': [...]} dicts
            dataframe  -- one row per (character_name, book) pair
    '''
    def __init__(self, url):
        '''
            Running the whole download for the given list page url.
        '''

        # Running the functions
        self.url = url
        self.soup = self.getSoup(self.url)
        self.links = self.getLinks(self.soup)
        # Iterate over the instance attribute; a bare `links` is not defined
        # in this scope and only worked when a global leaked from the kernel.
        self.characters = [self.getChars(link) for link in self.links]

        # Saving the results to Pandas DataFrame
        self.dataframe = self.saveToPandas(self.characters)


    def getSoup(self, url):
        '''
            Getting the soup from BeautifulSoup.

            Fetches `url` with requests, forces UTF-8 decoding of the
            response and returns it parsed with the lxml parser.
        '''

        r = requests.get(url)
        r.encoding = 'UTF-8'
        return BeautifulSoup(r.text, 'lxml')


    def getLinks(self, soup):
        '''
            Getting the links for wanted characters.

            Locates the span with id 'Unseen_University_and_the_Wizards',
            walks up from it to an enclosing element and collects the link of
            every <li> found there. Returns a list of absolute URLs.
        '''

        section = soup.find('span', {'id': 'Unseen_University_and_the_Wizards'})
        # NOTE(review): this navigation chain is tied to the page's current
        # markup -- presumably it reaches the table holding all character
        # lists; confirm against the live page if the results look wrong.
        lis = section.parent.previous.previous.parent.parent.parent.find_all('li')

        links = []
        for item in lis:
            anchor = item.find('a')
            # Skip list items that carry no usable link instead of crashing
            if anchor is None or not anchor.get('href'):
                continue
            links.append("http://wiki.lspace.org" + anchor['href'])

        return links


    def getChars(self, link):
        '''
            Getting the name and books for a given character (link).

            Returns a dict with the page heading under 'Name' and the texts
            of all 'Book:' links found inside table cells under 'Books'.
        '''

        # getSoup is a method, so it has to be called through self
        soup = self.getSoup(link)
        name = soup.find(id='firstHeading').text

        books = [i.text for i in soup.select("td a[href*='Book:']")]

        chars = {
            'Name': name,
            'Books': books
        }

        return chars


    def saveToPandas(self, chars):
        '''
            Saving all data to Pandas DataFrame.

            `chars` is an iterable of dicts with the keys 'Name' and 'Books'.
            Returns a DataFrame with one row per (character, book) pair and
            the columns 'character_name' and 'book'; the columns are present
            even when there are no rows.
        '''

        rows = [
            {'character_name': data['Name'], 'book': book}
            for data in chars
            for book in data['Books']
        ]

        # Explicit columns keep the schema stable for an empty result
        return pd.DataFrame(rows, columns=['character_name', 'book'])
    
    
    
# Downloading according to the midterm assignment.
# Constructing MidtermDownloader performs all HTTP requests (the list page
# plus one request per character link), so this cell needs network access.
url = "http://wiki.lspace.org/mediawiki/List_of_Pratchett_characters"
md = MidtermDownloader(url)
md.dataframe

Unnamed: 0,book,character_name
0,Reaper Man,Evadne Cake
1,Men at Arms,Evadne Cake
2,Jingo,Evadne Cake
3,Going Postal,Evadne Cake
4,Making Money,Evadne Cake
5,Guards! Guards!,Fred Colon
6,Men at Arms,Fred Colon
7,Soul Music,Fred Colon
8,Feet of Clay,Fred Colon
9,Jingo,Fred Colon


In [93]:
# Work on the frame produced by the downloader
df = md.dataframe

## Task 3a
# Boolean mask: rows whose book is "Men at Arms"
book = df['book'].eq("Men at Arms")
# Non-null count per column among the selected rows
df.loc[book].count()
# --> Answer: 14

book              14
character_name    14
dtype: int64

In [94]:
## Task 3b
# Boolean mask: rows belonging to either of the two characters
chars = df['character_name'].isin([
    "Visit-The-Infidel-With-Explanatory-Pamphlets",
    "Evadne Cake",
])
# Rows per book for those characters; a count of 2 means both appear in it
df.loc[chars].groupby(['book']).count()
# --> Answer: 1 (Jingo)

Unnamed: 0_level_0,character_name
book,Unnamed: 1_level_1
Feet of Clay,1
Going Postal,1
Hogfather,1
Jingo,2
Making Money,1
Men at Arms,1
Night Watch,1
Reaper Man,1
The Fifth Elephant,1
Thud!,1
