# Novel dataset web scraper

Gathers novel information from novelupdates, i.e. http://www.novelupdates.com/.  
The data is then cleaned and arranged into a dataset.  
The dataset is finally saved as a csv file.

In [1]:
import requests
import time
import re
import pandas as pd
from bs4 import BeautifulSoup

# Display all columns when showing dataframes
pd.set_option('display.max_columns', None)

novel_list_page = "http://www.novelupdates.com/novelslisting/?st=1&pg="
novel_page = "http://www.novelupdates.com/?p="

There do not seem to be an easy way to get all novel ids. These ids do not seem to be necessarily be strictly consecutive or increasing. Hence a brute force method is used to gather the ids of all current novels.

The ids are gathered from a list of all the novels. 
The list contains mulitple pages/tabs with each page consisting of 25 novels.
First the number of novel pages is retrieved and then the pages are iterated though in order to scrape the ids of the novels on each one.

In [7]:
def get_novel_list_num_pages(page):
    """
    Get the maximum number of pages with novels.
    This number is not contant since the number of novels on the website are increasing.
    Following the current website layout each page have 25 novels.
    
    :param page: The web address to the novel list, presumably the first page but can be any.
    :returns: An int representing the current number of pages of the novel lists.
    """
    soup = BeautifulSoup(page.content, 'html.parser')
    dig_pag = soup.find('div', attrs={'class':'digg_pagination'})
    max_page = max([int(a.text) for a in dig_pag.find_all('a') if a.text.isdigit()])
    return max_page

# Get all novel ids from a single page
def get_novel_ids(page):
    """
    Gets all the novel ids from a page.
    
    :param page: One of the pages with novels.
    :returns: A list with all novel ids for the novels on the page.
    """
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('div', attrs={'class':'w-blog-content other'})
    novels = table.find_all('div', attrs={'class': 'search_title'})
    novel_ids = [novel.find('span', attrs={'class': 'rl_icons_en'}).get('id')[3:] for novel in novels]
    novel_ids = [int(n) for n in novel_ids]
    return novel_ids


page = requests.get(novel_list_page + '1')

# TOOD: For testing - only use 2 pages for now.
novels_num_pages = 1
#novels_max_pages = get_novel_list_num_pages(page)
print("Pages with novels: " + str(novels_num_pages))

all_novel_ids = []
for i in range(1, novels_num_pages+1):
    page = requests.get(novel_list_page + str(i))
    novel_ids = get_novel_ids(page)
    all_novel_ids.extend(novel_ids)
    time.sleep(1)

df = pd.DataFrame(all_novel_ids, columns=['id'])

Pages with novels: 1


In [8]:
def get_value(element, check=lambda e: e.string, parse=lambda e: e.string.strip()):
    """
    Gets the value of a HTML element/node following the parse function. 
    This function is necessary since the novel pages are not always consistent with each other. 
    Also checks if the value is 'N/A' and returns None in that case.
    
    :param element: A HTML element/node.
    :param check: A function to be applied on the element. 
                  Checks if the element object have a retrun value for the function or is it's None.
    :param parse: A function to parse the element if it passes the check.
    :returns: The value returned by running the parse function on the element.
              None is returned if the element does not pass the check function or if the value is 'N/A'.
    """
    if check(element) is None:
        return None
    pe = parse(element)
    if ''.join(pe) == 'N/A':
        return None
    return pe

              
def get_value_str_txt(element, check_one=lambda e: e.string, parse_one=lambda e: e.string.strip(),
                      check_two=lambda e: e.text, parse_two=lambda e: e.text.strip()):
    """
    Used when it's unknown which function to apply on an element to obtain it's value.
    For example, if .string or .text should be used.
    The functions are applied in order, if the first one returns None then the second one is tried.
    
    :param element: A HTML element/node.
    :param check_one: A function to be applied on the element.
                      Checks if the element object have a retrun value for the function or is it's None.
    :param parse_one: A function to parse the element if it passes the check.
    :param check_two: A function to be applied on the element.
                      Checks if the element object have a retrun value for the function or is it's None.
    :param parse_two: A function to parse the element if it passes the check.
    :returns: The value returned by running parse_one or parse_two on the element.
    """
    res_one = get_value(element, check_one, parse_one)
    res_two = get_value(element, check_two, parse_two)
    return res_one or res_two
              

def empty(element):
    """
    Checks if running .string on the element returns an empty string.
    
    :param element: A HTML element/node.
    :returns: A boolean representing whether the element contains an empty string.
    """
    return get_value(element) == ""


def get_bool(string):
    """
    Convinience function to convert a string to a boolean.
    Handles Yes, yes, No and no.
    
    :param string: String to convert to boolean.
    :retruns: The boolean representation of the string or None.
    """
    if string is None:
        return None
    
    if string.lower() == "yes":
        return True
    elif string.lower() == "no":
        return False
    else:
        return None

In [15]:
def general_info(content):
    """
    Scrapes all general information of a specific novel.
    
    :param content: The content page of a novel.
    :returns: A dictionary with scraped and cleaned information.
    """
    
    gen_info = {}
    gen_info['name'] = get_value(content.find('div', attrs={'class', 'seriestitlenu'}))
    gen_info['assoc_names'] = get_value(content.find('div', attrs={'id': 'editassociated'}), 
                                        check=lambda e: e, parse=lambda e: list(e.stripped_strings))
    gen_info['original_langauge'] = get_value(content.find('div', attrs={'id': 'showlang'}), 
                                          lambda e: e.a, 
                                          lambda e: e.text.strip().lower())
    gen_info['authors'] = [author.text.lower()
                for author in content
                  .find('div', attrs={'id': 'showauthors'})
                  .find_all('a')]
    gen_info['genres'] = [genre.text.lower()
                for genre in content
                  .find('div', attrs={'id': 'seriesgenre'})
                  .find_all('a', attrs={'class': 'genre'})]
    gen_info['tags'] = [tag.text.lower()
                for tag in content
                  .find('div', attrs={'id': 'showtags'})
                  .find_all('a')]
    return gen_info


def publisher_info(content):
    """
    Scrapes all publisher information of a specific novel.
    
    :param content: The content page of a novel.
    :returns: A dictionary with scraped and cleaned information.
    """
    pub_info = {}
    pub_info['start_year'] = get_value(content.find('div', attrs={'id': 'edityear'}),)
    pub_info['licensed'] = get_bool(get_value(content.find('div', attrs={'id': 'showlicensed'})))
    pub_info['original_publisher'] = get_value(content.find('div', attrs={'id': 'showopublisher'}),
                                               lambda e: e.a, 
                                               lambda e: e.a.string.strip().lower())
    pub_info['english_publisher'] = get_value(content.find('div', attrs={'id': 'showepublisher'}),
                                              lambda e: e.a, 
                                              lambda e: e.a.string.strip().lower())
    return pub_info


def chapter_info(soup, content):
    """
    Scrapes all chapter information of a specific novel. 
    Both latest released chapters and if the novel is complete.
    
    :param content: The content page of a novel.
    :returns: A dictionary with scraped and cleaned information.
    """
    chap_info = {}
    chapter_status = get_value_str_txt(content.find('div', attrs={'id': 'editstatus'}))
    
    print(chapter_status)
    
    if chapter_status is not None:
        chap_info['complete_original'] = 'complete' in chapter_status.lower()
        chapter_current = re.search('(\d+)[ wnl]*(?=chap)', chapter_status.lower())
        if chapter_current is not None:
            chapter_current = chapter_current.group(1).strip() + " chapters"
        else:    
            # Check if volume
            chapter_current = re.search('(\d+)[ wnl]*(?=volu)', chapter_status.lower())
            if chapter_current is not None:
                chapter_current = chapter_current.group(1).strip() + " volumes"
            else:
                # Get the first number
                chapter_current = re.search('(\d+)', chapter_status.lower())
                if chapter_current is not None:
                    chapter_current = chapter_current.group(1).strip()        
        
        chap_info['chapters_original_current'] = chapter_current if chapter_current != "" else None 
    chap_info['complete_translated'] = get_bool(get_value(content.find('div', attrs={'id': 'showtranslated'})))
    
    table = soup.find('table', attrs={'id': 'myTable'})
    if table is not None:
        release_table = table.find('tbody')
        chap_info['chapter_latest_translated'] = release_table.find('tr').find_all('td')[2].a.string.strip()
    return chap_info
    
    
def release_info(content):
    """
    Scrapes all release and activity information of a specific novel.
    
    :param content: The content page of a novel.
    :returns: A dictionary with scraped and cleaned information.
    """
    rel_info = {}
    release_freq = content.find('h5', attrs={'class': 'seriesother'}, string='Release Frequency').next_sibling
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    
    if not empty(release_freq):
        rel_info['release_freq'] = float(re.search('\d+\.?\d*', release_freq).group(0))
        
    rel_info['activity_week_rank'] = int(activity[0].string[1:])
    rel_info['activity_month_rank'] = int(activity[1].string[1:])
    rel_info['activity_all_time_rank'] = int(activity[2].string[1:])
    return rel_info
    

def community_info(content):
    """
    Scrapes all community information of a specific novels.
    
    :param content: The content page of a novel.
    :returns: A dictionary with scraped and cleaned information.
    """
    comm_info = {}
    activity = content.find_all('span', attrs={'class': 'userrate rank'})
    comm_info['on_reading_lists'] = int(content.find('b', attrs={'class': 'rlist'}).string)
    comm_info['reading_list_month_rank'] = int(activity[3].string[1:])
    comm_info['reading_list_all_time_rank'] = int(activity[4].string[1:])
    
    # rating
    rating_text = content.find('span', attrs={'class': 'uvotes'}).text.split(' ')
    comm_info['rating'] = float(rating_text[0][1:])
    comm_info['rating_votes'] = int(rating_text[3])
    return comm_info
    
    
def relation_info(soup, content):
    """
    Scrapes all relational information of a specific novel.
    
    :param content: The content page of a novel.
    :returns: A dictionary with scraped and cleaned information.
    """
    rel_info = {}
    any_related = content.find('h5', attrs={'class': 'seriesother'}, string='Related Series').next_sibling
    if "N/A" not in any_related:
        related_series_first = any_related.next_sibling.get('id')[3:]
        rel_info['related_series_ids'] = [related_series_first]

    rel_info['recommended_series_ids'] = []    
    for series in soup.find_all('a', attrs={'class': 'genre'}, recursive=False):
        if series.has_attr('title'):
            rel_info['recommended_series_ids'].append(series.get('id')[3:])
        else:
            rel_info['related_series_ids'].append(series.get('id')[3:])
    return rel_info

In [16]:
def parse_novel_page(novel_id):
    """
    Parses and scrapes information from a single novel page.
    
    :param novel_id: The id number of the novel.
    :returns: A pandas series with all scraped and cleaned information about the novel.
    """
    
    page = requests.get(novel_page + str(novel_id))    
    soup = BeautifulSoup(page.content, 'lxml')
    content = soup.find('div', attrs={'class': 'w-blog-content'})
    if content is None:
        return pd.Series() 
    data = {'id': novel_id}
    
    #TODO
    print(novel_id)
    
    data.update(general_info(content))
    data.update(publisher_info(content))
    data.update(chapter_info(soup, content))
    data.update(release_info(content))
    data.update(community_info(content))
    data.update(relation_info(soup, content))
    
    time.sleep(1)
    return pd.Series(data)

df = df['id'].apply(lambda novel_id: parse_novel_page(novel_id))
df = df.set_index('id')
print(df.head)
#df.to_csv('novels.csv', header=True, index=False)

2151
54 Chapters (Complete)
11115
2100 Chapters (completed)
14233
2563 chapters (Completed)
14
15 Volumes / 1592 Chapters (Ongoing)
10802
1512 Chapters (Completed)
15734
5 Volumes (Ongoing)
25241
None
14531
1638 Chapters (Complete)
24693
1736 chapters
5023
7500+ Chapters (Ongoing)
20789
223 chapters completed
13111
1535 Chapters (Complete)
10776
1162 Chapters (Completed)
1329
11 Volumes (Complete)
14562
1252 Chapter (Complete)
11582
1158 Chapters (Completed)
6363
3799 Chapters (Ongoing)
24100
Completed
62 chapter
18679
3526 Chapters (Ongoing)
15752
2090 chapters (Completed)
25301
Complete
24175
None
10432
245 chapters + Epilogue + Extra stories (Complete)
22848
732 Chapters (Completed)
13615
124 chapters (Dropped)
<bound method NDFrame.head of        activity_all_time_rank  activity_month_rank  activity_week_rank  \
id                                                                       
2151                     2077                  986                1049   
11115                   